PyPI - nucliadb-models - Versions diffs - 6.8.1.post4983__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl - Mend

nucliadb-models 6.8.1.post4983__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nucliadb-models might be problematic. Click here for more details.

Files changed (34) hide show

nucliadb_models/agents/ingestion.py +4 -4
nucliadb_models/augment.py +359 -0
nucliadb_models/common.py +66 -57
nucliadb_models/configuration.py +9 -9
nucliadb_models/content_types.py +13 -11
nucliadb_models/conversation.py +30 -29
nucliadb_models/entities.py +17 -18
nucliadb_models/external_index_providers.py +5 -20
nucliadb_models/extracted.py +82 -83
nucliadb_models/file.py +10 -11
nucliadb_models/filters.py +78 -74
nucliadb_models/graph/requests.py +38 -47
nucliadb_models/hydration.py +423 -0
nucliadb_models/internal/predict.py +7 -9
nucliadb_models/internal/shards.py +2 -3
nucliadb_models/labels.py +18 -11
nucliadb_models/link.py +18 -19
nucliadb_models/metadata.py +80 -53
nucliadb_models/notifications.py +3 -3
nucliadb_models/processing.py +1 -2
nucliadb_models/resource.py +85 -102
nucliadb_models/retrieval.py +147 -0
nucliadb_models/search.py +360 -306
nucliadb_models/security.py +2 -3
nucliadb_models/text.py +7 -8
nucliadb_models/trainset.py +1 -2
nucliadb_models/utils.py +2 -3
nucliadb_models/vectors.py +2 -5
nucliadb_models/writer.py +56 -57
{nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/METADATA +2 -3
nucliadb_models-6.10.0.post5694.dist-info/RECORD +41 -0
nucliadb_models-6.8.1.post4983.dist-info/RECORD +0 -38
{nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/WHEEL +0 -0
{nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/top_level.txt +0 -0

nucliadb_models/agents/ingestion.py CHANGED Viewed

@@ -13,7 +13,6 @@
 # limitations under the License.
 #
 from enum import Enum
-from typing import Optional
 from pydantic import BaseModel, Field
@@ -41,11 +40,12 @@ class AgentsFilter(BaseModel):
 class ResourceAgentsRequest(BaseModel):
-    filters: Optional[list[AgentsFilter]] = Field(
+    filters: list[AgentsFilter] | None = Field(
+        title="Resource Agent Filters",
         default=None,
         description="Filters to apply to the agents. If None, all curently configured agents are applied.",
     )
-    agent_ids: Optional[list[str]] = Field(
+    agent_ids: list[str] | None = Field(
         default=None,
         title="An optional list of Data Augmentation Agent IDs to run. If None, all configured agents that match the filters are run.",
     )
@@ -57,7 +57,7 @@ class NewTextField(BaseModel):
 class AppliedDataAugmentation(BaseModel):
-    qas: Optional[QuestionAnswers] = Field(
+    qas: QuestionAnswers | None = Field(
         default=None,
         description="Question and answers generated by the Question Answers agent",
     )

nucliadb_models/augment.py ADDED Viewed

@@ -0,0 +1,359 @@
+# Copyright 2025 Bosutech XXI S.L.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from enum import Enum
+from typing import Annotated
+from pydantic import BaseModel, Field, StringConstraints, model_validator
+from typing_extensions import Self
+from nucliadb_models import filters
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, Resource
+from nucliadb_models.search import ResourceProperties
+ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
+ResourceId = Annotated[
+    str,
+    StringConstraints(pattern=ResourceIdPattern, min_length=32, max_length=36),
+]
+FieldIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?$"
+FieldId = Annotated[
+    str,
+    StringConstraints(
+        pattern=FieldIdPattern,
+        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0,
+        # max field id of 250
+        max_length=32 + 1 + 1 + 1 + 250 + 1 + 218,
+    ),
+]
+ParagraphIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$"
+ParagraphId = Annotated[
+    str,
+    StringConstraints(
+        # resource-uuid/field-type/field-id/[split-id/]paragraph-id
+        pattern=ParagraphIdPattern,
+        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
+        # max field id of 250 and 10 digit paragraphs. More than enough
+        max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
+    ),
+]
+# Request
+class ResourceProp(str, Enum):
+    """Superset of former `show` and `extracted` serializations options."""
+    # `show` props
+    BASIC = "basic"
+    ORIGIN = "origin"
+    EXTRA = "extra"
+    RELATIONS = "relations"
+    VALUES = "values"
+    ERRORS = "errors"
+    SECURITY = "security"
+    # `extracted` props
+    EXTRACTED_TEXT = "extracted_text"
+    EXTRACTED_METADATA = "extracted_metadata"
+    EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
+    EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
+    EXTRACTED_VECTOR = "extracted_vectors"
+    EXTRACTED_LINK = "extracted_link"
+    EXTRACTED_FILE = "extracted_file"
+    EXTRACTED_QA = "extracted_question_answers"
+    # new granular props
+    TITLE = "title"
+    SUMMARY = "summary"
+    CLASSIFICATION_LABELS = "classification_labels"
+    @classmethod
+    def from_show_and_extracted(
+        cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
+    ) -> list["ResourceProp"]:
+        _show_to_prop = {
+            ResourceProperties.BASIC: cls.BASIC,
+            ResourceProperties.ORIGIN: cls.ORIGIN,
+            ResourceProperties.EXTRA: cls.EXTRA,
+            ResourceProperties.RELATIONS: cls.RELATIONS,
+            ResourceProperties.VALUES: cls.VALUES,
+            ResourceProperties.ERRORS: cls.ERRORS,
+            ResourceProperties.SECURITY: cls.SECURITY,
+        }
+        _extracted_to_prop = {
+            ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
+            ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
+            ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
+            ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
+            ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
+            ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
+            ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
+            ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
+        }
+        props = []
+        for s in show:
+            show_prop = _show_to_prop.get(s)
+            # show=extracted is not in the dict
+            if show_prop is None:
+                continue
+            props.append(show_prop)
+        if ResourceProperties.EXTRACTED in show:
+            for e in extracted:
+                extracted_prop = _extracted_to_prop[e]
+                props.append(extracted_prop)
+        return props
+class AugmentResourceFields(BaseModel):
+    text: bool = False
+    classification_labels: bool = False
+    filters: list[filters.Field | filters.Generated]
+class AugmentResources(BaseModel):
+    given: list[ResourceId]
+    # TODO(decoupled-ask): replace this select for bool fields
+    select: list[ResourceProp] = Field(default_factory=list)
+    field_type_filter: list[FieldTypeName] | None = Field(
+        default=None,
+        deprecated="Only use this for legacy resource serialization",
+        title="Field type filter",
+        description=(
+            "Define which field types are serialized on resources of search results. "
+            "If omitted and legacy serialization is used, all field types will be serialized"
+        ),
+    )
+    fields: AugmentResourceFields | None = None
+    @model_validator(mode="after")
+    def bwc_resource_serialization(self) -> Self:
+        if self.field_type_filter is not None and self.fields is not None:
+            raise ValueError("`field_type_filter` and `fields` are incompatible together")
+        return self
+class AugmentFields(BaseModel):
+    given: list[FieldId]
+    text: bool = False
+    classification_labels: bool = False
+    entities: bool = False  # also known as ners
+    # For file fields, augment the path to the thumbnail image
+    file_thumbnail: bool = False
+    # When enabled, augment all the messages from the conversation. This is
+    # incompatible with max_conversation_messages defined
+    full_conversation: bool = False
+    # When `full` disbled, this option controls the max amount of messages to be
+    # augmented. This number will be a best-effort window centered around the
+    # selected message. In addition, the 1st message of the conversation will
+    # always be included.
+    #
+    # This option is combinable with attachments.
+    max_conversation_messages: int | None = None
+    # Given a message, if it's a question, try to find an answer. Otherwise,
+    # return a window of messages following the requested one.
+    #
+    # This was previously done without explicit user consent, now it's an option.
+    conversation_answer_or_messages_after: bool = False
+    # Both attachment options will only add attachments for the full or the 1st
+    # + window, not answer nor messages after
+    # include conversation text attachments
+    conversation_text_attachments: bool = False
+    # include conversation image attachments
+    conversation_image_attachments: bool = False
+    @model_validator(mode="after")
+    def validate_cross_options(self):
+        if self.full_conversation and self.max_conversation_messages is not None:
+            raise ValueError(
+                "`full_conversation` and `max_conversation_messages` are not compatible together"
+            )
+        if (
+            (self.conversation_text_attachments or self.conversation_image_attachments)
+            and self.full_conversation is False
+            and self.max_conversation_messages is None
+        ):
+            raise ValueError(
+                "Attachments are only compatible with `full_conversation` and `max_conversation_messages`"
+            )
+        return self
+# TODO(decoupled-ask): remove unused metadata
+class ParagraphMetadata(BaseModel):
+    field_labels: list[str]
+    paragraph_labels: list[str]
+    is_an_image: bool
+    is_a_table: bool
+    # for extracted from visual content (ocr, inception, tables)
+    source_file: str | None
+    # for documents (pdf, docx...) only
+    page: int | None
+    in_page_with_visual: bool | None
+class AugmentParagraph(BaseModel):
+    id: ParagraphId
+    metadata: ParagraphMetadata | None = None
+class AugmentParagraphs(BaseModel):
+    given: list[AugmentParagraph]
+    text: bool = True
+    neighbours_before: int = 0
+    neighbours_after: int = 0
+    # paragraph extracted from an image, return an image
+    source_image: bool = False
+    # paragraph extracted from a table, return table image
+    table_image: bool = False
+    # return page_preview instead of table image if table image enabled
+    table_prefers_page_preview: bool = False
+    # paragraph from a page, return page preview image
+    page_preview_image: bool = False
+    @model_validator(mode="after")
+    def table_options_work_together(self) -> Self:
+        if not self.table_image and self.table_prefers_page_preview:
+            raise ValueError("`table_prefers_page_preview` can only be enabled with `table_image`")
+        return self
+class AugmentRequest(BaseModel):
+    resources: AugmentResources | None = None
+    fields: AugmentFields | None = None
+    paragraphs: AugmentParagraphs | None = None
+# Response
+class AugmentedParagraph(BaseModel):
+    text: str | None = None
+    neighbours_before: list[ParagraphId] | None = None
+    neighbours_after: list[ParagraphId] | None = None
+    source_image: str | None = None
+    table_image: str | None = None
+    page_preview_image: str | None = None
+class AugmentedField(BaseModel):
+    text: str | None = None
+    classification_labels: dict[str, list[str]] | None = None
+    # former ners
+    entities: dict[str, list[str]] | None = None
+class AugmentedFileField(BaseModel):
+    text: str | None = None
+    classification_labels: dict[str, list[str]] | None = None
+    # former ners
+    entities: dict[str, list[str]] | None = None
+    # TODO(decoupled-ask): implement image strategy
+    page_preview_image: str | None = None
+    # Path for the download API to retrieve the file thumbnail image
+    thumbnail_image: str | None = None
+class AugmentedConversationMessage(BaseModel):
+    ident: str
+    text: str | None = None
+    attachments: list[FieldId] | None = None
+class AugmentedConversationField(BaseModel):
+    classification_labels: dict[str, list[str]] | None = None
+    # former ners
+    entities: dict[str, list[str]] | None = None
+    messages: list[AugmentedConversationMessage] | None = None
+    @property
+    def text(self) -> str | None:
+        """Syntactic sugar to access aggregate text from all messages"""
+        if self.messages is None:
+            return None
+        text = ""
+        for message in self.messages:
+            text += message.text or ""
+        return text or None
+    @property
+    def attachments(self) -> list[FieldId] | None:
+        """Syntactic sugar to access the aggregate of attachments from all messages."""
+        if self.messages is None:
+            return None
+        has_attachments = False
+        attachments = []
+        for message in self.messages:
+            if message.attachments is None:
+                continue
+            has_attachments = True
+            attachments.extend(message.attachments)
+        if has_attachments:
+            return attachments
+        else:
+            return None
+class AugmentedResource(Resource):
+    classification_labels: dict[str, list[str]] | None = None
+    def updated_from(self, origin: Resource):
+        for key in origin.model_fields.keys():
+            self.__setattr__(key, getattr(origin, key))
+class AugmentResponse(BaseModel):
+    resources: dict[ResourceId, AugmentedResource]
+    fields: dict[FieldId, AugmentedField | AugmentedFileField | AugmentedConversationField]
+    paragraphs: dict[ParagraphId, AugmentedParagraph]

nucliadb_models/common.py CHANGED Viewed

@@ -16,7 +16,7 @@ import base64
 import hashlib
 import re
 from enum import Enum
-from typing import Any, Dict, List, Optional
+from typing import Any
 from pydantic import (
     BaseModel,
@@ -38,7 +38,7 @@ FIELD_TYPE_CHAR_MAP = {
 }
 STORAGE_FILE_MATCH = re.compile(
-    r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"  # noqa
+    r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
 )
 DOWNLOAD_TYPE_MAP = {"f": "field", "e": "extracted"}
 DOWNLOAD_URI = "/kb/{kbid}/resource/{rid}/{field_type}/{field_id}/download/{download_type}/{key}"
@@ -50,9 +50,9 @@ class ParamDefault(BaseModel):
     default: Any = None
     title: str
     description: str
-    le: Optional[float] = None
-    gt: Optional[float] = None
-    max_items: Optional[int] = None
+    le: float | None = None
+    gt: float | None = None
+    max_items: int | None = None
     deprecated: bool = False
     def to_pydantic_field(self, default=_NOT_SET, **kw) -> Field:  # type: ignore
@@ -86,13 +86,13 @@ class FieldID(BaseModel):
 class File(BaseModel):
-    filename: Optional[str] = None
+    filename: str | None = None
     content_type: str = "application/octet-stream"
-    payload: Optional[str] = Field(default=None, description="Base64 encoded file content")
-    md5: Optional[str] = None
+    payload: str | None = Field(default=None, description="Base64 encoded file content")
+    md5: str | None = None
     # These are to be used for external files
-    uri: Optional[str] = None
-    extra_headers: Dict[str, str] = {}
+    uri: str | None = None
+    extra_headers: dict[str, str] = {}
     @model_validator(mode="after")
     def _check_internal_file_fields(self) -> Self:
@@ -108,7 +108,7 @@ class File(BaseModel):
         if self.md5 is None:
             # In case md5 is not supplied, compute it
             try:
-                result = hashlib.md5(base64.b64decode(self.payload))
+                result = hashlib.md5(base64.b64decode(self.payload), usedforsecurity=False)
                 self.md5 = result.hexdigest()
             except Exception:
                 raise ValueError("MD5 could not be computed")
@@ -134,10 +134,10 @@ class FileB64(BaseModel):
 class CloudFile(BaseModel):
-    uri: Optional[str] = None
-    size: Optional[int] = None
-    content_type: Optional[str] = None
-    bucket_name: Optional[str] = None
+    uri: str | None = None
+    size: int | None = None
+    content_type: str | None = None
+    bucket_name: str | None = None
     class Source(Enum):
         FLAPS = "FLAPS"
@@ -146,23 +146,23 @@ class CloudFile(BaseModel):
         LOCAL = "LOCAL"
         EXTERNAL = "EXTERNAL"
-    source: Optional[Source]
-    filename: Optional[str]
-    resumable_uri: Optional[str]
-    offset: Optional[int]
-    upload_uri: Optional[str]
-    parts: Optional[List[str]]
-    old_uri: Optional[str]
-    old_bucket: Optional[str]
-    md5: Optional[str]
+    source: Source | None
+    filename: str | None
+    resumable_uri: str | None
+    offset: int | None
+    upload_uri: str | None
+    parts: list[str] | None
+    old_uri: str | None
+    old_bucket: str | None
+    md5: str | None
 class CloudLink(BaseModel):
-    uri: Optional[str] = None
-    size: Optional[int] = None
-    content_type: Optional[str] = None
-    filename: Optional[str] = None
-    md5: Optional[str] = None
+    uri: str | None = None
+    size: int | None = None
+    content_type: str | None = None
+    filename: str | None = None
+    md5: str | None = None
     @staticmethod
     def format_reader_download_uri(uri: str) -> str:
@@ -203,16 +203,25 @@ class FieldTypeName(str, Enum):
             "a": FieldTypeName.GENERIC,
         }[abbr]
+    def abbreviation(self) -> str:
+        return {
+            FieldTypeName.TEXT: "t",
+            FieldTypeName.FILE: "f",
+            FieldTypeName.LINK: "u",
+            FieldTypeName.CONVERSATION: "c",
+            FieldTypeName.GENERIC: "a",
+        }[self]
 class FieldRef(BaseModel):
     field_type: FieldTypeName
     field_id: str
-    split: Optional[str] = None
+    split: str | None = None
 class Classification(BaseModel):
-    labelset: str
-    label: str
+    labelset: str = Field(title="The ID of the labelset")
+    label: str = Field(title="The label assigned from the labelset")
 class UserClassification(Classification):
@@ -220,19 +229,19 @@ class UserClassification(Classification):
 class Sentence(BaseModel):
-    start: Optional[int] = None
-    end: Optional[int] = None
-    key: Optional[str] = None
+    start: int | None = None
+    end: int | None = None
+    key: str | None = None
 class PageInformation(BaseModel):
-    page: Optional[int] = None
-    page_with_visual: Optional[bool] = None
+    page: int | None = Field(default=None, title="Page Information Page")
+    page_with_visual: bool | None = None
 class Representation(BaseModel):
-    is_a_table: Optional[bool] = None
-    reference_file: Optional[str] = None
+    is_a_table: bool | None = None
+    reference_file: str | None = None
 class ParagraphRelations(BaseModel):
@@ -242,10 +251,10 @@ class ParagraphRelations(BaseModel):
 class Paragraph(BaseModel):
-    start: Optional[int] = None
-    end: Optional[int] = None
-    start_seconds: Optional[List[int]] = None
-    end_seconds: Optional[List[int]] = None
+    start: int | None = None
+    end: int | None = None
+    start_seconds: list[int] | None = None
+    end_seconds: list[int] | None = None
     class TypeParagraph(str, Enum):
         TEXT = "TEXT"
@@ -256,35 +265,35 @@ class Paragraph(BaseModel):
         TITLE = "TITLE"
         TABLE = "TABLE"
-    kind: Optional[TypeParagraph] = None
-    classifications: Optional[List[Classification]] = None
-    sentences: Optional[List[Sentence]] = None
-    key: Optional[str] = None
-    page: Optional[PageInformation] = None
-    representation: Optional[Representation] = None
-    relations: Optional[ParagraphRelations] = None
+    kind: TypeParagraph | None = None
+    classifications: list[Classification] | None = None
+    sentences: list[Sentence] | None = None
+    key: str | None = None
+    page: PageInformation | None = None
+    representation: Representation | None = None
+    relations: ParagraphRelations | None = None
 class Shards(BaseModel):
-    shards: Optional[List[str]] = None
+    shards: list[str] | None = None
 class Question(BaseModel):
     text: str
-    language: Optional[str] = None
-    ids_paragraphs: List[str]
+    language: str | None = None
+    ids_paragraphs: list[str]
 class Answer(BaseModel):
     text: str
-    language: Optional[str] = None
-    ids_paragraphs: List[str]
+    language: str | None = None
+    ids_paragraphs: list[str]
 class QuestionAnswer(BaseModel):
     question: Question
-    answers: List[Answer]
+    answers: list[Answer]
 class QuestionAnswers(BaseModel):
-    question_answer: List[QuestionAnswer]
+    question_answer: list[QuestionAnswer]

nucliadb_models/configuration.py CHANGED Viewed

@@ -14,7 +14,7 @@
 #
 import warnings
-from typing import Annotated, Any, Literal, Optional, Union
+from typing import Annotated, Any, Literal
 from pydantic import BaseModel, Field, create_model
@@ -28,11 +28,11 @@ class KBConfiguration(BaseModel):
         super().__init__(**data)
     # Do not touch this model synced on Processing side
-    semantic_model: Optional[str] = None
-    generative_model: Optional[str] = None
-    ner_model: Optional[str] = None
-    anonymization_model: Optional[str] = None
-    visual_labeling: Optional[str] = None
+    semantic_model: str | None = None
+    generative_model: str | None = None
+    ner_model: str | None = None
+    anonymization_model: str | None = None
+    visual_labeling: str | None = None
 #
@@ -44,7 +44,7 @@ def _model_fields(model: type[BaseModel], skip: list[str]) -> dict[str, Any]:
     }
-# FindConfig is a FindConfig without `search_configuration`
+# FindConfig is a FindRequest without `search_configuration`
 FindConfig = create_model("FindConfig", **_model_fields(FindRequest, skip=["search_configuration"]))
@@ -57,7 +57,7 @@ class FindSearchConfiguration(BaseModel):
 AskConfig = create_model(
     "AskConfig",
     **_model_fields(AskRequest, skip=["query", "search_configuration"]),
-    query=(Optional[str], None),
+    query=(str | None, None),
 )
@@ -67,7 +67,7 @@ class AskSearchConfiguration(BaseModel):
 SearchConfiguration = Annotated[
-    Union[FindSearchConfiguration, AskSearchConfiguration], Field(discriminator="kind")
+    FindSearchConfiguration | AskSearchConfiguration, Field(discriminator="kind")
 ]
 # We need this to avoid issues with pydantic and generic types defined in another module

nucliadb-models 6.8.1.post4983__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl

Potentially problematic release.

nucliadb-models 6.8.1.post4983__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl