nucliadb-models 6.9.7.post5583__py3-none-any.whl → 6.11.1.post5822__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb_models/agents/ingestion.py +4 -4
- nucliadb_models/augment.py +100 -84
- nucliadb_models/common.py +56 -56
- nucliadb_models/configuration.py +8 -8
- nucliadb_models/content_types.py +13 -11
- nucliadb_models/conversation.py +25 -26
- nucliadb_models/entities.py +17 -18
- nucliadb_models/external_index_providers.py +1 -2
- nucliadb_models/extracted.py +82 -83
- nucliadb_models/file.py +10 -11
- nucliadb_models/filters.py +78 -74
- nucliadb_models/graph/requests.py +40 -48
- nucliadb_models/graph/responses.py +13 -1
- nucliadb_models/hydration.py +48 -50
- nucliadb_models/internal/predict.py +7 -9
- nucliadb_models/internal/shards.py +2 -3
- nucliadb_models/labels.py +18 -11
- nucliadb_models/link.py +18 -19
- nucliadb_models/metadata.py +66 -54
- nucliadb_models/notifications.py +3 -3
- nucliadb_models/processing.py +1 -2
- nucliadb_models/resource.py +85 -93
- nucliadb_models/retrieval.py +147 -0
- nucliadb_models/search.py +263 -275
- nucliadb_models/security.py +2 -3
- nucliadb_models/text.py +7 -8
- nucliadb_models/trainset.py +1 -2
- nucliadb_models/utils.py +2 -3
- nucliadb_models/vectors.py +2 -5
- nucliadb_models/writer.py +56 -57
- {nucliadb_models-6.9.7.post5583.dist-info → nucliadb_models-6.11.1.post5822.dist-info}/METADATA +1 -1
- nucliadb_models-6.11.1.post5822.dist-info/RECORD +41 -0
- {nucliadb_models-6.9.7.post5583.dist-info → nucliadb_models-6.11.1.post5822.dist-info}/WHEEL +1 -1
- nucliadb_models-6.9.7.post5583.dist-info/RECORD +0 -40
- {nucliadb_models-6.9.7.post5583.dist-info → nucliadb_models-6.11.1.post5822.dist-info}/top_level.txt +0 -0
nucliadb_models/agents/ingestion.py CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.
 #
 from enum import Enum
-from typing import Optional
 
 from pydantic import BaseModel, Field
 
@@ -41,11 +40,12 @@ class AgentsFilter(BaseModel):
 
 
 class ResourceAgentsRequest(BaseModel):
-    filters:
+    filters: list[AgentsFilter] | None = Field(
+        title="Resource Agent Filters",
         default=None,
         description="Filters to apply to the agents. If None, all curently configured agents are applied.",
     )
-    agent_ids:
+    agent_ids: list[str] | None = Field(
        default=None,
        title="An optional list of Data Augmentation Agent IDs to run. If None, all configured agents that match the filters are run.",
    )
@@ -57,7 +57,7 @@ class NewTextField(BaseModel):
 
 
 class AppliedDataAugmentation(BaseModel):
-    qas:
+    qas: QuestionAnswers | None = Field(
        default=None,
        description="Question and answers generated by the Question Answers agent",
    )
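The pattern repeated throughout this release is already visible in this first file: `Optional[X]` annotations are replaced with PEP 604 `X | None` unions, and request fields pick up explicit `Field(title=..., description=...)` metadata. A minimal, self-contained sketch of the new `ResourceAgentsRequest` shape (the `AgentsFilter` stub and its fields are illustrative assumptions, not the package's full definition):

from pydantic import BaseModel, Field


class AgentsFilter(BaseModel):
    # Simplified stand-in for nucliadb_models.agents.ingestion.AgentsFilter
    type: str
    value: str | None = None


class ResourceAgentsRequest(BaseModel):
    # Spelled with Optional[...] in 6.9.x, now a PEP 604 union plus Field metadata
    filters: list[AgentsFilter] | None = Field(
        title="Resource Agent Filters",
        default=None,
        description="Filters to apply to the agents. If None, all configured agents are applied.",
    )
    agent_ids: list[str] | None = Field(
        default=None,
        title="An optional list of Data Augmentation Agent IDs to run.",
    )


# Omitted fields fall back to None; both constructions validate.
print(ResourceAgentsRequest())
print(ResourceAgentsRequest(filters=[AgentsFilter(type="label", value="topic")]))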
nucliadb_models/augment.py CHANGED

@@ -13,16 +13,15 @@
 # limitations under the License.
 #
 
-from enum import Enum
 from typing import Annotated
 
 from pydantic import BaseModel, Field, StringConstraints, model_validator
-from typing_extensions import Self
+from typing_extensions import Self, assert_never
 
 from nucliadb_models import filters
 from nucliadb_models.common import FieldTypeName
 from nucliadb_models.resource import ExtractedDataTypeName, Resource
-from nucliadb_models.search import
+from nucliadb_models.search import ResourceProperties, TextPosition
 
 ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
 ResourceId = Annotated[
@@ -57,71 +56,6 @@ ParagraphId = Annotated[
 # Request
 
 
-class ResourceProp(str, Enum):
-    """Superset of former `show` and `extracted` serializations options."""
-
-    # `show` props
-    BASIC = "basic"
-    ORIGIN = "origin"
-    EXTRA = "extra"
-    RELATIONS = "relations"
-    VALUES = "values"
-    ERRORS = "errors"
-    SECURITY = "security"
-    # `extracted` props
-    EXTRACTED_TEXT = "extracted_text"
-    EXTRACTED_METADATA = "extracted_metadata"
-    EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
-    EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
-    EXTRACTED_VECTOR = "extracted_vectors"
-    EXTRACTED_LINK = "extracted_link"
-    EXTRACTED_FILE = "extracted_file"
-    EXTRACTED_QA = "extracted_question_answers"
-    # new granular props
-    TITLE = "title"
-    SUMMARY = "summary"
-    CLASSIFICATION_LABELS = "classification_labels"
-
-    @classmethod
-    def from_show_and_extracted(
-        cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
-    ) -> list["ResourceProp"]:
-        _show_to_prop = {
-            ResourceProperties.BASIC: cls.BASIC,
-            ResourceProperties.ORIGIN: cls.ORIGIN,
-            ResourceProperties.EXTRA: cls.EXTRA,
-            ResourceProperties.RELATIONS: cls.RELATIONS,
-            ResourceProperties.VALUES: cls.VALUES,
-            ResourceProperties.ERRORS: cls.ERRORS,
-            ResourceProperties.SECURITY: cls.SECURITY,
-        }
-        _extracted_to_prop = {
-            ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
-            ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
-            ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
-            ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
-            ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
-            ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
-            ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
-            ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
-        }
-
-        props = []
-        for s in show:
-            show_prop = _show_to_prop.get(s)
-            # show=extracted is not in the dict
-            if show_prop is None:
-                continue
-            props.append(show_prop)
-
-        if ResourceProperties.EXTRACTED in show:
-            for e in extracted:
-                extracted_prop = _extracted_to_prop[e]
-                props.append(extracted_prop)
-
-        return props
-
-
 class AugmentResourceFields(BaseModel):
     text: bool = False
     classification_labels: bool = False
@@ -132,8 +66,29 @@ class AugmentResourceFields(BaseModel):
 class AugmentResources(BaseModel):
     given: list[ResourceId]
 
-    #
-
+    # `show` props
+    basic: bool = False
+    origin: bool = False
+    extra: bool = False
+    relations: bool = False
+    values: bool = False
+    errors: bool = False
+    security: bool = False
+
+    # `extracted` props
+    extracted_text: bool = False
+    extracted_metadata: bool = False
+    extracted_shortened_metadata: bool = False
+    extracted_large_metadata: bool = False
+    extracted_vector: bool = False
+    extracted_link: bool = False
+    extracted_file: bool = False
+    extracted_qa: bool = False
+
+    # new granular props
+    title: bool = False
+    summary: bool = False
+    classification_labels: bool = False
 
     field_type_filter: list[FieldTypeName] | None = Field(
         default=None,
@@ -154,6 +109,51 @@ class AugmentResources(BaseModel):
 
         return self
 
+    def apply_show_and_extracted(
+        self, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
+    ):
+        show_extracted = False
+        for s in show:
+            if s == ResourceProperties.BASIC:
+                self.basic = True
+            elif s == ResourceProperties.ORIGIN:
+                self.origin = True
+            elif s == ResourceProperties.EXTRA:
+                self.extra = True
+            elif s == ResourceProperties.RELATIONS:
+                self.relations = True
+            elif s == ResourceProperties.VALUES:
+                self.values = True
+            elif s == ResourceProperties.ERRORS:
+                self.errors = True
+            elif s == ResourceProperties.SECURITY:
+                self.security = True
+            elif s == ResourceProperties.EXTRACTED:
+                show_extracted = True
+            else:  # pragma: no cover
+                assert_never(s)
+
+        if show_extracted:
+            for e in extracted:
+                if e == ExtractedDataTypeName.TEXT:
+                    self.extracted_text = True
+                elif e == ExtractedDataTypeName.METADATA:
+                    self.extracted_metadata = True
+                elif e == ExtractedDataTypeName.SHORTENED_METADATA:
+                    self.extracted_shortened_metadata = True
+                elif e == ExtractedDataTypeName.LARGE_METADATA:
+                    self.extracted_large_metadata = True
+                elif e == ExtractedDataTypeName.VECTOR:
+                    self.extracted_vector = True
+                elif e == ExtractedDataTypeName.LINK:
+                    self.extracted_link = True
+                elif e == ExtractedDataTypeName.FILE:
+                    self.extracted_file = True
+                elif e == ExtractedDataTypeName.QA:
+                    self.extracted_qa = True
+                else:  # pragma: no cover
+                    assert_never(s)
+
 
 class AugmentFields(BaseModel):
     given: list[FieldId]
@@ -162,6 +162,9 @@ class AugmentFields(BaseModel):
     classification_labels: bool = False
     entities: bool = False  # also known as ners
 
+    # For file fields, augment the path to the thumbnail image
+    file_thumbnail: bool = False
+
     # When enabled, augment all the messages from the conversation. This is
     # incompatible with max_conversation_messages defined
     full_conversation: bool = False
@@ -205,11 +208,7 @@ class AugmentFields(BaseModel):
         return self
 
 
-# TODO(decoupled-ask): remove unused metadata
 class ParagraphMetadata(BaseModel):
-    field_labels: list[str]
-    paragraph_labels: list[str]
-
     is_an_image: bool
     is_a_table: bool
@@ -234,27 +233,29 @@ class AugmentParagraphs(BaseModel):
     neighbours_before: int = 0
     neighbours_after: int = 0
 
-    # TODO(decoupled-ask): implement image strategy
     # paragraph extracted from an image, return an image
     source_image: bool = False
 
-    # TODO(decoupled-ask): implement image strategy
     # paragraph extracted from a table, return table image
     table_image: bool = False
 
-    # TODO(decoupled-ask): implement image strategy
     # return page_preview instead of table image if table image enabled
     table_prefers_page_preview: bool = False
 
-    # TODO(decoupled-ask): implement image strategy
     # paragraph from a page, return page preview image
     page_preview_image: bool = False
 
+    @model_validator(mode="after")
+    def table_options_work_together(self) -> Self:
+        if not self.table_image and self.table_prefers_page_preview:
+            raise ValueError("`table_prefers_page_preview` can only be enabled with `table_image`")
+        return self
+
 
 class AugmentRequest(BaseModel):
-    resources: AugmentResources | None = None
-    fields: AugmentFields | None = None
-    paragraphs: AugmentParagraphs | None = None
+    resources: list[AugmentResources] | None = Field(default=None, min_length=1)
+    fields: list[AugmentFields] | None = Field(default=None, min_length=1)
+    paragraphs: list[AugmentParagraphs] | None = Field(default=None, min_length=1)
 
 
 # Response
@@ -262,11 +263,14 @@ class AugmentRequest(BaseModel):
 
 class AugmentedParagraph(BaseModel):
     text: str | None = None
+    position: TextPosition | None = None
 
     neighbours_before: list[ParagraphId] | None = None
     neighbours_after: list[ParagraphId] | None = None
 
-
+    source_image: str | None = None
+    table_image: str | None = None
+    page_preview_image: str | None = None
 
 
 class AugmentedField(BaseModel):
@@ -277,7 +281,19 @@ class AugmentedField(BaseModel):
     # former ners
     entities: dict[str, list[str]] | None = None
 
-
+
+class AugmentedFileField(BaseModel):
+    text: str | None = None
+
+    classification_labels: dict[str, list[str]] | None = None
+
+    # former ners
+    entities: dict[str, list[str]] | None = None
+
+    page_preview_image: str | None = None
+
+    # Path for the download API to retrieve the file thumbnail image
+    thumbnail_image: str | None = None
 
 
 class AugmentedConversationMessage(BaseModel):
@@ -335,5 +351,5 @@ class AugmentedResource(Resource):
 
 class AugmentResponse(BaseModel):
     resources: dict[ResourceId, AugmentedResource]
-    fields: dict[FieldId, AugmentedField | AugmentedConversationField]
+    fields: dict[FieldId, AugmentedField | AugmentedFileField | AugmentedConversationField]
     paragraphs: dict[ParagraphId, AugmentedParagraph]
nucliadb_models/common.py CHANGED

@@ -16,7 +16,7 @@ import base64
 import hashlib
 import re
 from enum import Enum
-from typing import Any
+from typing import Any
 
 from pydantic import (
     BaseModel,
@@ -38,7 +38,7 @@ FIELD_TYPE_CHAR_MAP = {
 }
 
 STORAGE_FILE_MATCH = re.compile(
-    r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
+    r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
 )
 DOWNLOAD_TYPE_MAP = {"f": "field", "e": "extracted"}
 DOWNLOAD_URI = "/kb/{kbid}/resource/{rid}/{field_type}/{field_id}/download/{download_type}/{key}"
@@ -50,9 +50,9 @@ class ParamDefault(BaseModel):
     default: Any = None
     title: str
     description: str
-    le:
-    gt:
-    max_items:
+    le: float | None = None
+    gt: float | None = None
+    max_items: int | None = None
     deprecated: bool = False
 
     def to_pydantic_field(self, default=_NOT_SET, **kw) -> Field:  # type: ignore
@@ -86,13 +86,13 @@ class FieldID(BaseModel):
 
 
 class File(BaseModel):
-    filename:
+    filename: str | None = None
     content_type: str = "application/octet-stream"
-    payload:
-    md5:
+    payload: str | None = Field(default=None, description="Base64 encoded file content")
+    md5: str | None = None
     # These are to be used for external files
-    uri:
-    extra_headers:
+    uri: str | None = None
+    extra_headers: dict[str, str] = {}
 
     @model_validator(mode="after")
     def _check_internal_file_fields(self) -> Self:
@@ -134,10 +134,10 @@ class FileB64(BaseModel):
 
 
 class CloudFile(BaseModel):
-    uri:
-    size:
-    content_type:
-    bucket_name:
+    uri: str | None = None
+    size: int | None = None
+    content_type: str | None = None
+    bucket_name: str | None = None
 
     class Source(Enum):
         FLAPS = "FLAPS"
@@ -146,23 +146,23 @@ class CloudFile(BaseModel):
         LOCAL = "LOCAL"
         EXTERNAL = "EXTERNAL"
 
-    source:
-    filename:
-    resumable_uri:
-    offset:
-    upload_uri:
-    parts:
-    old_uri:
-    old_bucket:
-    md5:
+    source: Source | None
+    filename: str | None
+    resumable_uri: str | None
+    offset: int | None
+    upload_uri: str | None
+    parts: list[str] | None
+    old_uri: str | None
+    old_bucket: str | None
+    md5: str | None
 
 
 class CloudLink(BaseModel):
-    uri:
-    size:
-    content_type:
-    filename:
-    md5:
+    uri: str | None = None
+    size: int | None = None
+    content_type: str | None = None
+    filename: str | None = None
+    md5: str | None = None
 
     @staticmethod
     def format_reader_download_uri(uri: str) -> str:
@@ -216,12 +216,12 @@ class FieldTypeName(str, Enum):
 class FieldRef(BaseModel):
     field_type: FieldTypeName
     field_id: str
-    split:
+    split: str | None = None
 
 
 class Classification(BaseModel):
-    labelset: str
-    label: str
+    labelset: str = Field(title="The ID of the labelset")
+    label: str = Field(title="The label assigned from the labelset")
 
 
 class UserClassification(Classification):
@@ -229,19 +229,19 @@ class UserClassification(Classification):
 
 
 class Sentence(BaseModel):
-    start:
-    end:
-    key:
+    start: int | None = None
+    end: int | None = None
+    key: str | None = None
 
 
 class PageInformation(BaseModel):
-    page:
-    page_with_visual:
+    page: int | None = Field(default=None, title="Page Information Page")
+    page_with_visual: bool | None = None
 
 
 class Representation(BaseModel):
-    is_a_table:
-    reference_file:
+    is_a_table: bool | None = None
+    reference_file: str | None = None
 
 
 class ParagraphRelations(BaseModel):
@@ -251,10 +251,10 @@ class ParagraphRelations(BaseModel):
 
 
 class Paragraph(BaseModel):
-    start:
-    end:
-    start_seconds:
-    end_seconds:
+    start: int | None = None
+    end: int | None = None
+    start_seconds: list[int] | None = None
+    end_seconds: list[int] | None = None
 
     class TypeParagraph(str, Enum):
         TEXT = "TEXT"
@@ -265,35 +265,35 @@ class Paragraph(BaseModel):
         TITLE = "TITLE"
         TABLE = "TABLE"
 
-    kind:
-    classifications:
-    sentences:
-    key:
-    page:
-    representation:
-    relations:
+    kind: TypeParagraph | None = None
+    classifications: list[Classification] | None = None
+    sentences: list[Sentence] | None = None
+    key: str | None = None
+    page: PageInformation | None = None
+    representation: Representation | None = None
+    relations: ParagraphRelations | None = None
 
 
 class Shards(BaseModel):
-    shards:
+    shards: list[str] | None = None
 
 
 class Question(BaseModel):
     text: str
-    language:
-    ids_paragraphs:
+    language: str | None = None
+    ids_paragraphs: list[str]
 
 
 class Answer(BaseModel):
     text: str
-    language:
-    ids_paragraphs:
+    language: str | None = None
+    ids_paragraphs: list[str]
 
 
 class QuestionAnswer(BaseModel):
     question: Question
-    answers:
+    answers: list[Answer]
 
 
 class QuestionAnswers(BaseModel):
-    question_answer:
+    question_answer: list[QuestionAnswer]
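common.py is almost entirely annotation spelling: every field that lost its type in the old listing is now an explicit `X | None = None` (or a concrete `list[...]`), and `Classification` gains `Field` titles. Based only on the fields visible above, a question/answer payload nests as in this standalone sketch (models redeclared locally so the snippet runs without the package):

from pydantic import BaseModel


class Question(BaseModel):
    text: str
    language: str | None = None
    ids_paragraphs: list[str]


class Answer(BaseModel):
    text: str
    language: str | None = None
    ids_paragraphs: list[str]


class QuestionAnswer(BaseModel):
    question: Question
    answers: list[Answer]


class QuestionAnswers(BaseModel):
    question_answer: list[QuestionAnswer]


qa = QuestionAnswers(
    question_answer=[
        QuestionAnswer(
            question=Question(text="What changed in 6.11?", ids_paragraphs=["p1"]),
            answers=[Answer(text="Mostly type annotations.", ids_paragraphs=["p1"])],
        )
    ]
)
print(qa.model_dump_json(exclude_none=True))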
nucliadb_models/configuration.py CHANGED

@@ -14,7 +14,7 @@
 #
 
 import warnings
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any, Literal
 
 from pydantic import BaseModel, Field, create_model
 
@@ -28,11 +28,11 @@ class KBConfiguration(BaseModel):
         super().__init__(**data)
 
     # Do not touch this model synced on Processing side
-    semantic_model:
-    generative_model:
-    ner_model:
-    anonymization_model:
-    visual_labeling:
+    semantic_model: str | None = None
+    generative_model: str | None = None
+    ner_model: str | None = None
+    anonymization_model: str | None = None
+    visual_labeling: str | None = None
 
 
 #
@@ -57,7 +57,7 @@ class FindSearchConfiguration(BaseModel):
 AskConfig = create_model(
     "AskConfig",
     **_model_fields(AskRequest, skip=["query", "search_configuration"]),
-    query=(
+    query=(str | None, None),
 )
 
 
@@ -67,7 +67,7 @@ class AskSearchConfiguration(BaseModel):
 
 
 SearchConfiguration = Annotated[
-
+    FindSearchConfiguration | AskSearchConfiguration, Field(discriminator="kind")
 ]
 
 # We need this to avoid issues with pydantic and generic types defined in another module
nucliadb_models/content_types.py CHANGED

@@ -14,7 +14,6 @@
 
 
 import mimetypes
-from typing import Optional
 
 GENERIC_MIME_TYPE = "application/generic"
 
@@ -26,7 +25,9 @@ NUCLIA_CUSTOM_CONTENT_TYPES = {
 
 EXTRA_VALID_CONTENT_TYPES = {
     "application/font-woff",
+    "application/javascript",
     "application/mp4",
+    "application/rtf",
     "application/toml",
     "application/vnd.jgraph.mxfile",
     "application/vnd.ms-excel.sheet.macroenabled.12",
@@ -38,6 +39,7 @@ EXTRA_VALID_CONTENT_TYPES = {
     "application/x-git",
     "application/x-gzip",
     "application/x-iwork-pages-sffpages",
+    "application/x-javascript",
     "application/x-mach-binary",
     "application/x-mobipocket-ebook",
     "application/x-ms-shortcut",
@@ -46,10 +48,15 @@ EXTRA_VALID_CONTENT_TYPES = {
     "application/x-openscad",
     "application/x-sql",
     "application/x-zip-compressed",
+    "application/x-zip",
     "application/zstd",
+    "audio/m4a",
     "audio/vnd.dlna.adts",
     "audio/wav",
     "audio/x-m4a",
+    "image/svg+xml",
+    "image/tif",
+    "image/x-ico",
     "model/stl",
     "multipart/form-data",
     "text/jsx",
@@ -58,26 +65,21 @@ EXTRA_VALID_CONTENT_TYPES = {
     "text/rtf",
     "text/x-c++",
     "text/x-java-source",
+    "text/x-javascript",
     "text/x-log",
     "text/x-python-script",
     "text/x-ruby-script",
     "text/yaml",
-    "video/
-    "video/YouTube",
-    "image/tif",
+    "video/mkv",
     "video/qt",
     "video/webp",
-    "
-    "application/x-zip",
-    "video/mkv",
-    "image/x-ico",
-    "audio/m4a",
-    "image/svg+xml",
+    "video/x-m4v",
     "video/x-msvideo",
+    "video/YouTube",
 } | NUCLIA_CUSTOM_CONTENT_TYPES
 
 
-def guess(filename: str) ->
+def guess(filename: str) -> str | None:
     """
     Guess the content type of a file based on its filename.
     Returns None if the content type could not be guessed.
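The section is cut off inside guess(), but its contract is already visible from the signature and docstring: map a filename to a content-type string, presumably via the standard mimetypes module it imports, and return None when nothing can be guessed. A small usage sketch under those assumptions:

from nucliadb_models.content_types import guess

print(guess("report.pdf"))        # typically "application/pdf" via mimetypes
print(guess("notes.unknownext"))  # None when the type cannot be guessed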