nucliadb-models 6.9.3.post5300__py3-none-any.whl → 6.9.7.post5550__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb-models might be problematic; the version diff below is provided for review.

@@ -0,0 +1,246 @@
1
+ # Copyright 2025 Bosutech XXI S.L.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+ from enum import Enum
17
+ from typing import Annotated
18
+
19
+ from pydantic import BaseModel, Field, StringConstraints, model_validator
20
+ from typing_extensions import Self
21
+
22
+ from nucliadb_models import filters
23
+ from nucliadb_models.common import FieldTypeName
24
+ from nucliadb_models.resource import ExtractedDataTypeName, Resource
25
+ from nucliadb_models.search import Image, ResourceProperties
26
+
27
+ ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
28
+ ResourceId = Annotated[
29
+ str,
30
+ StringConstraints(pattern=ResourceIdPattern, min_length=32, max_length=36),
31
+ ]
32
+
33
+ FieldIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?$"
34
+ FieldId = Annotated[
35
+ str,
36
+ StringConstraints(
37
+ pattern=FieldIdPattern,
38
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0,
39
+ # max field id of 250
40
+ max_length=32 + 1 + 1 + 1 + 250 + 1 + 218,
41
+ ),
42
+ ]
43
+
44
+ ParagraphIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$"
45
+ ParagraphId = Annotated[
46
+ str,
47
+ StringConstraints(
48
+ # resource-uuid/field-type/field-id/[split-id/]paragraph-id
49
+ pattern=ParagraphIdPattern,
50
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
51
+ # max field id of 250 and 10 digit paragraphs. More than enough
52
+ max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
53
+ ),
54
+ ]
55
+
56
+
57
+ # Request
58
+
59
+
60
+ class ResourceProp(str, Enum):
61
+ """Superset of former `show` and `extracted` serializations options."""
62
+
63
+ # `show` props
64
+ BASIC = "basic"
65
+ ORIGIN = "origin"
66
+ EXTRA = "extra"
67
+ RELATIONS = "relations"
68
+ VALUES = "values"
69
+ ERRORS = "errors"
70
+ SECURITY = "security"
71
+ # `extracted` props
72
+ EXTRACTED_TEXT = "extracted_text"
73
+ EXTRACTED_METADATA = "extracted_metadata"
74
+ EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
75
+ EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
76
+ EXTRACTED_VECTOR = "extracted_vectors"
77
+ EXTRACTED_LINK = "extracted_link"
78
+ EXTRACTED_FILE = "extracted_file"
79
+ EXTRACTED_QA = "extracted_question_answers"
80
+ # new granular props
81
+ TITLE = "title"
82
+ SUMMARY = "summary"
83
+ CLASSIFICATION_LABELS = "classification_labels"
84
+
85
+ @classmethod
86
+ def from_show_and_extracted(
87
+ cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
88
+ ) -> list["ResourceProp"]:
89
+ _show_to_prop = {
90
+ ResourceProperties.BASIC: cls.BASIC,
91
+ ResourceProperties.ORIGIN: cls.ORIGIN,
92
+ ResourceProperties.EXTRA: cls.EXTRA,
93
+ ResourceProperties.RELATIONS: cls.RELATIONS,
94
+ ResourceProperties.VALUES: cls.VALUES,
95
+ ResourceProperties.ERRORS: cls.ERRORS,
96
+ ResourceProperties.SECURITY: cls.SECURITY,
97
+ }
98
+ _extracted_to_prop = {
99
+ ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
100
+ ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
101
+ ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
102
+ ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
103
+ ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
104
+ ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
105
+ ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
106
+ ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
107
+ }
108
+
109
+ props = []
110
+ for s in show:
111
+ show_prop = _show_to_prop.get(s)
112
+ # show=extracted is not in the dict
113
+ if show_prop is None:
114
+ continue
115
+ props.append(show_prop)
116
+
117
+ if ResourceProperties.EXTRACTED in show:
118
+ for e in extracted:
119
+ extracted_prop = _extracted_to_prop[e]
120
+ props.append(extracted_prop)
121
+
122
+ return props
123
+
124
+
125
+ class AugmentResourceFields(BaseModel):
126
+ text: bool = False
127
+ classification_labels: bool = False
128
+
129
+ filters: list[filters.Field | filters.Generated]
130
+
131
+
132
+ class AugmentResources(BaseModel):
133
+ given: list[ResourceId]
134
+
135
+ select: list[ResourceProp] = Field(default_factory=list)
136
+
137
+ field_type_filter: list[FieldTypeName] | None = Field(
138
+ default=None,
139
+ deprecated="Only use this for legacy resource serialization",
140
+ title="Field type filter",
141
+ description=(
142
+ "Define which field types are serialized on resources of search results. "
143
+ "If omitted and legacy serialization is used, all field types will be serialized"
144
+ ),
145
+ )
146
+
147
+ fields: AugmentResourceFields | None = None
148
+
149
+ @model_validator(mode="after")
150
+ def bwc_resource_serialization(self) -> Self:
151
+ if self.field_type_filter is not None and self.fields is not None:
152
+ raise ValueError("`field_type_filter` and `fields` are incompatible together")
153
+
154
+ return self
155
+
156
+
157
+ class AugmentFields(BaseModel):
158
+ given: list[FieldId]
159
+
160
+ text: bool = False
161
+ classification_labels: bool = False
162
+ entities: bool = False # also known as ners
163
+
164
+
165
+ class ParagraphMetadata(BaseModel):
166
+ field_labels: list[str]
167
+ paragraph_labels: list[str]
168
+
169
+ is_an_image: bool
170
+ is_a_table: bool
171
+
172
+ # for extracted from visual content (ocr, inception, tables)
173
+ source_file: str | None
174
+
175
+ # for documents (pdf, docx...) only
176
+ page: int | None
177
+ in_page_with_visual: bool | None
178
+
179
+
180
+ class AugmentParagraph(BaseModel):
181
+ id: ParagraphId
182
+ metadata: ParagraphMetadata | None = None
183
+
184
+
185
+ class AugmentParagraphs(BaseModel):
186
+ given: list[AugmentParagraph]
187
+
188
+ text: bool = True
189
+
190
+ neighbours_before: int = 0
191
+ neighbours_after: int = 0
192
+
193
+ # paragraph extracted from an image, return an image
194
+ source_image: bool = False
195
+
196
+ # paragraph extracted from a table, return table image
197
+ table_image: bool = False
198
+
199
+ # return page_preview instead of table image if table image enabled
200
+ table_prefers_page_preview: bool = False
201
+
202
+ # paragraph from a page, return page preview image
203
+ page_preview_image: bool = False
204
+
205
+
206
+ class AugmentRequest(BaseModel):
207
+ resources: AugmentResources | None = None
208
+ fields: AugmentFields | None = None
209
+ paragraphs: AugmentParagraphs | None = None
210
+
211
+
212
+ # Response
213
+
214
+
215
+ class AugmentedParagraph(BaseModel):
216
+ text: str | None = None
217
+
218
+ neighbours_before: list[ParagraphId] | None = None
219
+ neighbours_after: list[ParagraphId] | None = None
220
+
221
+ image: Image | None = None
222
+
223
+
224
+ class AugmentedField(BaseModel):
225
+ text: str | None = None
226
+
227
+ classification_labels: dict[str, list[str]] | None = None
228
+
229
+ # former ners
230
+ entities: dict[str, list[str]] | None = None
231
+
232
+ page_preview_image: Image | None = None
233
+
234
+
235
+ class AugmentedResource(Resource):
236
+ classification_labels: dict[str, list[str]] | None = None
237
+
238
+ def updated_from(self, origin: Resource):
239
+ for key in origin.model_fields.keys():
240
+ self.__setattr__(key, getattr(origin, key))
241
+
242
+
243
+ class AugmentResponse(BaseModel):
244
+ resources: dict[ResourceId, AugmentedResource]
245
+ fields: dict[FieldId, AugmentedField]
246
+ paragraphs: dict[ParagraphId, AugmentedParagraph]
nucliadb_models/common.py CHANGED
@@ -108,7 +108,7 @@ class File(BaseModel):
108
108
  if self.md5 is None:
109
109
  # In case md5 is not supplied, compute it
110
110
  try:
111
- result = hashlib.md5(base64.b64decode(self.payload))
111
+ result = hashlib.md5(base64.b64decode(self.payload), usedforsecurity=False)
112
112
  self.md5 = result.hexdigest()
113
113
  except Exception:
114
114
  raise ValueError("MD5 could not be computed")
@@ -276,7 +276,7 @@ class Generated(FilterProp, extra="forbid"):
276
276
  by: Literal["data-augmentation"] = pydantic.Field(
277
277
  description="Generator for this field. Currently, only data-augmentation is supported"
278
278
  )
279
- da_task: Optional["str"] = pydantic.Field(
279
+ da_task: Optional[str] = pydantic.Field(
280
280
  default=None, description="Matches field generated by an specific DA task, given its prefix"
281
281
  )
282
282
 
@@ -247,8 +247,8 @@ ParagraphId = Annotated[
247
247
  str,
248
248
  StringConstraints(
249
249
  # resource-uuid/field-type/field-id/[split-id/]paragraph-id
250
- pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+/([^/]{1,128}/)?[0-9]+-[0-9]+$",
251
- min_length=32 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 3,
250
+ pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
251
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
252
252
  # max field id of 250 and 10 digit paragraphs. More than enough
253
253
  max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
254
254
  ),
nucliadb_models/search.py CHANGED
@@ -347,10 +347,12 @@ SortOrderMap = {
347
347
 
348
348
  class SortOptions(BaseModel):
349
349
  field: SortField
350
- limit: Optional[int] = Field(None, gt=0)
351
350
  order: SortOrder = SortOrder.DESC
352
351
 
353
352
 
353
+ MAX_RANK_FUSION_WINDOW = 500
354
+
355
+
354
356
  class RankFusionName(str, Enum):
355
357
  RECIPROCAL_RANK_FUSION = "rrf"
356
358
 
@@ -380,7 +382,7 @@ class ReciprocalRankFusion(_BaseRankFusion):
380
382
  )
381
383
  window: Optional[int] = Field(
382
384
  default=None,
383
- le=500,
385
+ le=MAX_RANK_FUSION_WINDOW,
384
386
  title="RRF window",
385
387
  description="Number of elements for retrieval to do RRF. Window must be greater or equal to top_k. Greater values will increase probability of multi match at cost of retrieval time", # noqa: E501
386
388
  )
@@ -503,10 +505,18 @@ class SearchParamDefaults:
503
505
  )
504
506
  top_k = ParamDefault(
505
507
  default=20,
508
+ gt=-1,
506
509
  le=200,
507
510
  title="Top k",
508
511
  description="The number of results search should return. The maximum number of results allowed is 200.",
509
512
  )
513
+ offset = ParamDefault(
514
+ default=0,
515
+ gt=-1,
516
+ le=1000,
517
+ title="Results offset",
518
+ description="The number of results to skip, starting from the beginning in sort order. Used for pagination. It can only be used with the keyword and fulltext indexes.",
519
+ )
510
520
  highlight = ParamDefault(
511
521
  default=False,
512
522
  title="Highlight",
@@ -532,12 +542,6 @@ class SearchParamDefaults:
532
542
  title="Sort order",
533
543
  description="Order to sort results with",
534
544
  )
535
- sort_limit = ParamDefault(
536
- default=None,
537
- title="Sort limit",
538
- description="",
539
- gt=0,
540
- )
541
545
  sort_field = ParamDefault(
542
546
  default=None,
543
547
  title="Sort field",
@@ -938,12 +942,32 @@ class SearchRequest(BaseSearchRequest):
938
942
  )
939
943
  faceted: list[str] = SearchParamDefaults.faceted.to_pydantic_field()
940
944
  sort: Optional[SortOptions] = SearchParamDefaults.sort.to_pydantic_field()
945
+ offset: int = SearchParamDefaults.offset.to_pydantic_field()
941
946
 
942
947
  @field_validator("faceted")
943
948
  @classmethod
944
949
  def nested_facets_not_supported(cls, facets):
945
950
  return validate_facets(facets)
946
951
 
952
+ @model_validator(mode="after")
953
+ def offset_sort_only_on_keyword_indexes(self):
954
+ has_non_keyword_indexes = set(self.features) & {SearchOptions.SEMANTIC, SearchOptions.RELATIONS}
955
+ if has_non_keyword_indexes:
956
+ if self.offset > 0:
957
+ raise ValueError("offset cannot be used with the semantic or relations index")
958
+ if self.sort and self.sort.field != SortField.SCORE:
959
+ raise ValueError("sort by date cannot be used with the semantic or relations index")
960
+
961
+ return self
962
+
963
+ @field_validator("sort", mode="after")
964
+ @classmethod
965
+ def sorting_by_title_not_supported(cls, value: Optional[SortOptions]) -> Optional[SortOptions]:
966
+ if value and value.field == SortField.TITLE:
967
+ raise ValueError("sorting by title not supported in /search")
968
+
969
+ return value
970
+
947
971
 
948
972
  class Author(str, Enum):
949
973
  NUCLIA = "NUCLIA"
@@ -2038,8 +2062,10 @@ class FindResource(Resource):
2038
2062
  fields: dict[str, FindField]
2039
2063
 
2040
2064
  def updated_from(self, origin: Resource):
2065
+ find_resource_model_fields = self.model_fields.keys()
2041
2066
  for key in origin.model_fields.keys():
2042
- self.__setattr__(key, getattr(origin, key))
2067
+ if key in find_resource_model_fields:
2068
+ self.__setattr__(key, getattr(origin, key))
2043
2069
 
2044
2070
 
2045
2071
  class KnowledgeboxFindResults(JsonBaseModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb_models
3
- Version: 6.9.3.post5300
3
+ Version: 6.9.7.post5550
4
4
  Author-email: Nuclia <nucliadb@nuclia.com>
5
5
  License-Expression: Apache-2.0
6
6
  Project-URL: Homepage, https://nuclia.com
@@ -1,5 +1,6 @@
1
1
  nucliadb_models/__init__.py,sha256=3y8-htogKuCZcbhaUZdSjTeEjUSeec9aRWyL8AlKCyM,1077
2
- nucliadb_models/common.py,sha256=2dtKG4ZNi9p-yoNY76Uvyu1SlMeNYpH-MnuU3Q6w9Js,8169
2
+ nucliadb_models/augment.py,sha256=HQLWVY4BfFNhL-H1E8j1qU2fKB1pipbhv5n1F0mBlCI,7557
3
+ nucliadb_models/common.py,sha256=gVG5kOhOwQZR-t5n3b9-hANMlLy2CHelUU5PPUf3bck,8192
3
4
  nucliadb_models/configuration.py,sha256=BBrJsNjP324Cw_5J3dBrGwvpkHQYbXEo3TUaI9IqAOg,2449
4
5
  nucliadb_models/content_types.py,sha256=36Ga-iGf4ivCqgtXC7imFgegrwHB117s9eqP62JtGv0,3456
5
6
  nucliadb_models/conversation.py,sha256=k9bKhkDiqhqmdrDfDPNoUfG7-2H_-KAyuOnETd8zV0E,5081
@@ -8,8 +9,8 @@ nucliadb_models/export_import.py,sha256=mNm9IArOLnC6TLupkwqVFhxD5d08mpIVOVFneECv
8
9
  nucliadb_models/external_index_providers.py,sha256=pL3leo4MkuJOnKlU1Sg6GT_mnK_VUBxGui-RPmDYVWU,1126
9
10
  nucliadb_models/extracted.py,sha256=Owz7LC3le3Dvau3TtRiO8NY84meOf6IxN-RrOqqpMPs,5593
10
11
  nucliadb_models/file.py,sha256=tXtgB9c7i2ADsnJ7HdbXyroAmXadGvOeA49htBh7BZo,2263
11
- nucliadb_models/filters.py,sha256=NQI2-4AFzzJuZy8NeY3jXlTbbU5wxiwMCP-5DrD-7lE,14759
12
- nucliadb_models/hydration.py,sha256=Yo8eM99q9wcaCJj8stL5z3k63ouC3LVANZIZLULZxRQ,14363
12
+ nucliadb_models/filters.py,sha256=CyU0JppMzGpVsRXvTHS6otG59XuvqhBjVQf_9Fw01Qc,14757
13
+ nucliadb_models/hydration.py,sha256=SlAzraJE6DX0uOpZWxu2k_9-ikYorsj0t8xwsWSBQZY,14363
13
14
  nucliadb_models/labels.py,sha256=9zqRgkpZuX3kUPwsTTgCH7JyOWK7dM5pwyuHJR86YdU,3949
14
15
  nucliadb_models/link.py,sha256=PF5hHLwdOed5TMBTxtokkgWtMh1bFnORZjybh0NwVCw,2526
15
16
  nucliadb_models/metadata.py,sha256=OOKGy_83NtlG1QKQZEwMuwu4wbVEe7P30Y2QvnGSDto,8933
@@ -17,7 +18,7 @@ nucliadb_models/notifications.py,sha256=mna8-AoD_29Wds0Thl0AF0zpERnJmYGLZX1w1fUo
17
18
  nucliadb_models/processing.py,sha256=nhKuHQjqCdb9zJVkYGPTLub23tK9e_lwL5OCDVymZjY,719
18
19
  nucliadb_models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
20
  nucliadb_models/resource.py,sha256=RzCos0QRgSMkaV-p7EoceSmt7UTzt9G9be5BKF-iGrQ,9021
20
- nucliadb_models/search.py,sha256=gQEXJ9bXXcxswr7aOzvBeGIQlrq5TgRWIqTxKEbSoCE,96409
21
+ nucliadb_models/search.py,sha256=WDhihnfgVrEb_YBxxRkKMCVUcax0qpRIm9HwECxKcZs,97630
21
22
  nucliadb_models/security.py,sha256=opxaDLfvk3aU0sjesK0jGrYLx5h4YCwlKKN0moYs_ig,1150
22
23
  nucliadb_models/synonyms.py,sha256=afbaVqSQSxGLwi2PusVaLSRpkOtA5AZmWOKd1f4nl2E,690
23
24
  nucliadb_models/text.py,sha256=60bxZnOjRHnDdezR8VfR3AZsXTOwePFPs2BKB8wxBak,3414
@@ -33,7 +34,7 @@ nucliadb_models/graph/responses.py,sha256=Sdq8OgFAL1YT-1lJyLLrkqcScvj7YTEqAUwQ-k
33
34
  nucliadb_models/internal/__init__.py,sha256=zG33bUz1rHFPtvqQPWn4rDwBJt3FJodGuQYD45quiQg,583
34
35
  nucliadb_models/internal/predict.py,sha256=Pnx6MmLfK65eExe1XnVxqmSlvMwdowewwks9BOEoqMw,2029
35
36
  nucliadb_models/internal/shards.py,sha256=__y1OZtWGiNcPQEWfSFOj8yw458WGi7mM4vZe0K-L1Y,1691
36
- nucliadb_models-6.9.3.post5300.dist-info/METADATA,sha256=vXycSKTBX_0_Wv2DVKq1MM_L5XqmrT77eeXqgaTuM_c,745
37
- nucliadb_models-6.9.3.post5300.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
- nucliadb_models-6.9.3.post5300.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
39
- nucliadb_models-6.9.3.post5300.dist-info/RECORD,,
37
+ nucliadb_models-6.9.7.post5550.dist-info/METADATA,sha256=WGOjysUKnl_Ggf5DUXFhXmfhPmYXC1abXDtkCJvIkp0,745
38
+ nucliadb_models-6.9.7.post5550.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ nucliadb_models-6.9.7.post5550.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
40
+ nucliadb_models-6.9.7.post5550.dist-info/RECORD,,