nucliadb-models 6.9.3.post5300__py3-none-any.whl → 6.9.7.post5550__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb-models might be problematic; the version diff below is provided for review.

@@ -0,0 +1,246 @@
1
+ # Copyright 2025 Bosutech XXI S.L.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+
16
+ from enum import Enum
17
+ from typing import Annotated
18
+
19
+ from pydantic import BaseModel, Field, StringConstraints, model_validator
20
+ from typing_extensions import Self
21
+
22
+ from nucliadb_models import filters
23
+ from nucliadb_models.common import FieldTypeName
24
+ from nucliadb_models.resource import ExtractedDataTypeName, Resource
25
+ from nucliadb_models.search import Image, ResourceProperties
26
+
27
+ ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
28
+ ResourceId = Annotated[
29
+ str,
30
+ StringConstraints(pattern=ResourceIdPattern, min_length=32, max_length=36),
31
+ ]
32
+
33
+ FieldIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?$"
34
+ FieldId = Annotated[
35
+ str,
36
+ StringConstraints(
37
+ pattern=FieldIdPattern,
38
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0,
39
+ # max field id of 250
40
+ max_length=32 + 1 + 1 + 1 + 250 + 1 + 218,
41
+ ),
42
+ ]
43
+
44
+ ParagraphIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$"
45
+ ParagraphId = Annotated[
46
+ str,
47
+ StringConstraints(
48
+ # resource-uuid/field-type/field-id/[split-id/]paragraph-id
49
+ pattern=ParagraphIdPattern,
50
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
51
+ # max field id of 250 and 10 digit paragraphs. More than enough
52
+ max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
53
+ ),
54
+ ]
55
+
56
+
57
+ # Request
58
+
59
+
60
+ class ResourceProp(str, Enum):
61
+ """Superset of former `show` and `extracted` serializations options."""
62
+
63
+ # `show` props
64
+ BASIC = "basic"
65
+ ORIGIN = "origin"
66
+ EXTRA = "extra"
67
+ RELATIONS = "relations"
68
+ VALUES = "values"
69
+ ERRORS = "errors"
70
+ SECURITY = "security"
71
+ # `extracted` props
72
+ EXTRACTED_TEXT = "extracted_text"
73
+ EXTRACTED_METADATA = "extracted_metadata"
74
+ EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
75
+ EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
76
+ EXTRACTED_VECTOR = "extracted_vectors"
77
+ EXTRACTED_LINK = "extracted_link"
78
+ EXTRACTED_FILE = "extracted_file"
79
+ EXTRACTED_QA = "extracted_question_answers"
80
+ # new granular props
81
+ TITLE = "title"
82
+ SUMMARY = "summary"
83
+ CLASSIFICATION_LABELS = "classification_labels"
84
+
85
+ @classmethod
86
+ def from_show_and_extracted(
87
+ cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
88
+ ) -> list["ResourceProp"]:
89
+ _show_to_prop = {
90
+ ResourceProperties.BASIC: cls.BASIC,
91
+ ResourceProperties.ORIGIN: cls.ORIGIN,
92
+ ResourceProperties.EXTRA: cls.EXTRA,
93
+ ResourceProperties.RELATIONS: cls.RELATIONS,
94
+ ResourceProperties.VALUES: cls.VALUES,
95
+ ResourceProperties.ERRORS: cls.ERRORS,
96
+ ResourceProperties.SECURITY: cls.SECURITY,
97
+ }
98
+ _extracted_to_prop = {
99
+ ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
100
+ ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
101
+ ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
102
+ ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
103
+ ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
104
+ ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
105
+ ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
106
+ ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
107
+ }
108
+
109
+ props = []
110
+ for s in show:
111
+ show_prop = _show_to_prop.get(s)
112
+ # show=extracted is not in the dict
113
+ if show_prop is None:
114
+ continue
115
+ props.append(show_prop)
116
+
117
+ if ResourceProperties.EXTRACTED in show:
118
+ for e in extracted:
119
+ extracted_prop = _extracted_to_prop[e]
120
+ props.append(extracted_prop)
121
+
122
+ return props
123
+
124
+
125
+ class AugmentResourceFields(BaseModel):
126
+ text: bool = False
127
+ classification_labels: bool = False
128
+
129
+ filters: list[filters.Field | filters.Generated]
130
+
131
+
132
+ class AugmentResources(BaseModel):
133
+ given: list[ResourceId]
134
+
135
+ select: list[ResourceProp] = Field(default_factory=list)
136
+
137
+ field_type_filter: list[FieldTypeName] | None = Field(
138
+ default=None,
139
+ deprecated="Only use this for legacy resource serialization",
140
+ title="Field type filter",
141
+ description=(
142
+ "Define which field types are serialized on resources of search results. "
143
+ "If omitted and legacy serialization is used, all field types will be serialized"
144
+ ),
145
+ )
146
+
147
+ fields: AugmentResourceFields | None = None
148
+
149
+ @model_validator(mode="after")
150
+ def bwc_resource_serialization(self) -> Self:
151
+ if self.field_type_filter is not None and self.fields is not None:
152
+ raise ValueError("`field_type_filter` and `fields` are incompatible together")
153
+
154
+ return self
155
+
156
+
157
+ class AugmentFields(BaseModel):
158
+ given: list[FieldId]
159
+
160
+ text: bool = False
161
+ classification_labels: bool = False
162
+ entities: bool = False # also known as ners
163
+
164
+
165
+ class ParagraphMetadata(BaseModel):
166
+ field_labels: list[str]
167
+ paragraph_labels: list[str]
168
+
169
+ is_an_image: bool
170
+ is_a_table: bool
171
+
172
+ # for extracted from visual content (ocr, inception, tables)
173
+ source_file: str | None
174
+
175
+ # for documents (pdf, docx...) only
176
+ page: int | None
177
+ in_page_with_visual: bool | None
178
+
179
+
180
+ class AugmentParagraph(BaseModel):
181
+ id: ParagraphId
182
+ metadata: ParagraphMetadata | None = None
183
+
184
+
185
+ class AugmentParagraphs(BaseModel):
186
+ given: list[AugmentParagraph]
187
+
188
+ text: bool = True
189
+
190
+ neighbours_before: int = 0
191
+ neighbours_after: int = 0
192
+
193
+ # paragraph extracted from an image, return an image
194
+ source_image: bool = False
195
+
196
+ # paragraph extracted from a table, return table image
197
+ table_image: bool = False
198
+
199
+ # return page_preview instead of table image if table image enabled
200
+ table_prefers_page_preview: bool = False
201
+
202
+ # paragraph from a page, return page preview image
203
+ page_preview_image: bool = False
204
+
205
+
206
+ class AugmentRequest(BaseModel):
207
+ resources: AugmentResources | None = None
208
+ fields: AugmentFields | None = None
209
+ paragraphs: AugmentParagraphs | None = None
210
+
211
+
212
+ # Response
213
+
214
+
215
+ class AugmentedParagraph(BaseModel):
216
+ text: str | None = None
217
+
218
+ neighbours_before: list[ParagraphId] | None = None
219
+ neighbours_after: list[ParagraphId] | None = None
220
+
221
+ image: Image | None = None
222
+
223
+
224
+ class AugmentedField(BaseModel):
225
+ text: str | None = None
226
+
227
+ classification_labels: dict[str, list[str]] | None = None
228
+
229
+ # former ners
230
+ entities: dict[str, list[str]] | None = None
231
+
232
+ page_preview_image: Image | None = None
233
+
234
+
235
+ class AugmentedResource(Resource):
236
+ classification_labels: dict[str, list[str]] | None = None
237
+
238
+ def updated_from(self, origin: Resource):
239
+ for key in origin.model_fields.keys():
240
+ self.__setattr__(key, getattr(origin, key))
241
+
242
+
243
+ class AugmentResponse(BaseModel):
244
+ resources: dict[ResourceId, AugmentedResource]
245
+ fields: dict[FieldId, AugmentedField]
246
+ paragraphs: dict[ParagraphId, AugmentedParagraph]
nucliadb_models/common.py CHANGED
@@ -108,7 +108,7 @@ class File(BaseModel):
108
108
  if self.md5 is None:
109
109
  # In case md5 is not supplied, compute it
110
110
  try:
111
- result = hashlib.md5(base64.b64decode(self.payload))
111
+ result = hashlib.md5(base64.b64decode(self.payload), usedforsecurity=False)
112
112
  self.md5 = result.hexdigest()
113
113
  except Exception:
114
114
  raise ValueError("MD5 could not be computed")
@@ -276,7 +276,7 @@ class Generated(FilterProp, extra="forbid"):
276
276
  by: Literal["data-augmentation"] = pydantic.Field(
277
277
  description="Generator for this field. Currently, only data-augmentation is supported"
278
278
  )
279
- da_task: Optional["str"] = pydantic.Field(
279
+ da_task: Optional[str] = pydantic.Field(
280
280
  default=None, description="Matches field generated by an specific DA task, given its prefix"
281
281
  )
282
282
 
@@ -247,8 +247,8 @@ ParagraphId = Annotated[
247
247
  str,
248
248
  StringConstraints(
249
249
  # resource-uuid/field-type/field-id/[split-id/]paragraph-id
250
- pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+/([^/]{1,128}/)?[0-9]+-[0-9]+$",
251
- min_length=32 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 3,
250
+ pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
251
+ min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
252
252
  # max field id of 250 and 10 digit paragraphs. More than enough
253
253
  max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
254
254
  ),
nucliadb_models/search.py CHANGED
@@ -347,10 +347,12 @@ SortOrderMap = {
347
347
 
348
348
  class SortOptions(BaseModel):
349
349
  field: SortField
350
- limit: Optional[int] = Field(None, gt=0)
351
350
  order: SortOrder = SortOrder.DESC
352
351
 
353
352
 
353
+ MAX_RANK_FUSION_WINDOW = 500
354
+
355
+
354
356
  class RankFusionName(str, Enum):
355
357
  RECIPROCAL_RANK_FUSION = "rrf"
356
358
 
@@ -380,7 +382,7 @@ class ReciprocalRankFusion(_BaseRankFusion):
380
382
  )
381
383
  window: Optional[int] = Field(
382
384
  default=None,
383
- le=500,
385
+ le=MAX_RANK_FUSION_WINDOW,
384
386
  title="RRF window",
385
387
  description="Number of elements for retrieval to do RRF. Window must be greater or equal to top_k. Greater values will increase probability of multi match at cost of retrieval time", # noqa: E501
386
388
  )
@@ -503,10 +505,18 @@ class SearchParamDefaults:
503
505
  )
504
506
  top_k = ParamDefault(
505
507
  default=20,
508
+ gt=-1,
506
509
  le=200,
507
510
  title="Top k",
508
511
  description="The number of results search should return. The maximum number of results allowed is 200.",
509
512
  )
513
+ offset = ParamDefault(
514
+ default=0,
515
+ gt=-1,
516
+ le=1000,
517
+ title="Results offset",
518
+ description="The number of results to skip, starting from the beginning in sort order. Used for pagination. It can only be used with the keyword and fulltext indexes.",
519
+ )
510
520
  highlight = ParamDefault(
511
521
  default=False,
512
522
  title="Highlight",
@@ -532,12 +542,6 @@ class SearchParamDefaults:
532
542
  title="Sort order",
533
543
  description="Order to sort results with",
534
544
  )
535
- sort_limit = ParamDefault(
536
- default=None,
537
- title="Sort limit",
538
- description="",
539
- gt=0,
540
- )
541
545
  sort_field = ParamDefault(
542
546
  default=None,
543
547
  title="Sort field",
@@ -938,12 +942,32 @@ class SearchRequest(BaseSearchRequest):
938
942
  )
939
943
  faceted: list[str] = SearchParamDefaults.faceted.to_pydantic_field()
940
944
  sort: Optional[SortOptions] = SearchParamDefaults.sort.to_pydantic_field()
945
+ offset: int = SearchParamDefaults.offset.to_pydantic_field()
941
946
 
942
947
  @field_validator("faceted")
943
948
  @classmethod
944
949
  def nested_facets_not_supported(cls, facets):
945
950
  return validate_facets(facets)
946
951
 
952
+ @model_validator(mode="after")
953
+ def offset_sort_only_on_keyword_indexes(self):
954
+ has_non_keyword_indexes = set(self.features) & {SearchOptions.SEMANTIC, SearchOptions.RELATIONS}
955
+ if has_non_keyword_indexes:
956
+ if self.offset > 0:
957
+ raise ValueError("offset cannot be used with the semantic or relations index")
958
+ if self.sort and self.sort.field != SortField.SCORE:
959
+ raise ValueError("sort by date cannot be used with the semantic or relations index")
960
+
961
+ return self
962
+
963
+ @field_validator("sort", mode="after")
964
+ @classmethod
965
+ def sorting_by_title_not_supported(cls, value: Optional[SortOptions]) -> Optional[SortOptions]:
966
+ if value and value.field == SortField.TITLE:
967
+ raise ValueError("sorting by title not supported in /search")
968
+
969
+ return value
970
+
947
971
 
948
972
  class Author(str, Enum):
949
973
  NUCLIA = "NUCLIA"
@@ -2038,8 +2062,10 @@ class FindResource(Resource):
2038
2062
  fields: dict[str, FindField]
2039
2063
 
2040
2064
  def updated_from(self, origin: Resource):
2065
+ find_resource_model_fields = self.model_fields.keys()
2041
2066
  for key in origin.model_fields.keys():
2042
- self.__setattr__(key, getattr(origin, key))
2067
+ if key in find_resource_model_fields:
2068
+ self.__setattr__(key, getattr(origin, key))
2043
2069
 
2044
2070
 
2045
2071
  class KnowledgeboxFindResults(JsonBaseModel):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb_models
3
- Version: 6.9.3.post5300
3
+ Version: 6.9.7.post5550
4
4
  Author-email: Nuclia <nucliadb@nuclia.com>
5
5
  License-Expression: Apache-2.0
6
6
  Project-URL: Homepage, https://nuclia.com
@@ -1,5 +1,6 @@
1
1
  nucliadb_models/__init__.py,sha256=3y8-htogKuCZcbhaUZdSjTeEjUSeec9aRWyL8AlKCyM,1077
2
- nucliadb_models/common.py,sha256=2dtKG4ZNi9p-yoNY76Uvyu1SlMeNYpH-MnuU3Q6w9Js,8169
2
+ nucliadb_models/augment.py,sha256=HQLWVY4BfFNhL-H1E8j1qU2fKB1pipbhv5n1F0mBlCI,7557
3
+ nucliadb_models/common.py,sha256=gVG5kOhOwQZR-t5n3b9-hANMlLy2CHelUU5PPUf3bck,8192
3
4
  nucliadb_models/configuration.py,sha256=BBrJsNjP324Cw_5J3dBrGwvpkHQYbXEo3TUaI9IqAOg,2449
4
5
  nucliadb_models/content_types.py,sha256=36Ga-iGf4ivCqgtXC7imFgegrwHB117s9eqP62JtGv0,3456
5
6
  nucliadb_models/conversation.py,sha256=k9bKhkDiqhqmdrDfDPNoUfG7-2H_-KAyuOnETd8zV0E,5081
@@ -8,8 +9,8 @@ nucliadb_models/export_import.py,sha256=mNm9IArOLnC6TLupkwqVFhxD5d08mpIVOVFneECv
8
9
  nucliadb_models/external_index_providers.py,sha256=pL3leo4MkuJOnKlU1Sg6GT_mnK_VUBxGui-RPmDYVWU,1126
9
10
  nucliadb_models/extracted.py,sha256=Owz7LC3le3Dvau3TtRiO8NY84meOf6IxN-RrOqqpMPs,5593
10
11
  nucliadb_models/file.py,sha256=tXtgB9c7i2ADsnJ7HdbXyroAmXadGvOeA49htBh7BZo,2263
11
- nucliadb_models/filters.py,sha256=NQI2-4AFzzJuZy8NeY3jXlTbbU5wxiwMCP-5DrD-7lE,14759
12
- nucliadb_models/hydration.py,sha256=Yo8eM99q9wcaCJj8stL5z3k63ouC3LVANZIZLULZxRQ,14363
12
+ nucliadb_models/filters.py,sha256=CyU0JppMzGpVsRXvTHS6otG59XuvqhBjVQf_9Fw01Qc,14757
13
+ nucliadb_models/hydration.py,sha256=SlAzraJE6DX0uOpZWxu2k_9-ikYorsj0t8xwsWSBQZY,14363
13
14
  nucliadb_models/labels.py,sha256=9zqRgkpZuX3kUPwsTTgCH7JyOWK7dM5pwyuHJR86YdU,3949
14
15
  nucliadb_models/link.py,sha256=PF5hHLwdOed5TMBTxtokkgWtMh1bFnORZjybh0NwVCw,2526
15
16
  nucliadb_models/metadata.py,sha256=OOKGy_83NtlG1QKQZEwMuwu4wbVEe7P30Y2QvnGSDto,8933
@@ -17,7 +18,7 @@ nucliadb_models/notifications.py,sha256=mna8-AoD_29Wds0Thl0AF0zpERnJmYGLZX1w1fUo
17
18
  nucliadb_models/processing.py,sha256=nhKuHQjqCdb9zJVkYGPTLub23tK9e_lwL5OCDVymZjY,719
18
19
  nucliadb_models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
20
  nucliadb_models/resource.py,sha256=RzCos0QRgSMkaV-p7EoceSmt7UTzt9G9be5BKF-iGrQ,9021
20
- nucliadb_models/search.py,sha256=gQEXJ9bXXcxswr7aOzvBeGIQlrq5TgRWIqTxKEbSoCE,96409
21
+ nucliadb_models/search.py,sha256=WDhihnfgVrEb_YBxxRkKMCVUcax0qpRIm9HwECxKcZs,97630
21
22
  nucliadb_models/security.py,sha256=opxaDLfvk3aU0sjesK0jGrYLx5h4YCwlKKN0moYs_ig,1150
22
23
  nucliadb_models/synonyms.py,sha256=afbaVqSQSxGLwi2PusVaLSRpkOtA5AZmWOKd1f4nl2E,690
23
24
  nucliadb_models/text.py,sha256=60bxZnOjRHnDdezR8VfR3AZsXTOwePFPs2BKB8wxBak,3414
@@ -33,7 +34,7 @@ nucliadb_models/graph/responses.py,sha256=Sdq8OgFAL1YT-1lJyLLrkqcScvj7YTEqAUwQ-k
33
34
  nucliadb_models/internal/__init__.py,sha256=zG33bUz1rHFPtvqQPWn4rDwBJt3FJodGuQYD45quiQg,583
34
35
  nucliadb_models/internal/predict.py,sha256=Pnx6MmLfK65eExe1XnVxqmSlvMwdowewwks9BOEoqMw,2029
35
36
  nucliadb_models/internal/shards.py,sha256=__y1OZtWGiNcPQEWfSFOj8yw458WGi7mM4vZe0K-L1Y,1691
36
- nucliadb_models-6.9.3.post5300.dist-info/METADATA,sha256=vXycSKTBX_0_Wv2DVKq1MM_L5XqmrT77eeXqgaTuM_c,745
37
- nucliadb_models-6.9.3.post5300.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
38
- nucliadb_models-6.9.3.post5300.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
39
- nucliadb_models-6.9.3.post5300.dist-info/RECORD,,
37
+ nucliadb_models-6.9.7.post5550.dist-info/METADATA,sha256=WGOjysUKnl_Ggf5DUXFhXmfhPmYXC1abXDtkCJvIkp0,745
38
+ nucliadb_models-6.9.7.post5550.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ nucliadb_models-6.9.7.post5550.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
40
+ nucliadb_models-6.9.7.post5550.dist-info/RECORD,,