nucliadb-models 6.9.2.post5276__py3-none-any.whl → 6.9.7.post5583__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb_models/augment.py +339 -0
- nucliadb_models/common.py +1 -1
- nucliadb_models/conversation.py +1 -2
- nucliadb_models/filters.py +2 -2
- nucliadb_models/hydration.py +4 -3
- nucliadb_models/resource.py +0 -9
- nucliadb_models/search.py +69 -9
- {nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/METADATA +1 -1
- {nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/RECORD +11 -10
- {nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/WHEEL +0 -0
- {nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/top_level.txt +0 -0
nucliadb_models/augment.py
ADDED
@@ -0,0 +1,339 @@
+# Copyright 2025 Bosutech XXI S.L.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from enum import Enum
+from typing import Annotated
+
+from pydantic import BaseModel, Field, StringConstraints, model_validator
+from typing_extensions import Self
+
+from nucliadb_models import filters
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, Resource
+from nucliadb_models.search import Image, ResourceProperties
+
+ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
+ResourceId = Annotated[
+    str,
+    StringConstraints(pattern=ResourceIdPattern, min_length=32, max_length=36),
+]
+
+FieldIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?$"
+FieldId = Annotated[
+    str,
+    StringConstraints(
+        pattern=FieldIdPattern,
+        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0,
+        # max field id of 250
+        max_length=32 + 1 + 1 + 1 + 250 + 1 + 218,
+    ),
+]
+
+ParagraphIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$"
+ParagraphId = Annotated[
+    str,
+    StringConstraints(
+        # resource-uuid/field-type/field-id/[split-id/]paragraph-id
+        pattern=ParagraphIdPattern,
+        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
+        # max field id of 250 and 10 digit paragraphs. More than enough
+        max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
+    ),
+]
+
+
+# Request
+
+
+class ResourceProp(str, Enum):
+    """Superset of former `show` and `extracted` serializations options."""
+
+    # `show` props
+    BASIC = "basic"
+    ORIGIN = "origin"
+    EXTRA = "extra"
+    RELATIONS = "relations"
+    VALUES = "values"
+    ERRORS = "errors"
+    SECURITY = "security"
+    # `extracted` props
+    EXTRACTED_TEXT = "extracted_text"
+    EXTRACTED_METADATA = "extracted_metadata"
+    EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
+    EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
+    EXTRACTED_VECTOR = "extracted_vectors"
+    EXTRACTED_LINK = "extracted_link"
+    EXTRACTED_FILE = "extracted_file"
+    EXTRACTED_QA = "extracted_question_answers"
+    # new granular props
+    TITLE = "title"
+    SUMMARY = "summary"
+    CLASSIFICATION_LABELS = "classification_labels"
+
+    @classmethod
+    def from_show_and_extracted(
+        cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
+    ) -> list["ResourceProp"]:
+        _show_to_prop = {
+            ResourceProperties.BASIC: cls.BASIC,
+            ResourceProperties.ORIGIN: cls.ORIGIN,
+            ResourceProperties.EXTRA: cls.EXTRA,
+            ResourceProperties.RELATIONS: cls.RELATIONS,
+            ResourceProperties.VALUES: cls.VALUES,
+            ResourceProperties.ERRORS: cls.ERRORS,
+            ResourceProperties.SECURITY: cls.SECURITY,
+        }
+        _extracted_to_prop = {
+            ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
+            ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
+            ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
+            ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
+            ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
+            ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
+            ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
+            ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
+        }
+
+        props = []
+        for s in show:
+            show_prop = _show_to_prop.get(s)
+            # show=extracted is not in the dict
+            if show_prop is None:
+                continue
+            props.append(show_prop)
+
+        if ResourceProperties.EXTRACTED in show:
+            for e in extracted:
+                extracted_prop = _extracted_to_prop[e]
+                props.append(extracted_prop)
+
+        return props
+
+
+class AugmentResourceFields(BaseModel):
+    text: bool = False
+    classification_labels: bool = False
+
+    filters: list[filters.Field | filters.Generated]
+
+
+class AugmentResources(BaseModel):
+    given: list[ResourceId]
+
+    # TODO(decoupled-ask): replace this select for bool fields
+    select: list[ResourceProp] = Field(default_factory=list)
+
+    field_type_filter: list[FieldTypeName] | None = Field(
+        default=None,
+        deprecated="Only use this for legacy resource serialization",
+        title="Field type filter",
+        description=(
+            "Define which field types are serialized on resources of search results. "
+            "If omitted and legacy serialization is used, all field types will be serialized"
+        ),
+    )
+
+    fields: AugmentResourceFields | None = None
+
+    @model_validator(mode="after")
+    def bwc_resource_serialization(self) -> Self:
+        if self.field_type_filter is not None and self.fields is not None:
+            raise ValueError("`field_type_filter` and `fields` are incompatible together")
+
+        return self
+
+
+class AugmentFields(BaseModel):
+    given: list[FieldId]
+
+    text: bool = False
+    classification_labels: bool = False
+    entities: bool = False  # also known as ners
+
+    # When enabled, augment all the messages from the conversation. This is
+    # incompatible with max_conversation_messages defined
+    full_conversation: bool = False
+
+    # When `full` disbled, this option controls the max amount of messages to be
+    # augmented. This number will be a best-effort window centered around the
+    # selected message. In addition, the 1st message of the conversation will
+    # always be included.
+    #
+    # This option is combinable with attachments.
+    max_conversation_messages: int | None = None
+
+    # Given a message, if it's a question, try to find an answer. Otherwise,
+    # return a window of messages following the requested one.
+    #
+    # This was previously done without explicit user consent, now it's an option.
+    conversation_answer_or_messages_after: bool = False
+
+    # Both attachment options will only add attachments for the full or the 1st
+    # + window, not answer nor messages after
+
+    # include conversation text attachments
+    conversation_text_attachments: bool = False
+    # include conversation image attachments
+    conversation_image_attachments: bool = False
+
+    @model_validator(mode="after")
+    def validate_cross_options(self):
+        if self.full_conversation and self.max_conversation_messages is not None:
+            raise ValueError(
+                "`full_conversation` and `max_conversation_messages` are not compatible together"
+            )
+        if (
+            (self.conversation_text_attachments or self.conversation_image_attachments)
+            and self.full_conversation is False
+            and self.max_conversation_messages is None
+        ):
+            raise ValueError(
+                "Attachments are only compatible with `full_conversation` and `max_conversation_messages`"
+            )
+        return self
+
+
+# TODO(decoupled-ask): remove unused metadata
+class ParagraphMetadata(BaseModel):
+    field_labels: list[str]
+    paragraph_labels: list[str]
+
+    is_an_image: bool
+    is_a_table: bool
+
+    # for extracted from visual content (ocr, inception, tables)
+    source_file: str | None
+
+    # for documents (pdf, docx...) only
+    page: int | None
+    in_page_with_visual: bool | None
+
+
+class AugmentParagraph(BaseModel):
+    id: ParagraphId
+    metadata: ParagraphMetadata | None = None
+
+
+class AugmentParagraphs(BaseModel):
+    given: list[AugmentParagraph]
+
+    text: bool = True
+
+    neighbours_before: int = 0
+    neighbours_after: int = 0
+
+    # TODO(decoupled-ask): implement image strategy
+    # paragraph extracted from an image, return an image
+    source_image: bool = False
+
+    # TODO(decoupled-ask): implement image strategy
+    # paragraph extracted from a table, return table image
+    table_image: bool = False
+
+    # TODO(decoupled-ask): implement image strategy
+    # return page_preview instead of table image if table image enabled
+    table_prefers_page_preview: bool = False
+
+    # TODO(decoupled-ask): implement image strategy
+    # paragraph from a page, return page preview image
+    page_preview_image: bool = False
+
+
+class AugmentRequest(BaseModel):
+    resources: AugmentResources | None = None
+    fields: AugmentFields | None = None
+    paragraphs: AugmentParagraphs | None = None
+
+
+# Response
+
+
+class AugmentedParagraph(BaseModel):
+    text: str | None = None
+
+    neighbours_before: list[ParagraphId] | None = None
+    neighbours_after: list[ParagraphId] | None = None
+
+    image: Image | None = None
+
+
+class AugmentedField(BaseModel):
+    text: str | None = None
+
+    classification_labels: dict[str, list[str]] | None = None
+
+    # former ners
+    entities: dict[str, list[str]] | None = None
+
+    page_preview_image: Image | None = None
+
+
+class AugmentedConversationMessage(BaseModel):
+    ident: str
+    text: str | None = None
+    attachments: list[FieldId] | None = None
+
+
+class AugmentedConversationField(BaseModel):
+    classification_labels: dict[str, list[str]] | None = None
+    # former ners
+    entities: dict[str, list[str]] | None = None
+
+    messages: list[AugmentedConversationMessage] | None = None
+
+    @property
+    def text(self) -> str | None:
+        """Syntactic sugar to access aggregate text from all messages"""
+        if self.messages is None:
+            return None
+
+        text = ""
+        for message in self.messages:
+            text += message.text or ""
+
+        return text or None
+
+    @property
+    def attachments(self) -> list[FieldId] | None:
+        """Syntactic sugar to access the aggregate of attachments from all messages."""
+        if self.messages is None:
+            return None
+
+        has_attachments = False
+        attachments = []
+        for message in self.messages:
+            if message.attachments is None:
+                continue
+            has_attachments = True
+            attachments.extend(message.attachments)
+
+        if has_attachments:
+            return attachments
+        else:
+            return None
+
+
+class AugmentedResource(Resource):
+    classification_labels: dict[str, list[str]] | None = None
+
+    def updated_from(self, origin: Resource):
+        for key in origin.model_fields.keys():
+            self.__setattr__(key, getattr(origin, key))
+
+
+class AugmentResponse(BaseModel):
+    resources: dict[ResourceId, AugmentedResource]
+    fields: dict[FieldId, AugmentedField | AugmentedConversationField]
+    paragraphs: dict[ParagraphId, AugmentedParagraph]
nucliadb_models/common.py
CHANGED
@@ -108,7 +108,7 @@ class File(BaseModel):
         if self.md5 is None:
             # In case md5 is not supplied, compute it
             try:
-                result = hashlib.md5(base64.b64decode(self.payload))
+                result = hashlib.md5(base64.b64decode(self.payload), usedforsecurity=False)
                 self.md5 = result.hexdigest()
             except Exception:
                 raise ValueError("MD5 could not be computed")
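The only change here is the `usedforsecurity=False` flag, supported by `hashlib` constructors since Python 3.9: it declares the digest an integrity checksum rather than a security primitive, so the call keeps working on FIPS-restricted interpreters that reject plain `hashlib.md5()`. A standalone sketch of the same computation with a made-up payload:

import base64
import hashlib

payload_b64 = base64.b64encode(b"example file contents").decode()

# Same call as the patched File validator: checksum only, not a security primitive
md5 = hashlib.md5(base64.b64decode(payload_b64), usedforsecurity=False).hexdigest()
print(md5)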
nucliadb_models/conversation.py
CHANGED
@@ -86,7 +86,7 @@ class FieldConversation(BaseModel):
 
 
 class InputMessageContent(BaseModel):
-    text: str = Field(
+    text: str = Field()
     format: MessageFormat = MessageFormat.PLAIN
     attachments: List[FileB64] = Field(default=[], max_length=50)
     attachments_fields: List[FieldRef] = Field(default=[], max_length=50)
@@ -129,7 +129,6 @@ class InputConversationField(BaseModel):
     messages: List[InputMessage] = Field(
         default_factory=list,
         description="List of messages in the conversation field. Each message must have a unique ident. A single conversation can contain up to 51,200 messages. You can add up to 2,048 messages per request.",
-        max_length=2048,
     )
     extract_strategy: Optional[str] = Field(
         default=None,
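After this change `text` is a plain required string (the arguments of the removed `Field(...)` call are truncated in the diff above), and the 2,048-message cap is no longer enforced by the model even though the field description still mentions it. A minimal sketch, assuming `InputMessageContent` can be imported directly from `nucliadb_models.conversation`:

from nucliadb_models.conversation import InputMessageContent

# `text` is now a plain required str; format and attachment fields keep their defaults
content = InputMessageContent(text="Hello!")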
nucliadb_models/filters.py
CHANGED
@@ -90,7 +90,7 @@ class Resource(FilterProp, extra="forbid"):
         try:
             UUID(v)
         except ValueError:
-            raise ValueError("
+            raise ValueError(f"resource id filter '{v}' should be a valid UUID")
         return v
 
     @model_validator(mode="after")
@@ -276,7 +276,7 @@ class Generated(FilterProp, extra="forbid"):
     by: Literal["data-augmentation"] = pydantic.Field(
        description="Generator for this field. Currently, only data-augmentation is supported"
    )
-    da_task: Optional[
+    da_task: Optional[str] = pydantic.Field(
        default=None, description="Matches field generated by an specific DA task, given its prefix"
    )
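In the second hunk, `da_task` becomes a plain optional string that matches generated fields whose Data Augmentation task id starts with the given prefix. A library-independent sketch of that prefix semantics (the task ids below are illustrative, not taken from the package):

# Illustrative only: how a prefix filter on DA task ids behaves
generated_field_task_ids = ["labeler-resources-abc123", "labeler-text-def456", "summarizer-xyz"]

da_task_prefix = "labeler"  # hypothetical value for the `da_task` filter
matching = [task for task in generated_field_task_ids if task.startswith(da_task_prefix)]
print(matching)  # ['labeler-resources-abc123', 'labeler-text-def456']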
nucliadb_models/hydration.py
CHANGED
@@ -246,10 +246,11 @@ class Hydration(BaseModel, extra="forbid"):
 ParagraphId = Annotated[
     str,
     StringConstraints(
-
-
+        # resource-uuid/field-type/field-id/[split-id/]paragraph-id
+        pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
+        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
         # max field id of 250 and 10 digit paragraphs. More than enough
-        max_length=32 + 1 + 1 + 1 + 250 + 1 + 21,
+        max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
     ),
 ]
 
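The updated constraint adds the optional split component (up to 128 characters) to both the pattern and the maximum length, matching the definition in the new augment.py. A sketch that exercises the same constraint, restated here for illustration with pydantic v2's `TypeAdapter`:

from typing import Annotated

from pydantic import StringConstraints, TypeAdapter

# Restates the updated constraint from the diff above, for illustration only
ParagraphId = Annotated[
    str,
    StringConstraints(
        pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
        max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
    ),
]

adapter = TypeAdapter(ParagraphId)
# A paragraph id carrying an optional split component ("mysplit") now validates
adapter.validate_python("0123456789abcdef0123456789abcdef/c/chat/mysplit/0-120")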
nucliadb_models/resource.py
CHANGED
@@ -72,15 +72,6 @@ class ExtractedDataTypeName(str, Enum):
     QA = "question_answers"
 
 
-class ReleaseChannel(str, Enum):
-    """
-    Deprecated. No longer used.
-    """
-
-    STABLE = "STABLE"
-    EXPERIMENTAL = "EXPERIMENTAL"
-
-
 class KnowledgeBoxConfig(BaseModel):
     slug: Optional[SlugString] = Field(
         default=None, title="Slug", description="Slug for the Knowledge Box."
nucliadb_models/search.py
CHANGED
@@ -15,6 +15,7 @@
 import json
 from enum import Enum
 from typing import Any, Literal, Optional, Union
+from uuid import UUID
 
 from pydantic import BaseModel, Field, field_validator, model_validator
 from pydantic.aliases import AliasChoices
@@ -347,10 +348,12 @@ SortOrderMap = {
 
 class SortOptions(BaseModel):
     field: SortField
-    limit: Optional[int] = Field(None, gt=0)
     order: SortOrder = SortOrder.DESC
 
 
+MAX_RANK_FUSION_WINDOW = 500
+
+
 class RankFusionName(str, Enum):
     RECIPROCAL_RANK_FUSION = "rrf"
 
@@ -380,7 +383,7 @@ class ReciprocalRankFusion(_BaseRankFusion):
     )
     window: Optional[int] = Field(
         default=None,
-        le=
+        le=MAX_RANK_FUSION_WINDOW,
         title="RRF window",
         description="Number of elements for retrieval to do RRF. Window must be greater or equal to top_k. Greater values will increase probability of multi match at cost of retrieval time",  # noqa: E501
     )
@@ -503,10 +506,18 @@ class SearchParamDefaults:
     )
     top_k = ParamDefault(
         default=20,
+        gt=-1,
         le=200,
         title="Top k",
         description="The number of results search should return. The maximum number of results allowed is 200.",
     )
+    offset = ParamDefault(
+        default=0,
+        gt=-1,
+        le=1000,
+        title="Results offset",
+        description="The number of results to skip, starting from the beginning in sort order. Used for pagination. It can only be used with the keyword and fulltext indexes.",
+    )
     highlight = ParamDefault(
         default=False,
         title="Highlight",
@@ -532,12 +543,6 @@ class SearchParamDefaults:
         title="Sort order",
         description="Order to sort results with",
     )
-    sort_limit = ParamDefault(
-        default=None,
-        title="Sort limit",
-        description="",
-        gt=0,
-    )
     sort_field = ParamDefault(
         default=None,
         title="Sort field",
@@ -927,6 +932,13 @@ Please return ONLY the question without any explanation. Just the rephrased ques
         values["top_k"] = SearchParamDefaults.top_k.default
         return values
 
+    @field_validator("resource_filters", mode="after")
+    def validate_resource_filters(cls, values: list[str]) -> list[str]:
+        if values is not None:
+            for v in values:
+                _validate_resource_filter(v)
+        return values
+
 
 class SearchRequest(BaseSearchRequest):
     features: list[SearchOptions] = SearchParamDefaults.search_features.to_pydantic_field(
@@ -938,12 +950,32 @@ class SearchRequest(BaseSearchRequest):
     )
     faceted: list[str] = SearchParamDefaults.faceted.to_pydantic_field()
     sort: Optional[SortOptions] = SearchParamDefaults.sort.to_pydantic_field()
+    offset: int = SearchParamDefaults.offset.to_pydantic_field()
 
     @field_validator("faceted")
     @classmethod
     def nested_facets_not_supported(cls, facets):
         return validate_facets(facets)
 
+    @model_validator(mode="after")
+    def offset_sort_only_on_keyword_indexes(self):
+        has_non_keyword_indexes = set(self.features) & {SearchOptions.SEMANTIC, SearchOptions.RELATIONS}
+        if has_non_keyword_indexes:
+            if self.offset > 0:
+                raise ValueError("offset cannot be used with the semantic or relations index")
+            if self.sort and self.sort.field != SortField.SCORE:
+                raise ValueError("sort by date cannot be used with the semantic or relations index")
+
+        return self
+
+    @field_validator("sort", mode="after")
+    @classmethod
+    def sorting_by_title_not_supported(cls, value: Optional[SortOptions]) -> Optional[SortOptions]:
+        if value and value.field == SortField.TITLE:
+            raise ValueError("sorting by title not supported in /search")
+
+        return value
+
 
 class Author(str, Enum):
     NUCLIA = "NUCLIA"
@@ -1836,6 +1868,13 @@ Using this feature also disables the `citations` parameter. For maximal accuracy
         self.context = None
         return self
 
+    @field_validator("resource_filters", mode="after")
+    def validate_resource_filters(cls, values: list[str]) -> list[str]:
+        if values is not None:
+            for v in values:
+                _validate_resource_filter(v)
+        return values
+
 
 # Alias (for backwards compatiblity with testbed)
 class ChatRequest(AskRequest):
@@ -2038,8 +2077,10 @@ class FindResource(Resource):
     fields: dict[str, FindField]
 
     def updated_from(self, origin: Resource):
+        find_resource_model_fields = self.model_fields.keys()
         for key in origin.model_fields.keys():
-
+            if key in find_resource_model_fields:
+                self.__setattr__(key, getattr(origin, key))
 
 
 class KnowledgeboxFindResults(JsonBaseModel):
@@ -2500,3 +2541,22 @@ class CatalogFacetsRequest(BaseModel):
 
 class CatalogFacetsResponse(BaseModel):
     facets: dict[str, int]
+
+
+def _validate_resource_filter(v: str):
+    parts = v.split("/")
+
+    rid = parts[0]
+    try:
+        UUID(rid)
+    except ValueError:
+        raise ValueError(f"resource id filter '{rid}' should be a valid UUID")
+
+    if len(parts) > 1:
+        field_type = parts[1]
+        try:
+            FieldTypeName.from_abbreviation(field_type)
+        except KeyError:  # pragma: no cover
+            raise ValueError(
+                f"resource filter {v} has an invalid field type: {field_type}",
+            )
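Taken together, the /search request gains an `offset` parameter (0 to 1000) for paginating keyword and fulltext results, validators that reject `offset` and non-score sorting when semantic or relations features are requested, and UUID/field-type validation of `resource_filters`. A hypothetical request payload showing an allowed combination (the feature names "keyword"/"fulltext" are assumed from the parameter description; they do not appear in this diff):

# Hypothetical /search payload sketch: keyword-only retrieval paginated with the new offset
payload = {
    "query": "quarterly report",
    "features": ["keyword"],  # adding "semantic" or "relations" would make offset > 0 invalid
    "top_k": 20,
    "offset": 40,  # skip the first 40 results; capped at 1000
    "resource_filters": [
        # must start with a valid resource UUID, optionally followed by a field type abbreviation
        "0123456789abcdef0123456789abcdef"
    ],
}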
{nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/RECORD
RENAMED
@@ -1,23 +1,24 @@
 nucliadb_models/__init__.py,sha256=3y8-htogKuCZcbhaUZdSjTeEjUSeec9aRWyL8AlKCyM,1077
-nucliadb_models/
+nucliadb_models/augment.py,sha256=b-A_qF3-MoIPTrK0ZjsGxV6mJA-TZqnHrSJT0vVnvqQ,10970
+nucliadb_models/common.py,sha256=gVG5kOhOwQZR-t5n3b9-hANMlLy2CHelUU5PPUf3bck,8192
 nucliadb_models/configuration.py,sha256=BBrJsNjP324Cw_5J3dBrGwvpkHQYbXEo3TUaI9IqAOg,2449
 nucliadb_models/content_types.py,sha256=36Ga-iGf4ivCqgtXC7imFgegrwHB117s9eqP62JtGv0,3456
-nucliadb_models/conversation.py,sha256=
+nucliadb_models/conversation.py,sha256=k9bKhkDiqhqmdrDfDPNoUfG7-2H_-KAyuOnETd8zV0E,5081
 nucliadb_models/entities.py,sha256=i-7Y8qmFRRTih5zw0ajv1U_iiXexe66M3TK8hUikQZk,2356
 nucliadb_models/export_import.py,sha256=mNm9IArOLnC6TLupkwqVFhxD5d08mpIVOVFneECv8UA,1073
 nucliadb_models/external_index_providers.py,sha256=pL3leo4MkuJOnKlU1Sg6GT_mnK_VUBxGui-RPmDYVWU,1126
 nucliadb_models/extracted.py,sha256=Owz7LC3le3Dvau3TtRiO8NY84meOf6IxN-RrOqqpMPs,5593
 nucliadb_models/file.py,sha256=tXtgB9c7i2ADsnJ7HdbXyroAmXadGvOeA49htBh7BZo,2263
-nucliadb_models/filters.py,sha256=
-nucliadb_models/hydration.py,sha256=
+nucliadb_models/filters.py,sha256=MlyG3mWyOro90PLzzczVfyGrqRBD8RS1OMjVejm_tQ4,14793
+nucliadb_models/hydration.py,sha256=SlAzraJE6DX0uOpZWxu2k_9-ikYorsj0t8xwsWSBQZY,14363
 nucliadb_models/labels.py,sha256=9zqRgkpZuX3kUPwsTTgCH7JyOWK7dM5pwyuHJR86YdU,3949
 nucliadb_models/link.py,sha256=PF5hHLwdOed5TMBTxtokkgWtMh1bFnORZjybh0NwVCw,2526
 nucliadb_models/metadata.py,sha256=OOKGy_83NtlG1QKQZEwMuwu4wbVEe7P30Y2QvnGSDto,8933
 nucliadb_models/notifications.py,sha256=mna8-AoD_29Wds0Thl0AF0zpERnJmYGLZX1w1fUopMY,4036
 nucliadb_models/processing.py,sha256=nhKuHQjqCdb9zJVkYGPTLub23tK9e_lwL5OCDVymZjY,719
 nucliadb_models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nucliadb_models/resource.py,sha256=
-nucliadb_models/search.py,sha256=
+nucliadb_models/resource.py,sha256=k_8I_5cN4hrWEFfzpgxwKJmKfqTfnPfFH1drAXLUjKc,8881
+nucliadb_models/search.py,sha256=O7TA_U0ZUEdQbKAXxAQhmZHhv5kK8hU1wSrt14JCpPg,98669
 nucliadb_models/security.py,sha256=opxaDLfvk3aU0sjesK0jGrYLx5h4YCwlKKN0moYs_ig,1150
 nucliadb_models/synonyms.py,sha256=afbaVqSQSxGLwi2PusVaLSRpkOtA5AZmWOKd1f4nl2E,690
 nucliadb_models/text.py,sha256=60bxZnOjRHnDdezR8VfR3AZsXTOwePFPs2BKB8wxBak,3414
@@ -33,7 +34,7 @@ nucliadb_models/graph/responses.py,sha256=Sdq8OgFAL1YT-1lJyLLrkqcScvj7YTEqAUwQ-k
 nucliadb_models/internal/__init__.py,sha256=zG33bUz1rHFPtvqQPWn4rDwBJt3FJodGuQYD45quiQg,583
 nucliadb_models/internal/predict.py,sha256=Pnx6MmLfK65eExe1XnVxqmSlvMwdowewwks9BOEoqMw,2029
 nucliadb_models/internal/shards.py,sha256=__y1OZtWGiNcPQEWfSFOj8yw458WGi7mM4vZe0K-L1Y,1691
-nucliadb_models-6.9.
-nucliadb_models-6.9.
-nucliadb_models-6.9.
-nucliadb_models-6.9.
+nucliadb_models-6.9.7.post5583.dist-info/METADATA,sha256=cMtwMi17BpKGWqA1DpJCeDYb1Lr1gMSXcv3aVFZcrm8,745
+nucliadb_models-6.9.7.post5583.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nucliadb_models-6.9.7.post5583.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
+nucliadb_models-6.9.7.post5583.dist-info/RECORD,,
{nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/WHEEL
RENAMED
File without changes
{nucliadb_models-6.9.2.post5276.dist-info → nucliadb_models-6.9.7.post5583.dist-info}/top_level.txt
RENAMED
File without changes