nucliadb-models 6.8.1.post4983__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb-models might be problematic. Click here for more details.
- nucliadb_models/agents/ingestion.py +4 -4
- nucliadb_models/augment.py +359 -0
- nucliadb_models/common.py +66 -57
- nucliadb_models/configuration.py +9 -9
- nucliadb_models/content_types.py +13 -11
- nucliadb_models/conversation.py +30 -29
- nucliadb_models/entities.py +17 -18
- nucliadb_models/external_index_providers.py +5 -20
- nucliadb_models/extracted.py +82 -83
- nucliadb_models/file.py +10 -11
- nucliadb_models/filters.py +78 -74
- nucliadb_models/graph/requests.py +38 -47
- nucliadb_models/hydration.py +423 -0
- nucliadb_models/internal/predict.py +7 -9
- nucliadb_models/internal/shards.py +2 -3
- nucliadb_models/labels.py +18 -11
- nucliadb_models/link.py +18 -19
- nucliadb_models/metadata.py +80 -53
- nucliadb_models/notifications.py +3 -3
- nucliadb_models/processing.py +1 -2
- nucliadb_models/resource.py +85 -102
- nucliadb_models/retrieval.py +147 -0
- nucliadb_models/search.py +360 -306
- nucliadb_models/security.py +2 -3
- nucliadb_models/text.py +7 -8
- nucliadb_models/trainset.py +1 -2
- nucliadb_models/utils.py +2 -3
- nucliadb_models/vectors.py +2 -5
- nucliadb_models/writer.py +56 -57
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/METADATA +2 -3
- nucliadb_models-6.10.0.post5694.dist-info/RECORD +41 -0
- nucliadb_models-6.8.1.post4983.dist-info/RECORD +0 -38
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/WHEEL +0 -0
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,6 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
#
|
|
15
15
|
from enum import Enum
|
|
16
|
-
from typing import Optional
|
|
17
16
|
|
|
18
17
|
from pydantic import BaseModel, Field
|
|
19
18
|
|
|
@@ -41,11 +40,12 @@ class AgentsFilter(BaseModel):
|
|
|
41
40
|
|
|
42
41
|
|
|
43
42
|
class ResourceAgentsRequest(BaseModel):
|
|
44
|
-
filters:
|
|
43
|
+
filters: list[AgentsFilter] | None = Field(
|
|
44
|
+
title="Resource Agent Filters",
|
|
45
45
|
default=None,
|
|
46
46
|
description="Filters to apply to the agents. If None, all curently configured agents are applied.",
|
|
47
47
|
)
|
|
48
|
-
agent_ids:
|
|
48
|
+
agent_ids: list[str] | None = Field(
|
|
49
49
|
default=None,
|
|
50
50
|
title="An optional list of Data Augmentation Agent IDs to run. If None, all configured agents that match the filters are run.",
|
|
51
51
|
)
|
|
@@ -57,7 +57,7 @@ class NewTextField(BaseModel):
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
class AppliedDataAugmentation(BaseModel):
|
|
60
|
-
qas:
|
|
60
|
+
qas: QuestionAnswers | None = Field(
|
|
61
61
|
default=None,
|
|
62
62
|
description="Question and answers generated by the Question Answers agent",
|
|
63
63
|
)
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
# Copyright 2025 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from typing import Annotated
|
|
18
|
+
|
|
19
|
+
from pydantic import BaseModel, Field, StringConstraints, model_validator
|
|
20
|
+
from typing_extensions import Self
|
|
21
|
+
|
|
22
|
+
from nucliadb_models import filters
|
|
23
|
+
from nucliadb_models.common import FieldTypeName
|
|
24
|
+
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
25
|
+
from nucliadb_models.search import ResourceProperties
|
|
26
|
+
|
|
27
|
+
# A resource id is a lowercase hexadecimal UUID, accepted either in compact
# form (32 hex characters) or dash-separated as 8-4-4-4-12 (36 characters).
ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
ResourceId = Annotated[
    str,
    StringConstraints(
        pattern=ResourceIdPattern,
        # 32 = compact form, 36 = dash-separated form
        min_length=32,
        max_length=36,
    ),
]
|
|
32
|
+
|
|
33
|
+
# resource-uuid/field-type/field-id[/split-id]
# Only the compact 32-character resource uuid is accepted here; the field type
# is one of the single-letter codes a, c, f, t or u.
FieldIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?$"
FieldId = Annotated[
    str,
    StringConstraints(
        pattern=FieldIdPattern,
        # shortest id: uuid + "/" + field type + "/" + 1-char field id, no split
        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0,
        # max field id of 250
        # NOTE(review): the pattern caps the split id at 128 characters and
        # ParagraphId budgets 128 for it, so the trailing 218 looks like a
        # transposition of 128 -- confirm before tightening.
        max_length=32 + 1 + 1 + 1 + 250 + 1 + 218,
    ),
]
|
|
43
|
+
|
|
44
|
+
# resource-uuid/field-type/field-id/[split-id/]paragraph-id
# The paragraph id is the trailing "<digits>-<digits>" pair.
ParagraphIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$"
ParagraphId = Annotated[
    str,
    StringConstraints(
        pattern=ParagraphIdPattern,
        # shortest id: a minimal field id followed by "/" and a 3-char
        # paragraph id such as "0-0"
        min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
        # max field id of 250 and 10 digit paragraphs. More than enough
        # (21 = 10 digits + "-" + 10 digits; 128 is the split id cap)
        max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
    ),
]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Request
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ResourceProp(str, Enum):
    """Superset of former `show` and `extracted` serializations options."""

    # `show` props
    BASIC = "basic"
    ORIGIN = "origin"
    EXTRA = "extra"
    RELATIONS = "relations"
    VALUES = "values"
    ERRORS = "errors"
    SECURITY = "security"
    # `extracted` props
    EXTRACTED_TEXT = "extracted_text"
    EXTRACTED_METADATA = "extracted_metadata"
    EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
    EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
    EXTRACTED_VECTOR = "extracted_vectors"
    EXTRACTED_LINK = "extracted_link"
    EXTRACTED_FILE = "extracted_file"
    EXTRACTED_QA = "extracted_question_answers"
    # new granular props
    TITLE = "title"
    SUMMARY = "summary"
    CLASSIFICATION_LABELS = "classification_labels"

    @classmethod
    def from_show_and_extracted(
        cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
    ) -> list["ResourceProp"]:
        """Translate legacy `show` / `extracted` options into resource props.

        Every `show` entry with a direct equivalent is mapped, in input order.
        The `extracted` entries are appended afterwards, and only when `show`
        contains `ResourceProperties.EXTRACTED`.

        Raises:
            KeyError: if an `extracted` entry has no mapping (only reachable
                when `show` contains `ResourceProperties.EXTRACTED`).
        """
        prop_by_show = {
            ResourceProperties.BASIC: cls.BASIC,
            ResourceProperties.ORIGIN: cls.ORIGIN,
            ResourceProperties.EXTRA: cls.EXTRA,
            ResourceProperties.RELATIONS: cls.RELATIONS,
            ResourceProperties.VALUES: cls.VALUES,
            ResourceProperties.ERRORS: cls.ERRORS,
            ResourceProperties.SECURITY: cls.SECURITY,
        }
        prop_by_extracted = {
            ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
            ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
            ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
            ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
            ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
            ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
            ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
            ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
        }

        # show=extracted is not in the dict, so it is skipped here and only
        # acts as the switch for the extracted props below
        selected = [prop_by_show[item] for item in show if item in prop_by_show]

        if ResourceProperties.EXTRACTED in show:
            selected.extend(prop_by_extracted[item] for item in extracted)

        return selected
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class AugmentResourceFields(BaseModel):
    # Opt-in flags, both disabled by default
    text: bool = False
    classification_labels: bool = False

    # Required list of field / generated filter expressions
    # NOTE(review): presumably selects which fields of each resource are
    # augmented -- confirm against the consumer of this model
    filters: list[filters.Field | filters.Generated]
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class AugmentResources(BaseModel):
    # Ids of the resources to augment; each must satisfy the ResourceId constraints
    given: list[ResourceId]

    # TODO(decoupled-ask): replace this select for bool fields
    select: list[ResourceProp] = Field(default_factory=list)

    field_type_filter: list[FieldTypeName] | None = Field(
        default=None,
        deprecated="Only use this for legacy resource serialization",
        title="Field type filter",
        description=(
            "Define which field types are serialized on resources of search results. "
            "If omitted and legacy serialization is used, all field types will be serialized"
        ),
    )

    fields: AugmentResourceFields | None = None

    @model_validator(mode="after")
    def bwc_resource_serialization(self) -> Self:
        # The legacy `field_type_filter` and the newer `fields` option are
        # mutually exclusive: reject requests that set both.
        legacy_filter_given = self.field_type_filter is not None
        if legacy_filter_given and self.fields is not None:
            raise ValueError("`field_type_filter` and `fields` are incompatible together")

        return self
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class AugmentFields(BaseModel):
    # Ids of the fields to augment; each must satisfy the FieldId constraints
    given: list[FieldId]

    text: bool = False
    classification_labels: bool = False
    entities: bool = False  # also known as ners

    # For file fields, augment the path to the thumbnail image
    file_thumbnail: bool = False

    # When enabled, augment all the messages from the conversation. This is
    # incompatible with setting `max_conversation_messages`
    full_conversation: bool = False

    # When `full_conversation` is disabled, this option controls the max amount
    # of messages to be augmented. This number will be a best-effort window
    # centered around the selected message. In addition, the 1st message of the
    # conversation will always be included.
    #
    # This option is combinable with attachments.
    max_conversation_messages: int | None = None

    # Given a message, if it's a question, try to find an answer. Otherwise,
    # return a window of messages following the requested one.
    #
    # This was previously done without explicit user consent, now it's an option.
    conversation_answer_or_messages_after: bool = False

    # Both attachment options will only add attachments for the full
    # conversation or for the 1st message + window, not for the answer nor the
    # messages after

    # include conversation text attachments
    conversation_text_attachments: bool = False
    # include conversation image attachments
    conversation_image_attachments: bool = False

    @model_validator(mode="after")
    def validate_cross_options(self) -> Self:
        # Either the whole conversation or a bounded window, never both
        if self.full_conversation and self.max_conversation_messages is not None:
            raise ValueError(
                "`full_conversation` and `max_conversation_messages` are not compatible together"
            )

        # Attachments need one of the two conversation modes to be selected
        attachments_requested = (
            self.conversation_text_attachments or self.conversation_image_attachments
        )
        if (
            attachments_requested
            and self.full_conversation is False
            and self.max_conversation_messages is None
        ):
            raise ValueError(
                "Attachments are only compatible with `full_conversation` and `max_conversation_messages`"
            )
        return self
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# TODO(decoupled-ask): remove unused metadata
|
|
212
|
+
class ParagraphMetadata(BaseModel):
    # Every attribute is required: none declares a default, so the nullable
    # ones must still be passed explicitly (as None when unknown)
    field_labels: list[str]
    paragraph_labels: list[str]

    is_an_image: bool
    is_a_table: bool

    # for extracted from visual content (ocr, inception, tables)
    source_file: str | None

    # for documents (pdf, docx...) only
    page: int | None
    in_page_with_visual: bool | None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
class AugmentParagraph(BaseModel):
    # Paragraph to augment; must satisfy the ParagraphId constraints
    id: ParagraphId
    # Optional metadata supplied together with the id
    metadata: ParagraphMetadata | None = None
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
class AugmentParagraphs(BaseModel):
    # Paragraphs to augment
    given: list[AugmentParagraph]

    # Unlike the other augmentation flags, text is enabled by default
    text: bool = True

    neighbours_before: int = 0
    neighbours_after: int = 0

    # paragraph extracted from an image, return an image
    source_image: bool = False

    # paragraph extracted from a table, return table image
    table_image: bool = False

    # return page_preview instead of table image if table image enabled
    table_prefers_page_preview: bool = False

    # paragraph from a page, return page preview image
    page_preview_image: bool = False

    @model_validator(mode="after")
    def table_options_work_together(self) -> Self:
        # `table_prefers_page_preview` only refines `table_image`, so it is
        # rejected when `table_image` is off
        if self.table_prefers_page_preview and not self.table_image:
            raise ValueError("`table_prefers_page_preview` can only be enabled with `table_image`")
        return self
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
class AugmentRequest(BaseModel):
    # One optional section per augmentable level; each defaults to None
    resources: AugmentResources | None = None
    fields: AugmentFields | None = None
    paragraphs: AugmentParagraphs | None = None
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# Response
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class AugmentedParagraph(BaseModel):
    # Every attribute is optional and defaults to None
    text: str | None = None

    # Ids of the surrounding paragraphs
    neighbours_before: list[ParagraphId] | None = None
    neighbours_after: list[ParagraphId] | None = None

    # Image references, as strings
    # NOTE(review): presumably download paths, as documented for
    # AugmentedFileField.thumbnail_image -- confirm
    source_image: str | None = None
    table_image: str | None = None
    page_preview_image: str | None = None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
class AugmentedField(BaseModel):
    # Every attribute is optional and defaults to None
    text: str | None = None

    # Mapping of string keys to lists of strings
    # NOTE(review): presumably labelset -> labels -- confirm
    classification_labels: dict[str, list[str]] | None = None

    # former ners
    entities: dict[str, list[str]] | None = None
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class AugmentedFileField(BaseModel):
    # Same attributes as AugmentedField, plus the two image references below
    text: str | None = None

    classification_labels: dict[str, list[str]] | None = None

    # former ners
    entities: dict[str, list[str]] | None = None

    # TODO(decoupled-ask): implement image strategy
    page_preview_image: str | None = None

    # Path for the download API to retrieve the file thumbnail image
    thumbnail_image: str | None = None
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
class AugmentedConversationMessage(BaseModel):
    # Message identifier; the only required attribute
    ident: str
    text: str | None = None
    # Attachments are referenced by field id
    attachments: list[FieldId] | None = None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class AugmentedConversationField(BaseModel):
    classification_labels: dict[str, list[str]] | None = None
    # former ners
    entities: dict[str, list[str]] | None = None

    messages: list[AugmentedConversationMessage] | None = None

    @property
    def text(self) -> str | None:
        """Syntactic sugar to access aggregate text from all messages"""
        if self.messages is None:
            return None

        # Messages are concatenated in order without a separator; a message
        # without text contributes nothing. An empty aggregate yields None.
        combined = "".join(message.text or "" for message in self.messages)
        return combined or None

    @property
    def attachments(self) -> list[FieldId] | None:
        """Syntactic sugar to access the aggregate of attachments from all messages."""
        if self.messages is None:
            return None

        # Keep only the messages that define attachments. An empty list still
        # counts as "defined", so the result may be [] rather than None.
        per_message = [
            message.attachments for message in self.messages if message.attachments is not None
        ]
        if not per_message:
            return None

        return [attachment for group in per_message for attachment in group]
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class AugmentedResource(Resource):
    classification_labels: dict[str, list[str]] | None = None

    def updated_from(self, origin: Resource) -> None:
        """Copy the value of every model field of `origin` onto this instance.

        The update happens in place and nothing is returned. Attributes that
        are not fields of `origin`'s model are left untouched.
        """
        # Read `model_fields` from the class: accessing it through an instance
        # is deprecated since Pydantic 2.11 and emits a deprecation warning.
        for key in type(origin).model_fields:
            setattr(self, key, getattr(origin, key))
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
class AugmentResponse(BaseModel):
    # All three mappings are required and keyed by the id of the augmented item
    resources: dict[ResourceId, AugmentedResource]
    fields: dict[FieldId, AugmentedField | AugmentedFileField | AugmentedConversationField]
    paragraphs: dict[ParagraphId, AugmentedParagraph]
|
nucliadb_models/common.py
CHANGED
|
@@ -16,7 +16,7 @@ import base64
|
|
|
16
16
|
import hashlib
|
|
17
17
|
import re
|
|
18
18
|
from enum import Enum
|
|
19
|
-
from typing import Any
|
|
19
|
+
from typing import Any
|
|
20
20
|
|
|
21
21
|
from pydantic import (
|
|
22
22
|
BaseModel,
|
|
@@ -38,7 +38,7 @@ FIELD_TYPE_CHAR_MAP = {
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
STORAGE_FILE_MATCH = re.compile(
|
|
41
|
-
r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
|
|
41
|
+
r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
|
|
42
42
|
)
|
|
43
43
|
DOWNLOAD_TYPE_MAP = {"f": "field", "e": "extracted"}
|
|
44
44
|
DOWNLOAD_URI = "/kb/{kbid}/resource/{rid}/{field_type}/{field_id}/download/{download_type}/{key}"
|
|
@@ -50,9 +50,9 @@ class ParamDefault(BaseModel):
|
|
|
50
50
|
default: Any = None
|
|
51
51
|
title: str
|
|
52
52
|
description: str
|
|
53
|
-
le:
|
|
54
|
-
gt:
|
|
55
|
-
max_items:
|
|
53
|
+
le: float | None = None
|
|
54
|
+
gt: float | None = None
|
|
55
|
+
max_items: int | None = None
|
|
56
56
|
deprecated: bool = False
|
|
57
57
|
|
|
58
58
|
def to_pydantic_field(self, default=_NOT_SET, **kw) -> Field: # type: ignore
|
|
@@ -86,13 +86,13 @@ class FieldID(BaseModel):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class File(BaseModel):
|
|
89
|
-
filename:
|
|
89
|
+
filename: str | None = None
|
|
90
90
|
content_type: str = "application/octet-stream"
|
|
91
|
-
payload:
|
|
92
|
-
md5:
|
|
91
|
+
payload: str | None = Field(default=None, description="Base64 encoded file content")
|
|
92
|
+
md5: str | None = None
|
|
93
93
|
# These are to be used for external files
|
|
94
|
-
uri:
|
|
95
|
-
extra_headers:
|
|
94
|
+
uri: str | None = None
|
|
95
|
+
extra_headers: dict[str, str] = {}
|
|
96
96
|
|
|
97
97
|
@model_validator(mode="after")
|
|
98
98
|
def _check_internal_file_fields(self) -> Self:
|
|
@@ -108,7 +108,7 @@ class File(BaseModel):
|
|
|
108
108
|
if self.md5 is None:
|
|
109
109
|
# In case md5 is not supplied, compute it
|
|
110
110
|
try:
|
|
111
|
-
result = hashlib.md5(base64.b64decode(self.payload))
|
|
111
|
+
result = hashlib.md5(base64.b64decode(self.payload), usedforsecurity=False)
|
|
112
112
|
self.md5 = result.hexdigest()
|
|
113
113
|
except Exception:
|
|
114
114
|
raise ValueError("MD5 could not be computed")
|
|
@@ -134,10 +134,10 @@ class FileB64(BaseModel):
|
|
|
134
134
|
|
|
135
135
|
|
|
136
136
|
class CloudFile(BaseModel):
|
|
137
|
-
uri:
|
|
138
|
-
size:
|
|
139
|
-
content_type:
|
|
140
|
-
bucket_name:
|
|
137
|
+
uri: str | None = None
|
|
138
|
+
size: int | None = None
|
|
139
|
+
content_type: str | None = None
|
|
140
|
+
bucket_name: str | None = None
|
|
141
141
|
|
|
142
142
|
class Source(Enum):
|
|
143
143
|
FLAPS = "FLAPS"
|
|
@@ -146,23 +146,23 @@ class CloudFile(BaseModel):
|
|
|
146
146
|
LOCAL = "LOCAL"
|
|
147
147
|
EXTERNAL = "EXTERNAL"
|
|
148
148
|
|
|
149
|
-
source:
|
|
150
|
-
filename:
|
|
151
|
-
resumable_uri:
|
|
152
|
-
offset:
|
|
153
|
-
upload_uri:
|
|
154
|
-
parts:
|
|
155
|
-
old_uri:
|
|
156
|
-
old_bucket:
|
|
157
|
-
md5:
|
|
149
|
+
source: Source | None
|
|
150
|
+
filename: str | None
|
|
151
|
+
resumable_uri: str | None
|
|
152
|
+
offset: int | None
|
|
153
|
+
upload_uri: str | None
|
|
154
|
+
parts: list[str] | None
|
|
155
|
+
old_uri: str | None
|
|
156
|
+
old_bucket: str | None
|
|
157
|
+
md5: str | None
|
|
158
158
|
|
|
159
159
|
|
|
160
160
|
class CloudLink(BaseModel):
|
|
161
|
-
uri:
|
|
162
|
-
size:
|
|
163
|
-
content_type:
|
|
164
|
-
filename:
|
|
165
|
-
md5:
|
|
161
|
+
uri: str | None = None
|
|
162
|
+
size: int | None = None
|
|
163
|
+
content_type: str | None = None
|
|
164
|
+
filename: str | None = None
|
|
165
|
+
md5: str | None = None
|
|
166
166
|
|
|
167
167
|
@staticmethod
|
|
168
168
|
def format_reader_download_uri(uri: str) -> str:
|
|
@@ -203,16 +203,25 @@ class FieldTypeName(str, Enum):
|
|
|
203
203
|
"a": FieldTypeName.GENERIC,
|
|
204
204
|
}[abbr]
|
|
205
205
|
|
|
206
|
+
def abbreviation(self) -> str:
|
|
207
|
+
return {
|
|
208
|
+
FieldTypeName.TEXT: "t",
|
|
209
|
+
FieldTypeName.FILE: "f",
|
|
210
|
+
FieldTypeName.LINK: "u",
|
|
211
|
+
FieldTypeName.CONVERSATION: "c",
|
|
212
|
+
FieldTypeName.GENERIC: "a",
|
|
213
|
+
}[self]
|
|
214
|
+
|
|
206
215
|
|
|
207
216
|
class FieldRef(BaseModel):
|
|
208
217
|
field_type: FieldTypeName
|
|
209
218
|
field_id: str
|
|
210
|
-
split:
|
|
219
|
+
split: str | None = None
|
|
211
220
|
|
|
212
221
|
|
|
213
222
|
class Classification(BaseModel):
|
|
214
|
-
labelset: str
|
|
215
|
-
label: str
|
|
223
|
+
labelset: str = Field(title="The ID of the labelset")
|
|
224
|
+
label: str = Field(title="The label assigned from the labelset")
|
|
216
225
|
|
|
217
226
|
|
|
218
227
|
class UserClassification(Classification):
|
|
@@ -220,19 +229,19 @@ class UserClassification(Classification):
|
|
|
220
229
|
|
|
221
230
|
|
|
222
231
|
class Sentence(BaseModel):
|
|
223
|
-
start:
|
|
224
|
-
end:
|
|
225
|
-
key:
|
|
232
|
+
start: int | None = None
|
|
233
|
+
end: int | None = None
|
|
234
|
+
key: str | None = None
|
|
226
235
|
|
|
227
236
|
|
|
228
237
|
class PageInformation(BaseModel):
|
|
229
|
-
page:
|
|
230
|
-
page_with_visual:
|
|
238
|
+
page: int | None = Field(default=None, title="Page Information Page")
|
|
239
|
+
page_with_visual: bool | None = None
|
|
231
240
|
|
|
232
241
|
|
|
233
242
|
class Representation(BaseModel):
|
|
234
|
-
is_a_table:
|
|
235
|
-
reference_file:
|
|
243
|
+
is_a_table: bool | None = None
|
|
244
|
+
reference_file: str | None = None
|
|
236
245
|
|
|
237
246
|
|
|
238
247
|
class ParagraphRelations(BaseModel):
|
|
@@ -242,10 +251,10 @@ class ParagraphRelations(BaseModel):
|
|
|
242
251
|
|
|
243
252
|
|
|
244
253
|
class Paragraph(BaseModel):
|
|
245
|
-
start:
|
|
246
|
-
end:
|
|
247
|
-
start_seconds:
|
|
248
|
-
end_seconds:
|
|
254
|
+
start: int | None = None
|
|
255
|
+
end: int | None = None
|
|
256
|
+
start_seconds: list[int] | None = None
|
|
257
|
+
end_seconds: list[int] | None = None
|
|
249
258
|
|
|
250
259
|
class TypeParagraph(str, Enum):
|
|
251
260
|
TEXT = "TEXT"
|
|
@@ -256,35 +265,35 @@ class Paragraph(BaseModel):
|
|
|
256
265
|
TITLE = "TITLE"
|
|
257
266
|
TABLE = "TABLE"
|
|
258
267
|
|
|
259
|
-
kind:
|
|
260
|
-
classifications:
|
|
261
|
-
sentences:
|
|
262
|
-
key:
|
|
263
|
-
page:
|
|
264
|
-
representation:
|
|
265
|
-
relations:
|
|
268
|
+
kind: TypeParagraph | None = None
|
|
269
|
+
classifications: list[Classification] | None = None
|
|
270
|
+
sentences: list[Sentence] | None = None
|
|
271
|
+
key: str | None = None
|
|
272
|
+
page: PageInformation | None = None
|
|
273
|
+
representation: Representation | None = None
|
|
274
|
+
relations: ParagraphRelations | None = None
|
|
266
275
|
|
|
267
276
|
|
|
268
277
|
class Shards(BaseModel):
|
|
269
|
-
shards:
|
|
278
|
+
shards: list[str] | None = None
|
|
270
279
|
|
|
271
280
|
|
|
272
281
|
class Question(BaseModel):
|
|
273
282
|
text: str
|
|
274
|
-
language:
|
|
275
|
-
ids_paragraphs:
|
|
283
|
+
language: str | None = None
|
|
284
|
+
ids_paragraphs: list[str]
|
|
276
285
|
|
|
277
286
|
|
|
278
287
|
class Answer(BaseModel):
|
|
279
288
|
text: str
|
|
280
|
-
language:
|
|
281
|
-
ids_paragraphs:
|
|
289
|
+
language: str | None = None
|
|
290
|
+
ids_paragraphs: list[str]
|
|
282
291
|
|
|
283
292
|
|
|
284
293
|
class QuestionAnswer(BaseModel):
|
|
285
294
|
question: Question
|
|
286
|
-
answers:
|
|
295
|
+
answers: list[Answer]
|
|
287
296
|
|
|
288
297
|
|
|
289
298
|
class QuestionAnswers(BaseModel):
|
|
290
|
-
question_answer:
|
|
299
|
+
question_answer: list[QuestionAnswer]
|
nucliadb_models/configuration.py
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
#
|
|
15
15
|
|
|
16
16
|
import warnings
|
|
17
|
-
from typing import Annotated, Any, Literal
|
|
17
|
+
from typing import Annotated, Any, Literal
|
|
18
18
|
|
|
19
19
|
from pydantic import BaseModel, Field, create_model
|
|
20
20
|
|
|
@@ -28,11 +28,11 @@ class KBConfiguration(BaseModel):
|
|
|
28
28
|
super().__init__(**data)
|
|
29
29
|
|
|
30
30
|
# Do not touch this model synced on Processing side
|
|
31
|
-
semantic_model:
|
|
32
|
-
generative_model:
|
|
33
|
-
ner_model:
|
|
34
|
-
anonymization_model:
|
|
35
|
-
visual_labeling:
|
|
31
|
+
semantic_model: str | None = None
|
|
32
|
+
generative_model: str | None = None
|
|
33
|
+
ner_model: str | None = None
|
|
34
|
+
anonymization_model: str | None = None
|
|
35
|
+
visual_labeling: str | None = None
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
#
|
|
@@ -44,7 +44,7 @@ def _model_fields(model: type[BaseModel], skip: list[str]) -> dict[str, Any]:
|
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
# FindConfig is a
|
|
47
|
+
# FindConfig is a FindRequest without `search_configuration`
|
|
48
48
|
FindConfig = create_model("FindConfig", **_model_fields(FindRequest, skip=["search_configuration"]))
|
|
49
49
|
|
|
50
50
|
|
|
@@ -57,7 +57,7 @@ class FindSearchConfiguration(BaseModel):
|
|
|
57
57
|
AskConfig = create_model(
|
|
58
58
|
"AskConfig",
|
|
59
59
|
**_model_fields(AskRequest, skip=["query", "search_configuration"]),
|
|
60
|
-
query=(
|
|
60
|
+
query=(str | None, None),
|
|
61
61
|
)
|
|
62
62
|
|
|
63
63
|
|
|
@@ -67,7 +67,7 @@ class AskSearchConfiguration(BaseModel):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
SearchConfiguration = Annotated[
|
|
70
|
-
|
|
70
|
+
FindSearchConfiguration | AskSearchConfiguration, Field(discriminator="kind")
|
|
71
71
|
]
|
|
72
72
|
|
|
73
73
|
# We need this to avoid issues with pydantic and generic types defined in another module
|