nucliadb-models 6.9.5.post5452__py3-none-any.whl → 6.10.0.post5694__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb-models might be problematic. Click here for more details.
- nucliadb_models/agents/ingestion.py +4 -4
- nucliadb_models/augment.py +297 -23
- nucliadb_models/common.py +57 -57
- nucliadb_models/configuration.py +8 -8
- nucliadb_models/content_types.py +13 -11
- nucliadb_models/conversation.py +25 -26
- nucliadb_models/entities.py +17 -18
- nucliadb_models/external_index_providers.py +1 -2
- nucliadb_models/extracted.py +82 -83
- nucliadb_models/file.py +10 -11
- nucliadb_models/filters.py +78 -74
- nucliadb_models/graph/requests.py +38 -47
- nucliadb_models/hydration.py +48 -50
- nucliadb_models/internal/predict.py +7 -9
- nucliadb_models/internal/shards.py +2 -3
- nucliadb_models/labels.py +18 -11
- nucliadb_models/link.py +18 -19
- nucliadb_models/metadata.py +65 -53
- nucliadb_models/notifications.py +3 -3
- nucliadb_models/processing.py +1 -2
- nucliadb_models/resource.py +85 -102
- nucliadb_models/retrieval.py +147 -0
- nucliadb_models/search.py +266 -276
- nucliadb_models/security.py +2 -3
- nucliadb_models/text.py +7 -8
- nucliadb_models/trainset.py +1 -2
- nucliadb_models/utils.py +2 -3
- nucliadb_models/vectors.py +2 -5
- nucliadb_models/writer.py +56 -57
- {nucliadb_models-6.9.5.post5452.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/METADATA +1 -1
- nucliadb_models-6.10.0.post5694.dist-info/RECORD +41 -0
- nucliadb_models-6.9.5.post5452.dist-info/RECORD +0 -40
- {nucliadb_models-6.9.5.post5452.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/WHEEL +0 -0
- {nucliadb_models-6.9.5.post5452.dist-info → nucliadb_models-6.10.0.post5694.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,6 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
#
|
|
15
15
|
from enum import Enum
|
|
16
|
-
from typing import Optional
|
|
17
16
|
|
|
18
17
|
from pydantic import BaseModel, Field
|
|
19
18
|
|
|
@@ -41,11 +40,12 @@ class AgentsFilter(BaseModel):
|
|
|
41
40
|
|
|
42
41
|
|
|
43
42
|
class ResourceAgentsRequest(BaseModel):
|
|
44
|
-
filters:
|
|
43
|
+
filters: list[AgentsFilter] | None = Field(
|
|
44
|
+
title="Resource Agent Filters",
|
|
45
45
|
default=None,
|
|
46
46
|
description="Filters to apply to the agents. If None, all curently configured agents are applied.",
|
|
47
47
|
)
|
|
48
|
-
agent_ids:
|
|
48
|
+
agent_ids: list[str] | None = Field(
|
|
49
49
|
default=None,
|
|
50
50
|
title="An optional list of Data Augmentation Agent IDs to run. If None, all configured agents that match the filters are run.",
|
|
51
51
|
)
|
|
@@ -57,7 +57,7 @@ class NewTextField(BaseModel):
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
class AppliedDataAugmentation(BaseModel):
|
|
60
|
-
qas:
|
|
60
|
+
qas: QuestionAnswers | None = Field(
|
|
61
61
|
default=None,
|
|
62
62
|
description="Question and answers generated by the Question Answers agent",
|
|
63
63
|
)
|
nucliadb_models/augment.py
CHANGED
|
@@ -13,45 +13,220 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
#
|
|
15
15
|
|
|
16
|
-
from
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from typing import Annotated
|
|
17
18
|
|
|
19
|
+
from pydantic import BaseModel, Field, StringConstraints, model_validator
|
|
20
|
+
from typing_extensions import Self
|
|
21
|
+
|
|
22
|
+
from nucliadb_models import filters
|
|
18
23
|
from nucliadb_models.common import FieldTypeName
|
|
19
24
|
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
20
|
-
from nucliadb_models.search import
|
|
25
|
+
from nucliadb_models.search import ResourceProperties
|
|
21
26
|
|
|
22
|
-
|
|
27
|
+
ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
|
|
28
|
+
ResourceId = Annotated[
|
|
29
|
+
str,
|
|
30
|
+
StringConstraints(pattern=ResourceIdPattern, min_length=32, max_length=36),
|
|
31
|
+
]
|
|
23
32
|
|
|
33
|
+
FieldIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?$"
|
|
34
|
+
FieldId = Annotated[
|
|
35
|
+
str,
|
|
36
|
+
StringConstraints(
|
|
37
|
+
pattern=FieldIdPattern,
|
|
38
|
+
min_length=32 + 1 + 1 + 1 + 1 + 0 + 0,
|
|
39
|
+
# max field id of 250
|
|
40
|
+
max_length=32 + 1 + 1 + 1 + 250 + 1 + 218,
|
|
41
|
+
),
|
|
42
|
+
]
|
|
24
43
|
|
|
25
|
-
|
|
26
|
-
|
|
44
|
+
ParagraphIdPattern = r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$"
|
|
45
|
+
ParagraphId = Annotated[
|
|
46
|
+
str,
|
|
47
|
+
StringConstraints(
|
|
48
|
+
# resource-uuid/field-type/field-id/[split-id/]paragraph-id
|
|
49
|
+
pattern=ParagraphIdPattern,
|
|
50
|
+
min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
|
|
51
|
+
# max field id of 250 and 10 digit paragraphs. More than enough
|
|
52
|
+
max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
|
|
53
|
+
),
|
|
54
|
+
]
|
|
27
55
|
|
|
28
|
-
neighbours_before: dict[ParagraphId, str] | None = None
|
|
29
|
-
neighbours_after: dict[ParagraphId, str] | None = None
|
|
30
56
|
|
|
31
|
-
|
|
57
|
+
# Request
|
|
32
58
|
|
|
33
59
|
|
|
34
|
-
class
|
|
35
|
-
|
|
60
|
+
class ResourceProp(str, Enum):
|
|
61
|
+
"""Superset of former `show` and `extracted` serializations options."""
|
|
36
62
|
|
|
63
|
+
# `show` props
|
|
64
|
+
BASIC = "basic"
|
|
65
|
+
ORIGIN = "origin"
|
|
66
|
+
EXTRA = "extra"
|
|
67
|
+
RELATIONS = "relations"
|
|
68
|
+
VALUES = "values"
|
|
69
|
+
ERRORS = "errors"
|
|
70
|
+
SECURITY = "security"
|
|
71
|
+
# `extracted` props
|
|
72
|
+
EXTRACTED_TEXT = "extracted_text"
|
|
73
|
+
EXTRACTED_METADATA = "extracted_metadata"
|
|
74
|
+
EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
|
|
75
|
+
EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
|
|
76
|
+
EXTRACTED_VECTOR = "extracted_vectors"
|
|
77
|
+
EXTRACTED_LINK = "extracted_link"
|
|
78
|
+
EXTRACTED_FILE = "extracted_file"
|
|
79
|
+
EXTRACTED_QA = "extracted_question_answers"
|
|
80
|
+
# new granular props
|
|
81
|
+
TITLE = "title"
|
|
82
|
+
SUMMARY = "summary"
|
|
83
|
+
CLASSIFICATION_LABELS = "classification_labels"
|
|
37
84
|
|
|
38
|
-
|
|
39
|
-
def
|
|
40
|
-
|
|
41
|
-
|
|
85
|
+
@classmethod
|
|
86
|
+
def from_show_and_extracted(
|
|
87
|
+
cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
|
|
88
|
+
) -> list["ResourceProp"]:
|
|
89
|
+
_show_to_prop = {
|
|
90
|
+
ResourceProperties.BASIC: cls.BASIC,
|
|
91
|
+
ResourceProperties.ORIGIN: cls.ORIGIN,
|
|
92
|
+
ResourceProperties.EXTRA: cls.EXTRA,
|
|
93
|
+
ResourceProperties.RELATIONS: cls.RELATIONS,
|
|
94
|
+
ResourceProperties.VALUES: cls.VALUES,
|
|
95
|
+
ResourceProperties.ERRORS: cls.ERRORS,
|
|
96
|
+
ResourceProperties.SECURITY: cls.SECURITY,
|
|
97
|
+
}
|
|
98
|
+
_extracted_to_prop = {
|
|
99
|
+
ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
|
|
100
|
+
ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
|
|
101
|
+
ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
|
|
102
|
+
ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
|
|
103
|
+
ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
|
|
104
|
+
ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
|
|
105
|
+
ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
|
|
106
|
+
ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
props = []
|
|
110
|
+
for s in show:
|
|
111
|
+
show_prop = _show_to_prop.get(s)
|
|
112
|
+
# show=extracted is not in the dict
|
|
113
|
+
if show_prop is None:
|
|
114
|
+
continue
|
|
115
|
+
props.append(show_prop)
|
|
116
|
+
|
|
117
|
+
if ResourceProperties.EXTRACTED in show:
|
|
118
|
+
for e in extracted:
|
|
119
|
+
extracted_prop = _extracted_to_prop[e]
|
|
120
|
+
props.append(extracted_prop)
|
|
121
|
+
|
|
122
|
+
return props
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class AugmentResourceFields(BaseModel):
|
|
126
|
+
text: bool = False
|
|
127
|
+
classification_labels: bool = False
|
|
128
|
+
|
|
129
|
+
filters: list[filters.Field | filters.Generated]
|
|
42
130
|
|
|
43
131
|
|
|
44
132
|
class AugmentResources(BaseModel):
|
|
45
|
-
given: list[
|
|
133
|
+
given: list[ResourceId]
|
|
134
|
+
|
|
135
|
+
# TODO(decoupled-ask): replace this select for bool fields
|
|
136
|
+
select: list[ResourceProp] = Field(default_factory=list)
|
|
137
|
+
|
|
138
|
+
field_type_filter: list[FieldTypeName] | None = Field(
|
|
139
|
+
default=None,
|
|
140
|
+
deprecated="Only use this for legacy resource serialization",
|
|
141
|
+
title="Field type filter",
|
|
142
|
+
description=(
|
|
143
|
+
"Define which field types are serialized on resources of search results. "
|
|
144
|
+
"If omitted and legacy serialization is used, all field types will be serialized"
|
|
145
|
+
),
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
fields: AugmentResourceFields | None = None
|
|
149
|
+
|
|
150
|
+
@model_validator(mode="after")
|
|
151
|
+
def bwc_resource_serialization(self) -> Self:
|
|
152
|
+
if self.field_type_filter is not None and self.fields is not None:
|
|
153
|
+
raise ValueError("`field_type_filter` and `fields` are incompatible together")
|
|
154
|
+
|
|
155
|
+
return self
|
|
46
156
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
157
|
+
|
|
158
|
+
class AugmentFields(BaseModel):
|
|
159
|
+
given: list[FieldId]
|
|
160
|
+
|
|
161
|
+
text: bool = False
|
|
162
|
+
classification_labels: bool = False
|
|
163
|
+
entities: bool = False # also known as ners
|
|
164
|
+
|
|
165
|
+
# For file fields, augment the path to the thumbnail image
|
|
166
|
+
file_thumbnail: bool = False
|
|
167
|
+
|
|
168
|
+
# When enabled, augment all the messages from the conversation. This is
|
|
169
|
+
# incompatible with max_conversation_messages defined
|
|
170
|
+
full_conversation: bool = False
|
|
171
|
+
|
|
172
|
+
# When `full_conversation` is disabled, this option controls the max amount of messages to be
|
|
173
|
+
# augmented. This number will be a best-effort window centered around the
|
|
174
|
+
# selected message. In addition, the 1st message of the conversation will
|
|
175
|
+
# always be included.
|
|
176
|
+
#
|
|
177
|
+
# This option is combinable with attachments.
|
|
178
|
+
max_conversation_messages: int | None = None
|
|
179
|
+
|
|
180
|
+
# Given a message, if it's a question, try to find an answer. Otherwise,
|
|
181
|
+
# return a window of messages following the requested one.
|
|
182
|
+
#
|
|
183
|
+
# This was previously done without explicit user consent, now it's an option.
|
|
184
|
+
conversation_answer_or_messages_after: bool = False
|
|
185
|
+
|
|
186
|
+
# Both attachment options will only add attachments for the full or the 1st
|
|
187
|
+
# + window, not answer nor messages after
|
|
188
|
+
|
|
189
|
+
# include conversation text attachments
|
|
190
|
+
conversation_text_attachments: bool = False
|
|
191
|
+
# include conversation image attachments
|
|
192
|
+
conversation_image_attachments: bool = False
|
|
193
|
+
|
|
194
|
+
@model_validator(mode="after")
|
|
195
|
+
def validate_cross_options(self):
|
|
196
|
+
if self.full_conversation and self.max_conversation_messages is not None:
|
|
197
|
+
raise ValueError(
|
|
198
|
+
"`full_conversation` and `max_conversation_messages` are not compatible together"
|
|
199
|
+
)
|
|
200
|
+
if (
|
|
201
|
+
(self.conversation_text_attachments or self.conversation_image_attachments)
|
|
202
|
+
and self.full_conversation is False
|
|
203
|
+
and self.max_conversation_messages is None
|
|
204
|
+
):
|
|
205
|
+
raise ValueError(
|
|
206
|
+
"Attachments are only compatible with `full_conversation` and `max_conversation_messages`"
|
|
207
|
+
)
|
|
208
|
+
return self
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# TODO(decoupled-ask): remove unused metadata
|
|
212
|
+
class ParagraphMetadata(BaseModel):
|
|
213
|
+
field_labels: list[str]
|
|
214
|
+
paragraph_labels: list[str]
|
|
215
|
+
|
|
216
|
+
is_an_image: bool
|
|
217
|
+
is_a_table: bool
|
|
218
|
+
|
|
219
|
+
# for extracted from visual content (ocr, inception, tables)
|
|
220
|
+
source_file: str | None
|
|
221
|
+
|
|
222
|
+
# for documents (pdf, docx...) only
|
|
223
|
+
page: int | None
|
|
224
|
+
in_page_with_visual: bool | None
|
|
51
225
|
|
|
52
226
|
|
|
53
227
|
class AugmentParagraph(BaseModel):
|
|
54
228
|
id: ParagraphId
|
|
229
|
+
metadata: ParagraphMetadata | None = None
|
|
55
230
|
|
|
56
231
|
|
|
57
232
|
class AugmentParagraphs(BaseModel):
|
|
@@ -74,12 +249,111 @@ class AugmentParagraphs(BaseModel):
|
|
|
74
249
|
# paragraph from a page, return page preview image
|
|
75
250
|
page_preview_image: bool = False
|
|
76
251
|
|
|
252
|
+
@model_validator(mode="after")
|
|
253
|
+
def table_options_work_together(self) -> Self:
|
|
254
|
+
if not self.table_image and self.table_prefers_page_preview:
|
|
255
|
+
raise ValueError("`table_prefers_page_preview` can only be enabled with `table_image`")
|
|
256
|
+
return self
|
|
257
|
+
|
|
77
258
|
|
|
78
259
|
class AugmentRequest(BaseModel):
|
|
79
|
-
resources: AugmentResources
|
|
80
|
-
|
|
260
|
+
resources: AugmentResources | None = None
|
|
261
|
+
fields: AugmentFields | None = None
|
|
262
|
+
paragraphs: AugmentParagraphs | None = None
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
# Response
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class AugmentedParagraph(BaseModel):
|
|
269
|
+
text: str | None = None
|
|
270
|
+
|
|
271
|
+
neighbours_before: list[ParagraphId] | None = None
|
|
272
|
+
neighbours_after: list[ParagraphId] | None = None
|
|
273
|
+
|
|
274
|
+
source_image: str | None = None
|
|
275
|
+
table_image: str | None = None
|
|
276
|
+
page_preview_image: str | None = None
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
class AugmentedField(BaseModel):
|
|
280
|
+
text: str | None = None
|
|
281
|
+
|
|
282
|
+
classification_labels: dict[str, list[str]] | None = None
|
|
283
|
+
|
|
284
|
+
# former ners
|
|
285
|
+
entities: dict[str, list[str]] | None = None
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
class AugmentedFileField(BaseModel):
|
|
289
|
+
text: str | None = None
|
|
290
|
+
|
|
291
|
+
classification_labels: dict[str, list[str]] | None = None
|
|
292
|
+
|
|
293
|
+
# former ners
|
|
294
|
+
entities: dict[str, list[str]] | None = None
|
|
295
|
+
|
|
296
|
+
# TODO(decoupled-ask): implement image strategy
|
|
297
|
+
page_preview_image: str | None = None
|
|
298
|
+
|
|
299
|
+
# Path for the download API to retrieve the file thumbnail image
|
|
300
|
+
thumbnail_image: str | None = None
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
class AugmentedConversationMessage(BaseModel):
|
|
304
|
+
ident: str
|
|
305
|
+
text: str | None = None
|
|
306
|
+
attachments: list[FieldId] | None = None
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class AugmentedConversationField(BaseModel):
|
|
310
|
+
classification_labels: dict[str, list[str]] | None = None
|
|
311
|
+
# former ners
|
|
312
|
+
entities: dict[str, list[str]] | None = None
|
|
313
|
+
|
|
314
|
+
messages: list[AugmentedConversationMessage] | None = None
|
|
315
|
+
|
|
316
|
+
@property
|
|
317
|
+
def text(self) -> str | None:
|
|
318
|
+
"""Syntactic sugar to access aggregate text from all messages"""
|
|
319
|
+
if self.messages is None:
|
|
320
|
+
return None
|
|
321
|
+
|
|
322
|
+
text = ""
|
|
323
|
+
for message in self.messages:
|
|
324
|
+
text += message.text or ""
|
|
325
|
+
|
|
326
|
+
return text or None
|
|
327
|
+
|
|
328
|
+
@property
|
|
329
|
+
def attachments(self) -> list[FieldId] | None:
|
|
330
|
+
"""Syntactic sugar to access the aggregate of attachments from all messages."""
|
|
331
|
+
if self.messages is None:
|
|
332
|
+
return None
|
|
333
|
+
|
|
334
|
+
has_attachments = False
|
|
335
|
+
attachments = []
|
|
336
|
+
for message in self.messages:
|
|
337
|
+
if message.attachments is None:
|
|
338
|
+
continue
|
|
339
|
+
has_attachments = True
|
|
340
|
+
attachments.extend(message.attachments)
|
|
341
|
+
|
|
342
|
+
if has_attachments:
|
|
343
|
+
return attachments
|
|
344
|
+
else:
|
|
345
|
+
return None
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class AugmentedResource(Resource):
|
|
349
|
+
classification_labels: dict[str, list[str]] | None = None
|
|
350
|
+
|
|
351
|
+
def updated_from(self, origin: Resource):
|
|
352
|
+
for key in origin.model_fields.keys():
|
|
353
|
+
self.__setattr__(key, getattr(origin, key))
|
|
81
354
|
|
|
82
355
|
|
|
83
356
|
class AugmentResponse(BaseModel):
|
|
84
|
-
resources: dict[
|
|
85
|
-
|
|
357
|
+
resources: dict[ResourceId, AugmentedResource]
|
|
358
|
+
fields: dict[FieldId, AugmentedField | AugmentedFileField | AugmentedConversationField]
|
|
359
|
+
paragraphs: dict[ParagraphId, AugmentedParagraph]
|
nucliadb_models/common.py
CHANGED
|
@@ -16,7 +16,7 @@ import base64
|
|
|
16
16
|
import hashlib
|
|
17
17
|
import re
|
|
18
18
|
from enum import Enum
|
|
19
|
-
from typing import Any
|
|
19
|
+
from typing import Any
|
|
20
20
|
|
|
21
21
|
from pydantic import (
|
|
22
22
|
BaseModel,
|
|
@@ -38,7 +38,7 @@ FIELD_TYPE_CHAR_MAP = {
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
STORAGE_FILE_MATCH = re.compile(
|
|
41
|
-
r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
|
|
41
|
+
r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
|
|
42
42
|
)
|
|
43
43
|
DOWNLOAD_TYPE_MAP = {"f": "field", "e": "extracted"}
|
|
44
44
|
DOWNLOAD_URI = "/kb/{kbid}/resource/{rid}/{field_type}/{field_id}/download/{download_type}/{key}"
|
|
@@ -50,9 +50,9 @@ class ParamDefault(BaseModel):
|
|
|
50
50
|
default: Any = None
|
|
51
51
|
title: str
|
|
52
52
|
description: str
|
|
53
|
-
le:
|
|
54
|
-
gt:
|
|
55
|
-
max_items:
|
|
53
|
+
le: float | None = None
|
|
54
|
+
gt: float | None = None
|
|
55
|
+
max_items: int | None = None
|
|
56
56
|
deprecated: bool = False
|
|
57
57
|
|
|
58
58
|
def to_pydantic_field(self, default=_NOT_SET, **kw) -> Field: # type: ignore
|
|
@@ -86,13 +86,13 @@ class FieldID(BaseModel):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class File(BaseModel):
|
|
89
|
-
filename:
|
|
89
|
+
filename: str | None = None
|
|
90
90
|
content_type: str = "application/octet-stream"
|
|
91
|
-
payload:
|
|
92
|
-
md5:
|
|
91
|
+
payload: str | None = Field(default=None, description="Base64 encoded file content")
|
|
92
|
+
md5: str | None = None
|
|
93
93
|
# These are to be used for external files
|
|
94
|
-
uri:
|
|
95
|
-
extra_headers:
|
|
94
|
+
uri: str | None = None
|
|
95
|
+
extra_headers: dict[str, str] = {}
|
|
96
96
|
|
|
97
97
|
@model_validator(mode="after")
|
|
98
98
|
def _check_internal_file_fields(self) -> Self:
|
|
@@ -108,7 +108,7 @@ class File(BaseModel):
|
|
|
108
108
|
if self.md5 is None:
|
|
109
109
|
# In case md5 is not supplied, compute it
|
|
110
110
|
try:
|
|
111
|
-
result = hashlib.md5(base64.b64decode(self.payload))
|
|
111
|
+
result = hashlib.md5(base64.b64decode(self.payload), usedforsecurity=False)
|
|
112
112
|
self.md5 = result.hexdigest()
|
|
113
113
|
except Exception:
|
|
114
114
|
raise ValueError("MD5 could not be computed")
|
|
@@ -134,10 +134,10 @@ class FileB64(BaseModel):
|
|
|
134
134
|
|
|
135
135
|
|
|
136
136
|
class CloudFile(BaseModel):
|
|
137
|
-
uri:
|
|
138
|
-
size:
|
|
139
|
-
content_type:
|
|
140
|
-
bucket_name:
|
|
137
|
+
uri: str | None = None
|
|
138
|
+
size: int | None = None
|
|
139
|
+
content_type: str | None = None
|
|
140
|
+
bucket_name: str | None = None
|
|
141
141
|
|
|
142
142
|
class Source(Enum):
|
|
143
143
|
FLAPS = "FLAPS"
|
|
@@ -146,23 +146,23 @@ class CloudFile(BaseModel):
|
|
|
146
146
|
LOCAL = "LOCAL"
|
|
147
147
|
EXTERNAL = "EXTERNAL"
|
|
148
148
|
|
|
149
|
-
source:
|
|
150
|
-
filename:
|
|
151
|
-
resumable_uri:
|
|
152
|
-
offset:
|
|
153
|
-
upload_uri:
|
|
154
|
-
parts:
|
|
155
|
-
old_uri:
|
|
156
|
-
old_bucket:
|
|
157
|
-
md5:
|
|
149
|
+
source: Source | None
|
|
150
|
+
filename: str | None
|
|
151
|
+
resumable_uri: str | None
|
|
152
|
+
offset: int | None
|
|
153
|
+
upload_uri: str | None
|
|
154
|
+
parts: list[str] | None
|
|
155
|
+
old_uri: str | None
|
|
156
|
+
old_bucket: str | None
|
|
157
|
+
md5: str | None
|
|
158
158
|
|
|
159
159
|
|
|
160
160
|
class CloudLink(BaseModel):
|
|
161
|
-
uri:
|
|
162
|
-
size:
|
|
163
|
-
content_type:
|
|
164
|
-
filename:
|
|
165
|
-
md5:
|
|
161
|
+
uri: str | None = None
|
|
162
|
+
size: int | None = None
|
|
163
|
+
content_type: str | None = None
|
|
164
|
+
filename: str | None = None
|
|
165
|
+
md5: str | None = None
|
|
166
166
|
|
|
167
167
|
@staticmethod
|
|
168
168
|
def format_reader_download_uri(uri: str) -> str:
|
|
@@ -216,12 +216,12 @@ class FieldTypeName(str, Enum):
|
|
|
216
216
|
class FieldRef(BaseModel):
|
|
217
217
|
field_type: FieldTypeName
|
|
218
218
|
field_id: str
|
|
219
|
-
split:
|
|
219
|
+
split: str | None = None
|
|
220
220
|
|
|
221
221
|
|
|
222
222
|
class Classification(BaseModel):
|
|
223
|
-
labelset: str
|
|
224
|
-
label: str
|
|
223
|
+
labelset: str = Field(title="The ID of the labelset")
|
|
224
|
+
label: str = Field(title="The label assigned from the labelset")
|
|
225
225
|
|
|
226
226
|
|
|
227
227
|
class UserClassification(Classification):
|
|
@@ -229,19 +229,19 @@ class UserClassification(Classification):
|
|
|
229
229
|
|
|
230
230
|
|
|
231
231
|
class Sentence(BaseModel):
|
|
232
|
-
start:
|
|
233
|
-
end:
|
|
234
|
-
key:
|
|
232
|
+
start: int | None = None
|
|
233
|
+
end: int | None = None
|
|
234
|
+
key: str | None = None
|
|
235
235
|
|
|
236
236
|
|
|
237
237
|
class PageInformation(BaseModel):
|
|
238
|
-
page:
|
|
239
|
-
page_with_visual:
|
|
238
|
+
page: int | None = Field(default=None, title="Page Information Page")
|
|
239
|
+
page_with_visual: bool | None = None
|
|
240
240
|
|
|
241
241
|
|
|
242
242
|
class Representation(BaseModel):
|
|
243
|
-
is_a_table:
|
|
244
|
-
reference_file:
|
|
243
|
+
is_a_table: bool | None = None
|
|
244
|
+
reference_file: str | None = None
|
|
245
245
|
|
|
246
246
|
|
|
247
247
|
class ParagraphRelations(BaseModel):
|
|
@@ -251,10 +251,10 @@ class ParagraphRelations(BaseModel):
|
|
|
251
251
|
|
|
252
252
|
|
|
253
253
|
class Paragraph(BaseModel):
|
|
254
|
-
start:
|
|
255
|
-
end:
|
|
256
|
-
start_seconds:
|
|
257
|
-
end_seconds:
|
|
254
|
+
start: int | None = None
|
|
255
|
+
end: int | None = None
|
|
256
|
+
start_seconds: list[int] | None = None
|
|
257
|
+
end_seconds: list[int] | None = None
|
|
258
258
|
|
|
259
259
|
class TypeParagraph(str, Enum):
|
|
260
260
|
TEXT = "TEXT"
|
|
@@ -265,35 +265,35 @@ class Paragraph(BaseModel):
|
|
|
265
265
|
TITLE = "TITLE"
|
|
266
266
|
TABLE = "TABLE"
|
|
267
267
|
|
|
268
|
-
kind:
|
|
269
|
-
classifications:
|
|
270
|
-
sentences:
|
|
271
|
-
key:
|
|
272
|
-
page:
|
|
273
|
-
representation:
|
|
274
|
-
relations:
|
|
268
|
+
kind: TypeParagraph | None = None
|
|
269
|
+
classifications: list[Classification] | None = None
|
|
270
|
+
sentences: list[Sentence] | None = None
|
|
271
|
+
key: str | None = None
|
|
272
|
+
page: PageInformation | None = None
|
|
273
|
+
representation: Representation | None = None
|
|
274
|
+
relations: ParagraphRelations | None = None
|
|
275
275
|
|
|
276
276
|
|
|
277
277
|
class Shards(BaseModel):
|
|
278
|
-
shards:
|
|
278
|
+
shards: list[str] | None = None
|
|
279
279
|
|
|
280
280
|
|
|
281
281
|
class Question(BaseModel):
|
|
282
282
|
text: str
|
|
283
|
-
language:
|
|
284
|
-
ids_paragraphs:
|
|
283
|
+
language: str | None = None
|
|
284
|
+
ids_paragraphs: list[str]
|
|
285
285
|
|
|
286
286
|
|
|
287
287
|
class Answer(BaseModel):
|
|
288
288
|
text: str
|
|
289
|
-
language:
|
|
290
|
-
ids_paragraphs:
|
|
289
|
+
language: str | None = None
|
|
290
|
+
ids_paragraphs: list[str]
|
|
291
291
|
|
|
292
292
|
|
|
293
293
|
class QuestionAnswer(BaseModel):
|
|
294
294
|
question: Question
|
|
295
|
-
answers:
|
|
295
|
+
answers: list[Answer]
|
|
296
296
|
|
|
297
297
|
|
|
298
298
|
class QuestionAnswers(BaseModel):
|
|
299
|
-
question_answer:
|
|
299
|
+
question_answer: list[QuestionAnswer]
|
nucliadb_models/configuration.py
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
#
|
|
15
15
|
|
|
16
16
|
import warnings
|
|
17
|
-
from typing import Annotated, Any, Literal
|
|
17
|
+
from typing import Annotated, Any, Literal
|
|
18
18
|
|
|
19
19
|
from pydantic import BaseModel, Field, create_model
|
|
20
20
|
|
|
@@ -28,11 +28,11 @@ class KBConfiguration(BaseModel):
|
|
|
28
28
|
super().__init__(**data)
|
|
29
29
|
|
|
30
30
|
# Do not touch this model synced on Processing side
|
|
31
|
-
semantic_model:
|
|
32
|
-
generative_model:
|
|
33
|
-
ner_model:
|
|
34
|
-
anonymization_model:
|
|
35
|
-
visual_labeling:
|
|
31
|
+
semantic_model: str | None = None
|
|
32
|
+
generative_model: str | None = None
|
|
33
|
+
ner_model: str | None = None
|
|
34
|
+
anonymization_model: str | None = None
|
|
35
|
+
visual_labeling: str | None = None
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
#
|
|
@@ -57,7 +57,7 @@ class FindSearchConfiguration(BaseModel):
|
|
|
57
57
|
AskConfig = create_model(
|
|
58
58
|
"AskConfig",
|
|
59
59
|
**_model_fields(AskRequest, skip=["query", "search_configuration"]),
|
|
60
|
-
query=(
|
|
60
|
+
query=(str | None, None),
|
|
61
61
|
)
|
|
62
62
|
|
|
63
63
|
|
|
@@ -67,7 +67,7 @@ class AskSearchConfiguration(BaseModel):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
SearchConfiguration = Annotated[
|
|
70
|
-
|
|
70
|
+
FindSearchConfiguration | AskSearchConfiguration, Field(discriminator="kind")
|
|
71
71
|
]
|
|
72
72
|
|
|
73
73
|
# We need this to avoid issues with pydantic and generic types defined in another module
|