nucliadb-models 6.9.7.post5550__py3-none-any.whl → 6.10.0.post5792__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb-models might be problematic. Click here for more details.
- nucliadb_models/agents/ingestion.py +4 -4
- nucliadb_models/augment.py +187 -78
- nucliadb_models/common.py +56 -56
- nucliadb_models/configuration.py +8 -8
- nucliadb_models/content_types.py +13 -11
- nucliadb_models/conversation.py +25 -26
- nucliadb_models/entities.py +17 -18
- nucliadb_models/external_index_providers.py +1 -2
- nucliadb_models/extracted.py +82 -83
- nucliadb_models/file.py +10 -11
- nucliadb_models/filters.py +79 -75
- nucliadb_models/graph/requests.py +40 -48
- nucliadb_models/graph/responses.py +13 -1
- nucliadb_models/hydration.py +48 -50
- nucliadb_models/internal/predict.py +7 -9
- nucliadb_models/internal/shards.py +2 -3
- nucliadb_models/labels.py +18 -11
- nucliadb_models/link.py +18 -19
- nucliadb_models/metadata.py +66 -54
- nucliadb_models/notifications.py +3 -3
- nucliadb_models/processing.py +1 -2
- nucliadb_models/resource.py +85 -102
- nucliadb_models/retrieval.py +147 -0
- nucliadb_models/search.py +297 -275
- nucliadb_models/security.py +2 -3
- nucliadb_models/text.py +7 -8
- nucliadb_models/trainset.py +1 -2
- nucliadb_models/utils.py +2 -3
- nucliadb_models/vectors.py +2 -5
- nucliadb_models/writer.py +56 -57
- {nucliadb_models-6.9.7.post5550.dist-info → nucliadb_models-6.10.0.post5792.dist-info}/METADATA +1 -1
- nucliadb_models-6.10.0.post5792.dist-info/RECORD +41 -0
- nucliadb_models-6.9.7.post5550.dist-info/RECORD +0 -40
- {nucliadb_models-6.9.7.post5550.dist-info → nucliadb_models-6.10.0.post5792.dist-info}/WHEEL +0 -0
- {nucliadb_models-6.9.7.post5550.dist-info → nucliadb_models-6.10.0.post5792.dist-info}/top_level.txt +0 -0
|
@@ -13,7 +13,6 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
#
|
|
15
15
|
from enum import Enum
|
|
16
|
-
from typing import Optional
|
|
17
16
|
|
|
18
17
|
from pydantic import BaseModel, Field
|
|
19
18
|
|
|
@@ -41,11 +40,12 @@ class AgentsFilter(BaseModel):
|
|
|
41
40
|
|
|
42
41
|
|
|
43
42
|
class ResourceAgentsRequest(BaseModel):
|
|
44
|
-
filters:
|
|
43
|
+
filters: list[AgentsFilter] | None = Field(
|
|
44
|
+
title="Resource Agent Filters",
|
|
45
45
|
default=None,
|
|
46
46
|
description="Filters to apply to the agents. If None, all curently configured agents are applied.",
|
|
47
47
|
)
|
|
48
|
-
agent_ids:
|
|
48
|
+
agent_ids: list[str] | None = Field(
|
|
49
49
|
default=None,
|
|
50
50
|
title="An optional list of Data Augmentation Agent IDs to run. If None, all configured agents that match the filters are run.",
|
|
51
51
|
)
|
|
@@ -57,7 +57,7 @@ class NewTextField(BaseModel):
|
|
|
57
57
|
|
|
58
58
|
|
|
59
59
|
class AppliedDataAugmentation(BaseModel):
|
|
60
|
-
qas:
|
|
60
|
+
qas: QuestionAnswers | None = Field(
|
|
61
61
|
default=None,
|
|
62
62
|
description="Question and answers generated by the Question Answers agent",
|
|
63
63
|
)
|
nucliadb_models/augment.py
CHANGED
|
@@ -13,16 +13,15 @@
|
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
#
|
|
15
15
|
|
|
16
|
-
from enum import Enum
|
|
17
16
|
from typing import Annotated
|
|
18
17
|
|
|
19
18
|
from pydantic import BaseModel, Field, StringConstraints, model_validator
|
|
20
|
-
from typing_extensions import Self
|
|
19
|
+
from typing_extensions import Self, assert_never
|
|
21
20
|
|
|
22
21
|
from nucliadb_models import filters
|
|
23
22
|
from nucliadb_models.common import FieldTypeName
|
|
24
23
|
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
25
|
-
from nucliadb_models.search import
|
|
24
|
+
from nucliadb_models.search import ResourceProperties, TextPosition
|
|
26
25
|
|
|
27
26
|
ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
|
|
28
27
|
ResourceId = Annotated[
|
|
@@ -57,71 +56,6 @@ ParagraphId = Annotated[
|
|
|
57
56
|
# Request
|
|
58
57
|
|
|
59
58
|
|
|
60
|
-
class ResourceProp(str, Enum):
|
|
61
|
-
"""Superset of former `show` and `extracted` serializations options."""
|
|
62
|
-
|
|
63
|
-
# `show` props
|
|
64
|
-
BASIC = "basic"
|
|
65
|
-
ORIGIN = "origin"
|
|
66
|
-
EXTRA = "extra"
|
|
67
|
-
RELATIONS = "relations"
|
|
68
|
-
VALUES = "values"
|
|
69
|
-
ERRORS = "errors"
|
|
70
|
-
SECURITY = "security"
|
|
71
|
-
# `extracted` props
|
|
72
|
-
EXTRACTED_TEXT = "extracted_text"
|
|
73
|
-
EXTRACTED_METADATA = "extracted_metadata"
|
|
74
|
-
EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
|
|
75
|
-
EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
|
|
76
|
-
EXTRACTED_VECTOR = "extracted_vectors"
|
|
77
|
-
EXTRACTED_LINK = "extracted_link"
|
|
78
|
-
EXTRACTED_FILE = "extracted_file"
|
|
79
|
-
EXTRACTED_QA = "extracted_question_answers"
|
|
80
|
-
# new granular props
|
|
81
|
-
TITLE = "title"
|
|
82
|
-
SUMMARY = "summary"
|
|
83
|
-
CLASSIFICATION_LABELS = "classification_labels"
|
|
84
|
-
|
|
85
|
-
@classmethod
|
|
86
|
-
def from_show_and_extracted(
|
|
87
|
-
cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
|
|
88
|
-
) -> list["ResourceProp"]:
|
|
89
|
-
_show_to_prop = {
|
|
90
|
-
ResourceProperties.BASIC: cls.BASIC,
|
|
91
|
-
ResourceProperties.ORIGIN: cls.ORIGIN,
|
|
92
|
-
ResourceProperties.EXTRA: cls.EXTRA,
|
|
93
|
-
ResourceProperties.RELATIONS: cls.RELATIONS,
|
|
94
|
-
ResourceProperties.VALUES: cls.VALUES,
|
|
95
|
-
ResourceProperties.ERRORS: cls.ERRORS,
|
|
96
|
-
ResourceProperties.SECURITY: cls.SECURITY,
|
|
97
|
-
}
|
|
98
|
-
_extracted_to_prop = {
|
|
99
|
-
ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
|
|
100
|
-
ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
|
|
101
|
-
ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
|
|
102
|
-
ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
|
|
103
|
-
ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
|
|
104
|
-
ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
|
|
105
|
-
ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
|
|
106
|
-
ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
props = []
|
|
110
|
-
for s in show:
|
|
111
|
-
show_prop = _show_to_prop.get(s)
|
|
112
|
-
# show=extracted is not in the dict
|
|
113
|
-
if show_prop is None:
|
|
114
|
-
continue
|
|
115
|
-
props.append(show_prop)
|
|
116
|
-
|
|
117
|
-
if ResourceProperties.EXTRACTED in show:
|
|
118
|
-
for e in extracted:
|
|
119
|
-
extracted_prop = _extracted_to_prop[e]
|
|
120
|
-
props.append(extracted_prop)
|
|
121
|
-
|
|
122
|
-
return props
|
|
123
|
-
|
|
124
|
-
|
|
125
59
|
class AugmentResourceFields(BaseModel):
|
|
126
60
|
text: bool = False
|
|
127
61
|
classification_labels: bool = False
|
|
@@ -132,7 +66,29 @@ class AugmentResourceFields(BaseModel):
|
|
|
132
66
|
class AugmentResources(BaseModel):
|
|
133
67
|
given: list[ResourceId]
|
|
134
68
|
|
|
135
|
-
|
|
69
|
+
# `show` props
|
|
70
|
+
basic: bool = False
|
|
71
|
+
origin: bool = False
|
|
72
|
+
extra: bool = False
|
|
73
|
+
relations: bool = False
|
|
74
|
+
values: bool = False
|
|
75
|
+
errors: bool = False
|
|
76
|
+
security: bool = False
|
|
77
|
+
|
|
78
|
+
# `extracted` props
|
|
79
|
+
extracted_text: bool = False
|
|
80
|
+
extracted_metadata: bool = False
|
|
81
|
+
extracted_shortened_metadata: bool = False
|
|
82
|
+
extracted_large_metadata: bool = False
|
|
83
|
+
extracted_vector: bool = False
|
|
84
|
+
extracted_link: bool = False
|
|
85
|
+
extracted_file: bool = False
|
|
86
|
+
extracted_qa: bool = False
|
|
87
|
+
|
|
88
|
+
# new granular props
|
|
89
|
+
title: bool = False
|
|
90
|
+
summary: bool = False
|
|
91
|
+
classification_labels: bool = False
|
|
136
92
|
|
|
137
93
|
field_type_filter: list[FieldTypeName] | None = Field(
|
|
138
94
|
default=None,
|
|
@@ -153,6 +109,51 @@ class AugmentResources(BaseModel):
|
|
|
153
109
|
|
|
154
110
|
return self
|
|
155
111
|
|
|
112
|
+
def apply_show_and_extracted(
    self, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
) -> None:
    """Switch on the flags that correspond to legacy `show`/`extracted` options.

    Every value in `show` enables the attribute of the same name on this
    model. Values in `extracted` are applied only when `show` contains
    `ResourceProperties.EXTRACTED`; otherwise they are ignored. Flags are
    only ever set to True here, never reset.

    Args:
        show: resource properties to enable.
        extracted: extracted data types to enable when `show` asks for
            extracted data.
    """
    show_extracted = False
    for s in show:
        if s == ResourceProperties.BASIC:
            self.basic = True
        elif s == ResourceProperties.ORIGIN:
            self.origin = True
        elif s == ResourceProperties.EXTRA:
            self.extra = True
        elif s == ResourceProperties.RELATIONS:
            self.relations = True
        elif s == ResourceProperties.VALUES:
            self.values = True
        elif s == ResourceProperties.ERRORS:
            self.errors = True
        elif s == ResourceProperties.SECURITY:
            self.security = True
        elif s == ResourceProperties.EXTRACTED:
            # Remember the request; the `extracted` list is handled below.
            show_extracted = True
        else:  # pragma: no cover
            assert_never(s)

    if show_extracted:
        for e in extracted:
            if e == ExtractedDataTypeName.TEXT:
                self.extracted_text = True
            elif e == ExtractedDataTypeName.METADATA:
                self.extracted_metadata = True
            elif e == ExtractedDataTypeName.SHORTENED_METADATA:
                self.extracted_shortened_metadata = True
            elif e == ExtractedDataTypeName.LARGE_METADATA:
                self.extracted_large_metadata = True
            elif e == ExtractedDataTypeName.VECTOR:
                self.extracted_vector = True
            elif e == ExtractedDataTypeName.LINK:
                self.extracted_link = True
            elif e == ExtractedDataTypeName.FILE:
                self.extracted_file = True
            elif e == ExtractedDataTypeName.QA:
                self.extracted_qa = True
            else:  # pragma: no cover
                # Report the unhandled `extracted` value itself, not the
                # stale loop variable left over from the `show` loop.
                assert_never(e)
|
|
156
|
+
|
|
156
157
|
|
|
157
158
|
class AugmentFields(BaseModel):
|
|
158
159
|
given: list[FieldId]
|
|
@@ -161,11 +162,53 @@ class AugmentFields(BaseModel):
|
|
|
161
162
|
classification_labels: bool = False
|
|
162
163
|
entities: bool = False # also known as ners
|
|
163
164
|
|
|
165
|
+
# For file fields, augment the path to the thumbnail image
|
|
166
|
+
file_thumbnail: bool = False
|
|
167
|
+
|
|
168
|
+
# When enabled, augment all the messages from the conversation. This is
|
|
169
|
+
# incompatible with max_conversation_messages defined
|
|
170
|
+
full_conversation: bool = False
|
|
171
|
+
|
|
172
|
+
# When `full_conversation` is disabled, this option controls the max amount of messages to be
|
|
173
|
+
# augmented. This number will be a best-effort window centered around the
|
|
174
|
+
# selected message. In addition, the 1st message of the conversation will
|
|
175
|
+
# always be included.
|
|
176
|
+
#
|
|
177
|
+
# This option is combinable with attachments.
|
|
178
|
+
max_conversation_messages: int | None = None
|
|
179
|
+
|
|
180
|
+
# Given a message, if it's a question, try to find an answer. Otherwise,
|
|
181
|
+
# return a window of messages following the requested one.
|
|
182
|
+
#
|
|
183
|
+
# This was previously done without explicit user consent, now it's an option.
|
|
184
|
+
conversation_answer_or_messages_after: bool = False
|
|
185
|
+
|
|
186
|
+
# Both attachment options will only add attachments for the full or the 1st
|
|
187
|
+
# + window, not answer nor messages after
|
|
188
|
+
|
|
189
|
+
# include conversation text attachments
|
|
190
|
+
conversation_text_attachments: bool = False
|
|
191
|
+
# include conversation image attachments
|
|
192
|
+
conversation_image_attachments: bool = False
|
|
193
|
+
|
|
194
|
+
@model_validator(mode="after")
def validate_cross_options(self):
    """Reject combinations of conversation options that contradict each other.

    Raises:
        ValueError: if `full_conversation` is enabled together with
            `max_conversation_messages`, or if an attachment option is
            enabled while `full_conversation` is False and
            `max_conversation_messages` is unset.
    """
    window_requested = self.max_conversation_messages is not None
    if window_requested and self.full_conversation:
        raise ValueError(
            "`full_conversation` and `max_conversation_messages` are not compatible together"
        )

    attachments_requested = (
        self.conversation_text_attachments or self.conversation_image_attachments
    )
    if attachments_requested and self.full_conversation is False and not window_requested:
        raise ValueError(
            "Attachments are only compatible with `full_conversation` and `max_conversation_messages`"
        )

    return self
|
|
164
209
|
|
|
165
|
-
class ParagraphMetadata(BaseModel):
|
|
166
|
-
field_labels: list[str]
|
|
167
|
-
paragraph_labels: list[str]
|
|
168
210
|
|
|
211
|
+
class ParagraphMetadata(BaseModel):
|
|
169
212
|
is_an_image: bool
|
|
170
213
|
is_a_table: bool
|
|
171
214
|
|
|
@@ -202,11 +245,17 @@ class AugmentParagraphs(BaseModel):
|
|
|
202
245
|
# paragraph from a page, return page preview image
|
|
203
246
|
page_preview_image: bool = False
|
|
204
247
|
|
|
248
|
+
@model_validator(mode="after")
def table_options_work_together(self) -> Self:
    """Ensure `table_prefers_page_preview` is only set alongside `table_image`.

    Raises:
        ValueError: if `table_prefers_page_preview` is enabled while
            `table_image` is not.
    """
    # Valid when table images are requested, or when the preference is unused.
    if self.table_image or not self.table_prefers_page_preview:
        return self
    raise ValueError("`table_prefers_page_preview` can only be enabled with `table_image`")
|
|
253
|
+
|
|
205
254
|
|
|
206
255
|
class AugmentRequest(BaseModel):
|
|
207
|
-
resources: AugmentResources | None = None
|
|
208
|
-
fields: AugmentFields | None = None
|
|
209
|
-
paragraphs: AugmentParagraphs | None = None
|
|
256
|
+
resources: list[AugmentResources] | None = Field(default=None, min_length=1)
|
|
257
|
+
fields: list[AugmentFields] | None = Field(default=None, min_length=1)
|
|
258
|
+
paragraphs: list[AugmentParagraphs] | None = Field(default=None, min_length=1)
|
|
210
259
|
|
|
211
260
|
|
|
212
261
|
# Response
|
|
@@ -214,11 +263,14 @@ class AugmentRequest(BaseModel):
|
|
|
214
263
|
|
|
215
264
|
class AugmentedParagraph(BaseModel):
|
|
216
265
|
text: str | None = None
|
|
266
|
+
position: TextPosition | None = None
|
|
217
267
|
|
|
218
268
|
neighbours_before: list[ParagraphId] | None = None
|
|
219
269
|
neighbours_after: list[ParagraphId] | None = None
|
|
220
270
|
|
|
221
|
-
|
|
271
|
+
source_image: str | None = None
|
|
272
|
+
table_image: str | None = None
|
|
273
|
+
page_preview_image: str | None = None
|
|
222
274
|
|
|
223
275
|
|
|
224
276
|
class AugmentedField(BaseModel):
|
|
@@ -229,7 +281,64 @@ class AugmentedField(BaseModel):
|
|
|
229
281
|
# former ners
|
|
230
282
|
entities: dict[str, list[str]] | None = None
|
|
231
283
|
|
|
232
|
-
|
|
284
|
+
|
|
285
|
+
# Augmented data for a file field. Every attribute is optional and defaults
# to None.
class AugmentedFileField(BaseModel):
    text: str | None = None

    # Mapping of str -> list[str]; presumably labelset -> labels (confirm)
    classification_labels: dict[str, list[str]] | None = None

    # former ners
    entities: dict[str, list[str]] | None = None

    # NOTE(review): presumably a download path like `thumbnail_image` below — confirm
    page_preview_image: str | None = None

    # Path for the download API to retrieve the file thumbnail image
    thumbnail_image: str | None = None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# A single conversation message in an augment response. Only `ident` is
# required; the remaining attributes default to None.
class AugmentedConversationMessage(BaseModel):
    # NOTE(review): presumably the message identifier within the conversation — confirm
    ident: str
    text: str | None = None
    # Field ids of the attachments of this message
    attachments: list[FieldId] | None = None
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
class AugmentedConversationField(BaseModel):
    classification_labels: dict[str, list[str]] | None = None
    # former ners
    entities: dict[str, list[str]] | None = None

    messages: list[AugmentedConversationMessage] | None = None

    @property
    def text(self) -> str | None:
        """Concatenated text of every message, or None when there is none.

        Messages without text contribute nothing. None is returned both when
        `messages` is None and when the concatenation is empty.
        """
        if self.messages is None:
            return None

        joined = "".join(message.text or "" for message in self.messages)
        return joined or None

    @property
    def attachments(self) -> list[FieldId] | None:
        """Attachments of all messages flattened into one list.

        Returns None when `messages` is None or when no message carries an
        attachment list. A message whose attachment list is empty still
        counts, so the result may be an empty list.
        """
        if self.messages is None:
            return None

        groups = [
            message.attachments
            for message in self.messages
            if message.attachments is not None
        ]
        if not groups:
            return None
        return [attachment for group in groups for attachment in group]
|
|
233
342
|
|
|
234
343
|
|
|
235
344
|
class AugmentedResource(Resource):
|
|
@@ -242,5 +351,5 @@ class AugmentedResource(Resource):
|
|
|
242
351
|
|
|
243
352
|
class AugmentResponse(BaseModel):
|
|
244
353
|
resources: dict[ResourceId, AugmentedResource]
|
|
245
|
-
fields: dict[FieldId, AugmentedField]
|
|
354
|
+
fields: dict[FieldId, AugmentedField | AugmentedFileField | AugmentedConversationField]
|
|
246
355
|
paragraphs: dict[ParagraphId, AugmentedParagraph]
|
nucliadb_models/common.py
CHANGED
|
@@ -16,7 +16,7 @@ import base64
|
|
|
16
16
|
import hashlib
|
|
17
17
|
import re
|
|
18
18
|
from enum import Enum
|
|
19
|
-
from typing import Any
|
|
19
|
+
from typing import Any
|
|
20
20
|
|
|
21
21
|
from pydantic import (
|
|
22
22
|
BaseModel,
|
|
@@ -38,7 +38,7 @@ FIELD_TYPE_CHAR_MAP = {
|
|
|
38
38
|
}
|
|
39
39
|
|
|
40
40
|
STORAGE_FILE_MATCH = re.compile(
|
|
41
|
-
r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
|
|
41
|
+
r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
|
|
42
42
|
)
|
|
43
43
|
DOWNLOAD_TYPE_MAP = {"f": "field", "e": "extracted"}
|
|
44
44
|
DOWNLOAD_URI = "/kb/{kbid}/resource/{rid}/{field_type}/{field_id}/download/{download_type}/{key}"
|
|
@@ -50,9 +50,9 @@ class ParamDefault(BaseModel):
|
|
|
50
50
|
default: Any = None
|
|
51
51
|
title: str
|
|
52
52
|
description: str
|
|
53
|
-
le:
|
|
54
|
-
gt:
|
|
55
|
-
max_items:
|
|
53
|
+
le: float | None = None
|
|
54
|
+
gt: float | None = None
|
|
55
|
+
max_items: int | None = None
|
|
56
56
|
deprecated: bool = False
|
|
57
57
|
|
|
58
58
|
def to_pydantic_field(self, default=_NOT_SET, **kw) -> Field: # type: ignore
|
|
@@ -86,13 +86,13 @@ class FieldID(BaseModel):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class File(BaseModel):
|
|
89
|
-
filename:
|
|
89
|
+
filename: str | None = None
|
|
90
90
|
content_type: str = "application/octet-stream"
|
|
91
|
-
payload:
|
|
92
|
-
md5:
|
|
91
|
+
payload: str | None = Field(default=None, description="Base64 encoded file content")
|
|
92
|
+
md5: str | None = None
|
|
93
93
|
# These are to be used for external files
|
|
94
|
-
uri:
|
|
95
|
-
extra_headers:
|
|
94
|
+
uri: str | None = None
|
|
95
|
+
extra_headers: dict[str, str] = {}
|
|
96
96
|
|
|
97
97
|
@model_validator(mode="after")
|
|
98
98
|
def _check_internal_file_fields(self) -> Self:
|
|
@@ -134,10 +134,10 @@ class FileB64(BaseModel):
|
|
|
134
134
|
|
|
135
135
|
|
|
136
136
|
class CloudFile(BaseModel):
|
|
137
|
-
uri:
|
|
138
|
-
size:
|
|
139
|
-
content_type:
|
|
140
|
-
bucket_name:
|
|
137
|
+
uri: str | None = None
|
|
138
|
+
size: int | None = None
|
|
139
|
+
content_type: str | None = None
|
|
140
|
+
bucket_name: str | None = None
|
|
141
141
|
|
|
142
142
|
class Source(Enum):
|
|
143
143
|
FLAPS = "FLAPS"
|
|
@@ -146,23 +146,23 @@ class CloudFile(BaseModel):
|
|
|
146
146
|
LOCAL = "LOCAL"
|
|
147
147
|
EXTERNAL = "EXTERNAL"
|
|
148
148
|
|
|
149
|
-
source:
|
|
150
|
-
filename:
|
|
151
|
-
resumable_uri:
|
|
152
|
-
offset:
|
|
153
|
-
upload_uri:
|
|
154
|
-
parts:
|
|
155
|
-
old_uri:
|
|
156
|
-
old_bucket:
|
|
157
|
-
md5:
|
|
149
|
+
source: Source | None
|
|
150
|
+
filename: str | None
|
|
151
|
+
resumable_uri: str | None
|
|
152
|
+
offset: int | None
|
|
153
|
+
upload_uri: str | None
|
|
154
|
+
parts: list[str] | None
|
|
155
|
+
old_uri: str | None
|
|
156
|
+
old_bucket: str | None
|
|
157
|
+
md5: str | None
|
|
158
158
|
|
|
159
159
|
|
|
160
160
|
class CloudLink(BaseModel):
|
|
161
|
-
uri:
|
|
162
|
-
size:
|
|
163
|
-
content_type:
|
|
164
|
-
filename:
|
|
165
|
-
md5:
|
|
161
|
+
uri: str | None = None
|
|
162
|
+
size: int | None = None
|
|
163
|
+
content_type: str | None = None
|
|
164
|
+
filename: str | None = None
|
|
165
|
+
md5: str | None = None
|
|
166
166
|
|
|
167
167
|
@staticmethod
|
|
168
168
|
def format_reader_download_uri(uri: str) -> str:
|
|
@@ -216,12 +216,12 @@ class FieldTypeName(str, Enum):
|
|
|
216
216
|
class FieldRef(BaseModel):
|
|
217
217
|
field_type: FieldTypeName
|
|
218
218
|
field_id: str
|
|
219
|
-
split:
|
|
219
|
+
split: str | None = None
|
|
220
220
|
|
|
221
221
|
|
|
222
222
|
class Classification(BaseModel):
|
|
223
|
-
labelset: str
|
|
224
|
-
label: str
|
|
223
|
+
labelset: str = Field(title="The ID of the labelset")
|
|
224
|
+
label: str = Field(title="The label assigned from the labelset")
|
|
225
225
|
|
|
226
226
|
|
|
227
227
|
class UserClassification(Classification):
|
|
@@ -229,19 +229,19 @@ class UserClassification(Classification):
|
|
|
229
229
|
|
|
230
230
|
|
|
231
231
|
class Sentence(BaseModel):
|
|
232
|
-
start:
|
|
233
|
-
end:
|
|
234
|
-
key:
|
|
232
|
+
start: int | None = None
|
|
233
|
+
end: int | None = None
|
|
234
|
+
key: str | None = None
|
|
235
235
|
|
|
236
236
|
|
|
237
237
|
class PageInformation(BaseModel):
|
|
238
|
-
page:
|
|
239
|
-
page_with_visual:
|
|
238
|
+
page: int | None = Field(default=None, title="Page Information Page")
|
|
239
|
+
page_with_visual: bool | None = None
|
|
240
240
|
|
|
241
241
|
|
|
242
242
|
class Representation(BaseModel):
|
|
243
|
-
is_a_table:
|
|
244
|
-
reference_file:
|
|
243
|
+
is_a_table: bool | None = None
|
|
244
|
+
reference_file: str | None = None
|
|
245
245
|
|
|
246
246
|
|
|
247
247
|
class ParagraphRelations(BaseModel):
|
|
@@ -251,10 +251,10 @@ class ParagraphRelations(BaseModel):
|
|
|
251
251
|
|
|
252
252
|
|
|
253
253
|
class Paragraph(BaseModel):
|
|
254
|
-
start:
|
|
255
|
-
end:
|
|
256
|
-
start_seconds:
|
|
257
|
-
end_seconds:
|
|
254
|
+
start: int | None = None
|
|
255
|
+
end: int | None = None
|
|
256
|
+
start_seconds: list[int] | None = None
|
|
257
|
+
end_seconds: list[int] | None = None
|
|
258
258
|
|
|
259
259
|
class TypeParagraph(str, Enum):
|
|
260
260
|
TEXT = "TEXT"
|
|
@@ -265,35 +265,35 @@ class Paragraph(BaseModel):
|
|
|
265
265
|
TITLE = "TITLE"
|
|
266
266
|
TABLE = "TABLE"
|
|
267
267
|
|
|
268
|
-
kind:
|
|
269
|
-
classifications:
|
|
270
|
-
sentences:
|
|
271
|
-
key:
|
|
272
|
-
page:
|
|
273
|
-
representation:
|
|
274
|
-
relations:
|
|
268
|
+
kind: TypeParagraph | None = None
|
|
269
|
+
classifications: list[Classification] | None = None
|
|
270
|
+
sentences: list[Sentence] | None = None
|
|
271
|
+
key: str | None = None
|
|
272
|
+
page: PageInformation | None = None
|
|
273
|
+
representation: Representation | None = None
|
|
274
|
+
relations: ParagraphRelations | None = None
|
|
275
275
|
|
|
276
276
|
|
|
277
277
|
class Shards(BaseModel):
|
|
278
|
-
shards:
|
|
278
|
+
shards: list[str] | None = None
|
|
279
279
|
|
|
280
280
|
|
|
281
281
|
class Question(BaseModel):
|
|
282
282
|
text: str
|
|
283
|
-
language:
|
|
284
|
-
ids_paragraphs:
|
|
283
|
+
language: str | None = None
|
|
284
|
+
ids_paragraphs: list[str]
|
|
285
285
|
|
|
286
286
|
|
|
287
287
|
class Answer(BaseModel):
|
|
288
288
|
text: str
|
|
289
|
-
language:
|
|
290
|
-
ids_paragraphs:
|
|
289
|
+
language: str | None = None
|
|
290
|
+
ids_paragraphs: list[str]
|
|
291
291
|
|
|
292
292
|
|
|
293
293
|
class QuestionAnswer(BaseModel):
|
|
294
294
|
question: Question
|
|
295
|
-
answers:
|
|
295
|
+
answers: list[Answer]
|
|
296
296
|
|
|
297
297
|
|
|
298
298
|
class QuestionAnswers(BaseModel):
|
|
299
|
-
question_answer:
|
|
299
|
+
question_answer: list[QuestionAnswer]
|
nucliadb_models/configuration.py
CHANGED
|
@@ -14,7 +14,7 @@
|
|
|
14
14
|
#
|
|
15
15
|
|
|
16
16
|
import warnings
|
|
17
|
-
from typing import Annotated, Any, Literal
|
|
17
|
+
from typing import Annotated, Any, Literal
|
|
18
18
|
|
|
19
19
|
from pydantic import BaseModel, Field, create_model
|
|
20
20
|
|
|
@@ -28,11 +28,11 @@ class KBConfiguration(BaseModel):
|
|
|
28
28
|
super().__init__(**data)
|
|
29
29
|
|
|
30
30
|
# Do not touch this model synced on Processing side
|
|
31
|
-
semantic_model:
|
|
32
|
-
generative_model:
|
|
33
|
-
ner_model:
|
|
34
|
-
anonymization_model:
|
|
35
|
-
visual_labeling:
|
|
31
|
+
semantic_model: str | None = None
|
|
32
|
+
generative_model: str | None = None
|
|
33
|
+
ner_model: str | None = None
|
|
34
|
+
anonymization_model: str | None = None
|
|
35
|
+
visual_labeling: str | None = None
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
#
|
|
@@ -57,7 +57,7 @@ class FindSearchConfiguration(BaseModel):
|
|
|
57
57
|
AskConfig = create_model(
|
|
58
58
|
"AskConfig",
|
|
59
59
|
**_model_fields(AskRequest, skip=["query", "search_configuration"]),
|
|
60
|
-
query=(
|
|
60
|
+
query=(str | None, None),
|
|
61
61
|
)
|
|
62
62
|
|
|
63
63
|
|
|
@@ -67,7 +67,7 @@ class AskSearchConfiguration(BaseModel):
|
|
|
67
67
|
|
|
68
68
|
|
|
69
69
|
SearchConfiguration = Annotated[
|
|
70
|
-
|
|
70
|
+
FindSearchConfiguration | AskSearchConfiguration, Field(discriminator="kind")
|
|
71
71
|
]
|
|
72
72
|
|
|
73
73
|
# We need this to avoid issues with pydantic and generic types defined in another module
|
nucliadb_models/content_types.py
CHANGED
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
import mimetypes
|
|
17
|
-
from typing import Optional
|
|
18
17
|
|
|
19
18
|
GENERIC_MIME_TYPE = "application/generic"
|
|
20
19
|
|
|
@@ -26,7 +25,9 @@ NUCLIA_CUSTOM_CONTENT_TYPES = {
|
|
|
26
25
|
|
|
27
26
|
EXTRA_VALID_CONTENT_TYPES = {
|
|
28
27
|
"application/font-woff",
|
|
28
|
+
"application/javascript",
|
|
29
29
|
"application/mp4",
|
|
30
|
+
"application/rtf",
|
|
30
31
|
"application/toml",
|
|
31
32
|
"application/vnd.jgraph.mxfile",
|
|
32
33
|
"application/vnd.ms-excel.sheet.macroenabled.12",
|
|
@@ -38,6 +39,7 @@ EXTRA_VALID_CONTENT_TYPES = {
|
|
|
38
39
|
"application/x-git",
|
|
39
40
|
"application/x-gzip",
|
|
40
41
|
"application/x-iwork-pages-sffpages",
|
|
42
|
+
"application/x-javascript",
|
|
41
43
|
"application/x-mach-binary",
|
|
42
44
|
"application/x-mobipocket-ebook",
|
|
43
45
|
"application/x-ms-shortcut",
|
|
@@ -46,10 +48,15 @@ EXTRA_VALID_CONTENT_TYPES = {
|
|
|
46
48
|
"application/x-openscad",
|
|
47
49
|
"application/x-sql",
|
|
48
50
|
"application/x-zip-compressed",
|
|
51
|
+
"application/x-zip",
|
|
49
52
|
"application/zstd",
|
|
53
|
+
"audio/m4a",
|
|
50
54
|
"audio/vnd.dlna.adts",
|
|
51
55
|
"audio/wav",
|
|
52
56
|
"audio/x-m4a",
|
|
57
|
+
"image/svg+xml",
|
|
58
|
+
"image/tif",
|
|
59
|
+
"image/x-ico",
|
|
53
60
|
"model/stl",
|
|
54
61
|
"multipart/form-data",
|
|
55
62
|
"text/jsx",
|
|
@@ -58,26 +65,21 @@ EXTRA_VALID_CONTENT_TYPES = {
|
|
|
58
65
|
"text/rtf",
|
|
59
66
|
"text/x-c++",
|
|
60
67
|
"text/x-java-source",
|
|
68
|
+
"text/x-javascript",
|
|
61
69
|
"text/x-log",
|
|
62
70
|
"text/x-python-script",
|
|
63
71
|
"text/x-ruby-script",
|
|
64
72
|
"text/yaml",
|
|
65
|
-
"video/
|
|
66
|
-
"video/YouTube",
|
|
67
|
-
"image/tif",
|
|
73
|
+
"video/mkv",
|
|
68
74
|
"video/qt",
|
|
69
75
|
"video/webp",
|
|
70
|
-
"
|
|
71
|
-
"application/x-zip",
|
|
72
|
-
"video/mkv",
|
|
73
|
-
"image/x-ico",
|
|
74
|
-
"audio/m4a",
|
|
75
|
-
"image/svg+xml",
|
|
76
|
+
"video/x-m4v",
|
|
76
77
|
"video/x-msvideo",
|
|
78
|
+
"video/YouTube",
|
|
77
79
|
} | NUCLIA_CUSTOM_CONTENT_TYPES
|
|
78
80
|
|
|
79
81
|
|
|
80
|
-
def guess(filename: str) ->
|
|
82
|
+
def guess(filename: str) -> str | None:
|
|
81
83
|
"""
|
|
82
84
|
Guess the content type of a file based on its filename.
|
|
83
85
|
Returns None if the content type could not be guessed.
|