nucliadb-models 6.9.7.post5583__py3-none-any.whl → 6.11.1.post5822__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb_models/agents/ingestion.py +4 -4
- nucliadb_models/augment.py +100 -84
- nucliadb_models/common.py +56 -56
- nucliadb_models/configuration.py +8 -8
- nucliadb_models/content_types.py +13 -11
- nucliadb_models/conversation.py +25 -26
- nucliadb_models/entities.py +17 -18
- nucliadb_models/external_index_providers.py +1 -2
- nucliadb_models/extracted.py +82 -83
- nucliadb_models/file.py +10 -11
- nucliadb_models/filters.py +78 -74
- nucliadb_models/graph/requests.py +40 -48
- nucliadb_models/graph/responses.py +13 -1
- nucliadb_models/hydration.py +48 -50
- nucliadb_models/internal/predict.py +7 -9
- nucliadb_models/internal/shards.py +2 -3
- nucliadb_models/labels.py +18 -11
- nucliadb_models/link.py +18 -19
- nucliadb_models/metadata.py +66 -54
- nucliadb_models/notifications.py +3 -3
- nucliadb_models/processing.py +1 -2
- nucliadb_models/resource.py +85 -93
- nucliadb_models/retrieval.py +147 -0
- nucliadb_models/search.py +263 -275
- nucliadb_models/security.py +2 -3
- nucliadb_models/text.py +7 -8
- nucliadb_models/trainset.py +1 -2
- nucliadb_models/utils.py +2 -3
- nucliadb_models/vectors.py +2 -5
- nucliadb_models/writer.py +56 -57
- {nucliadb_models-6.9.7.post5583.dist-info → nucliadb_models-6.11.1.post5822.dist-info}/METADATA +1 -1
- nucliadb_models-6.11.1.post5822.dist-info/RECORD +41 -0
- {nucliadb_models-6.9.7.post5583.dist-info → nucliadb_models-6.11.1.post5822.dist-info}/WHEEL +1 -1
- nucliadb_models-6.9.7.post5583.dist-info/RECORD +0 -40
- {nucliadb_models-6.9.7.post5583.dist-info → nucliadb_models-6.11.1.post5822.dist-info}/top_level.txt +0 -0
nucliadb_models/agents/ingestion.py CHANGED

@@ -13,7 +13,6 @@
 # limitations under the License.
 #
 from enum import Enum
-from typing import Optional
 
 from pydantic import BaseModel, Field
 
@@ -41,11 +40,12 @@ class AgentsFilter(BaseModel):
 
 
 class ResourceAgentsRequest(BaseModel):
-    filters:
+    filters: list[AgentsFilter] | None = Field(
+        title="Resource Agent Filters",
         default=None,
         description="Filters to apply to the agents. If None, all curently configured agents are applied.",
     )
-    agent_ids:
+    agent_ids: list[str] | None = Field(
        default=None,
        title="An optional list of Data Augmentation Agent IDs to run. If None, all configured agents that match the filters are run.",
    )
@@ -57,7 +57,7 @@ class NewTextField(BaseModel):
 
 
 class AppliedDataAugmentation(BaseModel):
-    qas:
+    qas: QuestionAnswers | None = Field(
        default=None,
        description="Question and answers generated by the Question Answers agent",
    )
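The pattern repeated throughout this release is already visible in this first file: `Optional[X]` annotations are replaced with PEP 604 `X | None` unions, and request fields pick up explicit `Field(title=..., description=...)` metadata. A minimal, self-contained sketch of the new `ResourceAgentsRequest` shape (the `AgentsFilter` stub and its fields are illustrative assumptions, not the package's full definition):

from pydantic import BaseModel, Field


class AgentsFilter(BaseModel):
    # Simplified stand-in for nucliadb_models.agents.ingestion.AgentsFilter
    type: str
    value: str | None = None


class ResourceAgentsRequest(BaseModel):
    # Spelled with Optional[...] in 6.9.x, now a PEP 604 union plus Field metadata
    filters: list[AgentsFilter] | None = Field(
        title="Resource Agent Filters",
        default=None,
        description="Filters to apply to the agents. If None, all configured agents are applied.",
    )
    agent_ids: list[str] | None = Field(
        default=None,
        title="An optional list of Data Augmentation Agent IDs to run.",
    )


# Omitted fields fall back to None; both constructions validate.
print(ResourceAgentsRequest())
print(ResourceAgentsRequest(filters=[AgentsFilter(type="label", value="topic")]))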
nucliadb_models/augment.py CHANGED

@@ -13,16 +13,15 @@
 # limitations under the License.
 #
 
-from enum import Enum
 from typing import Annotated
 
 from pydantic import BaseModel, Field, StringConstraints, model_validator
-from typing_extensions import Self
+from typing_extensions import Self, assert_never
 
 from nucliadb_models import filters
 from nucliadb_models.common import FieldTypeName
 from nucliadb_models.resource import ExtractedDataTypeName, Resource
-from nucliadb_models.search import
+from nucliadb_models.search import ResourceProperties, TextPosition
 
 ResourceIdPattern = r"^([0-9a-f]{32}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})$"
 ResourceId = Annotated[
@@ -57,71 +56,6 @@ ParagraphId = Annotated[
 # Request
 
 
-class ResourceProp(str, Enum):
-    """Superset of former `show` and `extracted` serializations options."""
-
-    # `show` props
-    BASIC = "basic"
-    ORIGIN = "origin"
-    EXTRA = "extra"
-    RELATIONS = "relations"
-    VALUES = "values"
-    ERRORS = "errors"
-    SECURITY = "security"
-    # `extracted` props
-    EXTRACTED_TEXT = "extracted_text"
-    EXTRACTED_METADATA = "extracted_metadata"
-    EXTRACTED_SHORTENED_METADATA = "extracted_shortened_metadata"
-    EXTRACTED_LARGE_METADATA = "extracted_large_metadata"
-    EXTRACTED_VECTOR = "extracted_vectors"
-    EXTRACTED_LINK = "extracted_link"
-    EXTRACTED_FILE = "extracted_file"
-    EXTRACTED_QA = "extracted_question_answers"
-    # new granular props
-    TITLE = "title"
-    SUMMARY = "summary"
-    CLASSIFICATION_LABELS = "classification_labels"
-
-    @classmethod
-    def from_show_and_extracted(
-        cls, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
-    ) -> list["ResourceProp"]:
-        _show_to_prop = {
-            ResourceProperties.BASIC: cls.BASIC,
-            ResourceProperties.ORIGIN: cls.ORIGIN,
-            ResourceProperties.EXTRA: cls.EXTRA,
-            ResourceProperties.RELATIONS: cls.RELATIONS,
-            ResourceProperties.VALUES: cls.VALUES,
-            ResourceProperties.ERRORS: cls.ERRORS,
-            ResourceProperties.SECURITY: cls.SECURITY,
-        }
-        _extracted_to_prop = {
-            ExtractedDataTypeName.TEXT: cls.EXTRACTED_TEXT,
-            ExtractedDataTypeName.METADATA: cls.EXTRACTED_METADATA,
-            ExtractedDataTypeName.SHORTENED_METADATA: cls.EXTRACTED_SHORTENED_METADATA,
-            ExtractedDataTypeName.LARGE_METADATA: cls.EXTRACTED_LARGE_METADATA,
-            ExtractedDataTypeName.VECTOR: cls.EXTRACTED_VECTOR,
-            ExtractedDataTypeName.LINK: cls.EXTRACTED_LINK,
-            ExtractedDataTypeName.FILE: cls.EXTRACTED_FILE,
-            ExtractedDataTypeName.QA: cls.EXTRACTED_QA,
-        }
-
-        props = []
-        for s in show:
-            show_prop = _show_to_prop.get(s)
-            # show=extracted is not in the dict
-            if show_prop is None:
-                continue
-            props.append(show_prop)
-
-        if ResourceProperties.EXTRACTED in show:
-            for e in extracted:
-                extracted_prop = _extracted_to_prop[e]
-                props.append(extracted_prop)
-
-        return props
-
-
 class AugmentResourceFields(BaseModel):
     text: bool = False
     classification_labels: bool = False
@@ -132,8 +66,29 @@ class AugmentResourceFields(BaseModel):
 class AugmentResources(BaseModel):
     given: list[ResourceId]
 
-    #
-
+    # `show` props
+    basic: bool = False
+    origin: bool = False
+    extra: bool = False
+    relations: bool = False
+    values: bool = False
+    errors: bool = False
+    security: bool = False
+
+    # `extracted` props
+    extracted_text: bool = False
+    extracted_metadata: bool = False
+    extracted_shortened_metadata: bool = False
+    extracted_large_metadata: bool = False
+    extracted_vector: bool = False
+    extracted_link: bool = False
+    extracted_file: bool = False
+    extracted_qa: bool = False
+
+    # new granular props
+    title: bool = False
+    summary: bool = False
+    classification_labels: bool = False
 
     field_type_filter: list[FieldTypeName] | None = Field(
         default=None,
@@ -154,6 +109,51 @@ class AugmentResources(BaseModel):
 
         return self
 
+    def apply_show_and_extracted(
+        self, show: list[ResourceProperties], extracted: list[ExtractedDataTypeName]
+    ):
+        show_extracted = False
+        for s in show:
+            if s == ResourceProperties.BASIC:
+                self.basic = True
+            elif s == ResourceProperties.ORIGIN:
+                self.origin = True
+            elif s == ResourceProperties.EXTRA:
+                self.extra = True
+            elif s == ResourceProperties.RELATIONS:
+                self.relations = True
+            elif s == ResourceProperties.VALUES:
+                self.values = True
+            elif s == ResourceProperties.ERRORS:
+                self.errors = True
+            elif s == ResourceProperties.SECURITY:
+                self.security = True
+            elif s == ResourceProperties.EXTRACTED:
+                show_extracted = True
+            else:  # pragma: no cover
+                assert_never(s)
+
+        if show_extracted:
+            for e in extracted:
+                if e == ExtractedDataTypeName.TEXT:
+                    self.extracted_text = True
+                elif e == ExtractedDataTypeName.METADATA:
+                    self.extracted_metadata = True
+                elif e == ExtractedDataTypeName.SHORTENED_METADATA:
+                    self.extracted_shortened_metadata = True
+                elif e == ExtractedDataTypeName.LARGE_METADATA:
+                    self.extracted_large_metadata = True
+                elif e == ExtractedDataTypeName.VECTOR:
+                    self.extracted_vector = True
+                elif e == ExtractedDataTypeName.LINK:
+                    self.extracted_link = True
+                elif e == ExtractedDataTypeName.FILE:
+                    self.extracted_file = True
+                elif e == ExtractedDataTypeName.QA:
+                    self.extracted_qa = True
+                else:  # pragma: no cover
+                    assert_never(s)
+
 
 class AugmentFields(BaseModel):
     given: list[FieldId]
@@ -162,6 +162,9 @@ class AugmentFields(BaseModel):
     classification_labels: bool = False
     entities: bool = False  # also known as ners
 
+    # For file fields, augment the path to the thumbnail image
+    file_thumbnail: bool = False
+
     # When enabled, augment all the messages from the conversation. This is
     # incompatible with max_conversation_messages defined
     full_conversation: bool = False
@@ -205,11 +208,7 @@ class AugmentFields(BaseModel):
         return self
 
 
-# TODO(decoupled-ask): remove unused metadata
 class ParagraphMetadata(BaseModel):
-    field_labels: list[str]
-    paragraph_labels: list[str]
-
     is_an_image: bool
     is_a_table: bool
@@ -234,27 +233,29 @@ class AugmentParagraphs(BaseModel):
     neighbours_before: int = 0
     neighbours_after: int = 0
 
-    # TODO(decoupled-ask): implement image strategy
     # paragraph extracted from an image, return an image
     source_image: bool = False
 
-    # TODO(decoupled-ask): implement image strategy
     # paragraph extracted from a table, return table image
     table_image: bool = False
 
-    # TODO(decoupled-ask): implement image strategy
     # return page_preview instead of table image if table image enabled
     table_prefers_page_preview: bool = False
 
-    # TODO(decoupled-ask): implement image strategy
     # paragraph from a page, return page preview image
     page_preview_image: bool = False
 
+    @model_validator(mode="after")
+    def table_options_work_together(self) -> Self:
+        if not self.table_image and self.table_prefers_page_preview:
+            raise ValueError("`table_prefers_page_preview` can only be enabled with `table_image`")
+        return self
+
 
 class AugmentRequest(BaseModel):
-    resources: AugmentResources | None = None
-    fields: AugmentFields | None = None
-    paragraphs: AugmentParagraphs | None = None
+    resources: list[AugmentResources] | None = Field(default=None, min_length=1)
+    fields: list[AugmentFields] | None = Field(default=None, min_length=1)
+    paragraphs: list[AugmentParagraphs] | None = Field(default=None, min_length=1)
 
 
 # Response
@@ -262,11 +263,14 @@ class AugmentRequest(BaseModel):
 
 class AugmentedParagraph(BaseModel):
     text: str | None = None
+    position: TextPosition | None = None
 
     neighbours_before: list[ParagraphId] | None = None
     neighbours_after: list[ParagraphId] | None = None
 
-
+    source_image: str | None = None
+    table_image: str | None = None
+    page_preview_image: str | None = None
 
 
 class AugmentedField(BaseModel):
@@ -277,7 +281,19 @@ class AugmentedField(BaseModel):
     # former ners
     entities: dict[str, list[str]] | None = None
 
-
+
+class AugmentedFileField(BaseModel):
+    text: str | None = None
+
+    classification_labels: dict[str, list[str]] | None = None
+
+    # former ners
+    entities: dict[str, list[str]] | None = None
+
+    page_preview_image: str | None = None
+
+    # Path for the download API to retrieve the file thumbnail image
+    thumbnail_image: str | None = None
 
 
 class AugmentedConversationMessage(BaseModel):
@@ -335,5 +351,5 @@ class AugmentedResource(Resource):
 
 class AugmentResponse(BaseModel):
     resources: dict[ResourceId, AugmentedResource]
-    fields: dict[FieldId, AugmentedField | AugmentedConversationField]
+    fields: dict[FieldId, AugmentedField | AugmentedFileField | AugmentedConversationField]
     paragraphs: dict[ParagraphId, AugmentedParagraph]
nucliadb_models/common.py CHANGED

@@ -16,7 +16,7 @@ import base64
 import hashlib
 import re
 from enum import Enum
-from typing import Any
+from typing import Any
 
 from pydantic import (
     BaseModel,
@@ -38,7 +38,7 @@ FIELD_TYPE_CHAR_MAP = {
 }
 
 STORAGE_FILE_MATCH = re.compile(
-    r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
+    r"/?kbs/(?P<kbid>[^/]+)/r/(?P<rid>[^/]+)/(?P<download_type>[fe])/(?P<field_type>\w)/(?P<field_id>[^/]+)/?(?P<key>.*)?"
 )
 DOWNLOAD_TYPE_MAP = {"f": "field", "e": "extracted"}
 DOWNLOAD_URI = "/kb/{kbid}/resource/{rid}/{field_type}/{field_id}/download/{download_type}/{key}"
@@ -50,9 +50,9 @@ class ParamDefault(BaseModel):
     default: Any = None
     title: str
     description: str
-    le:
-    gt:
-    max_items:
+    le: float | None = None
+    gt: float | None = None
+    max_items: int | None = None
     deprecated: bool = False
 
     def to_pydantic_field(self, default=_NOT_SET, **kw) -> Field:  # type: ignore
@@ -86,13 +86,13 @@ class FieldID(BaseModel):
 
 
 class File(BaseModel):
-    filename:
+    filename: str | None = None
     content_type: str = "application/octet-stream"
-    payload:
-    md5:
+    payload: str | None = Field(default=None, description="Base64 encoded file content")
+    md5: str | None = None
     # These are to be used for external files
-    uri:
-    extra_headers:
+    uri: str | None = None
+    extra_headers: dict[str, str] = {}
 
     @model_validator(mode="after")
     def _check_internal_file_fields(self) -> Self:
@@ -134,10 +134,10 @@ class FileB64(BaseModel):
 
 
 class CloudFile(BaseModel):
-    uri:
-    size:
-    content_type:
-    bucket_name:
+    uri: str | None = None
+    size: int | None = None
+    content_type: str | None = None
+    bucket_name: str | None = None
 
     class Source(Enum):
         FLAPS = "FLAPS"
@@ -146,23 +146,23 @@ class CloudFile(BaseModel):
         LOCAL = "LOCAL"
         EXTERNAL = "EXTERNAL"
 
-    source:
-    filename:
-    resumable_uri:
-    offset:
-    upload_uri:
-    parts:
-    old_uri:
-    old_bucket:
-    md5:
+    source: Source | None
+    filename: str | None
+    resumable_uri: str | None
+    offset: int | None
+    upload_uri: str | None
+    parts: list[str] | None
+    old_uri: str | None
+    old_bucket: str | None
+    md5: str | None
 
 
 class CloudLink(BaseModel):
-    uri:
-    size:
-    content_type:
-    filename:
-    md5:
+    uri: str | None = None
+    size: int | None = None
+    content_type: str | None = None
+    filename: str | None = None
+    md5: str | None = None
 
     @staticmethod
     def format_reader_download_uri(uri: str) -> str:
@@ -216,12 +216,12 @@ class FieldTypeName(str, Enum):
 class FieldRef(BaseModel):
     field_type: FieldTypeName
     field_id: str
-    split:
+    split: str | None = None
 
 
 class Classification(BaseModel):
-    labelset: str
-    label: str
+    labelset: str = Field(title="The ID of the labelset")
+    label: str = Field(title="The label assigned from the labelset")
 
 
 class UserClassification(Classification):
@@ -229,19 +229,19 @@ class UserClassification(Classification):
 
 
 class Sentence(BaseModel):
-    start:
-    end:
-    key:
+    start: int | None = None
+    end: int | None = None
+    key: str | None = None
 
 
 class PageInformation(BaseModel):
-    page:
-    page_with_visual:
+    page: int | None = Field(default=None, title="Page Information Page")
+    page_with_visual: bool | None = None
 
 
 class Representation(BaseModel):
-    is_a_table:
-    reference_file:
+    is_a_table: bool | None = None
+    reference_file: str | None = None
 
 
 class ParagraphRelations(BaseModel):
@@ -251,10 +251,10 @@ class ParagraphRelations(BaseModel):
 
 
 class Paragraph(BaseModel):
-    start:
-    end:
-    start_seconds:
-    end_seconds:
+    start: int | None = None
+    end: int | None = None
+    start_seconds: list[int] | None = None
+    end_seconds: list[int] | None = None
 
     class TypeParagraph(str, Enum):
         TEXT = "TEXT"
@@ -265,35 +265,35 @@ class Paragraph(BaseModel):
         TITLE = "TITLE"
         TABLE = "TABLE"
 
-    kind:
-    classifications:
-    sentences:
-    key:
-    page:
-    representation:
-    relations:
+    kind: TypeParagraph | None = None
+    classifications: list[Classification] | None = None
+    sentences: list[Sentence] | None = None
+    key: str | None = None
+    page: PageInformation | None = None
+    representation: Representation | None = None
+    relations: ParagraphRelations | None = None
 
 
 class Shards(BaseModel):
-    shards:
+    shards: list[str] | None = None
 
 
 class Question(BaseModel):
     text: str
-    language:
-    ids_paragraphs:
+    language: str | None = None
+    ids_paragraphs: list[str]
 
 
 class Answer(BaseModel):
     text: str
-    language:
-    ids_paragraphs:
+    language: str | None = None
+    ids_paragraphs: list[str]
 
 
 class QuestionAnswer(BaseModel):
     question: Question
-    answers:
+    answers: list[Answer]
 
 
 class QuestionAnswers(BaseModel):
-    question_answer:
+    question_answer: list[QuestionAnswer]
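common.py is almost entirely annotation spelling: every field that lost its type in the old listing is now an explicit `X | None = None` (or a concrete `list[...]`), and `Classification` gains `Field` titles. Based only on the fields visible above, a question/answer payload nests as in this standalone sketch (models redeclared locally so the snippet runs without the package):

from pydantic import BaseModel


class Question(BaseModel):
    text: str
    language: str | None = None
    ids_paragraphs: list[str]


class Answer(BaseModel):
    text: str
    language: str | None = None
    ids_paragraphs: list[str]


class QuestionAnswer(BaseModel):
    question: Question
    answers: list[Answer]


class QuestionAnswers(BaseModel):
    question_answer: list[QuestionAnswer]


qa = QuestionAnswers(
    question_answer=[
        QuestionAnswer(
            question=Question(text="What changed in 6.11?", ids_paragraphs=["p1"]),
            answers=[Answer(text="Mostly type annotations.", ids_paragraphs=["p1"])],
        )
    ]
)
print(qa.model_dump_json(exclude_none=True))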
nucliadb_models/configuration.py CHANGED

@@ -14,7 +14,7 @@
 #
 
 import warnings
-from typing import Annotated, Any, Literal
+from typing import Annotated, Any, Literal
 
 from pydantic import BaseModel, Field, create_model
 
@@ -28,11 +28,11 @@ class KBConfiguration(BaseModel):
         super().__init__(**data)
 
     # Do not touch this model synced on Processing side
-    semantic_model:
-    generative_model:
-    ner_model:
-    anonymization_model:
-    visual_labeling:
+    semantic_model: str | None = None
+    generative_model: str | None = None
+    ner_model: str | None = None
+    anonymization_model: str | None = None
+    visual_labeling: str | None = None
 
 
 #
@@ -57,7 +57,7 @@ class FindSearchConfiguration(BaseModel):
 AskConfig = create_model(
     "AskConfig",
     **_model_fields(AskRequest, skip=["query", "search_configuration"]),
-    query=(
+    query=(str | None, None),
 )
 
 
@@ -67,7 +67,7 @@ class AskSearchConfiguration(BaseModel):
 
 
 SearchConfiguration = Annotated[
-
+    FindSearchConfiguration | AskSearchConfiguration, Field(discriminator="kind")
 ]
 
 # We need this to avoid issues with pydantic and generic types defined in another module
nucliadb_models/content_types.py CHANGED

@@ -14,7 +14,6 @@
 
 
 import mimetypes
-from typing import Optional
 
 GENERIC_MIME_TYPE = "application/generic"
 
@@ -26,7 +25,9 @@ NUCLIA_CUSTOM_CONTENT_TYPES = {
 
 EXTRA_VALID_CONTENT_TYPES = {
     "application/font-woff",
+    "application/javascript",
     "application/mp4",
+    "application/rtf",
     "application/toml",
     "application/vnd.jgraph.mxfile",
     "application/vnd.ms-excel.sheet.macroenabled.12",
@@ -38,6 +39,7 @@ EXTRA_VALID_CONTENT_TYPES = {
     "application/x-git",
     "application/x-gzip",
     "application/x-iwork-pages-sffpages",
+    "application/x-javascript",
     "application/x-mach-binary",
     "application/x-mobipocket-ebook",
     "application/x-ms-shortcut",
@@ -46,10 +48,15 @@ EXTRA_VALID_CONTENT_TYPES = {
     "application/x-openscad",
     "application/x-sql",
     "application/x-zip-compressed",
+    "application/x-zip",
     "application/zstd",
+    "audio/m4a",
     "audio/vnd.dlna.adts",
     "audio/wav",
     "audio/x-m4a",
+    "image/svg+xml",
+    "image/tif",
+    "image/x-ico",
     "model/stl",
     "multipart/form-data",
     "text/jsx",
@@ -58,26 +65,21 @@ EXTRA_VALID_CONTENT_TYPES = {
     "text/rtf",
     "text/x-c++",
     "text/x-java-source",
+    "text/x-javascript",
     "text/x-log",
     "text/x-python-script",
     "text/x-ruby-script",
     "text/yaml",
-    "video/
-    "video/YouTube",
-    "image/tif",
+    "video/mkv",
     "video/qt",
     "video/webp",
-    "
-    "application/x-zip",
-    "video/mkv",
-    "image/x-ico",
-    "audio/m4a",
-    "image/svg+xml",
+    "video/x-m4v",
     "video/x-msvideo",
+    "video/YouTube",
 } | NUCLIA_CUSTOM_CONTENT_TYPES
 
 
-def guess(filename: str) ->
+def guess(filename: str) -> str | None:
     """
     Guess the content type of a file based on its filename.
     Returns None if the content type could not be guessed.
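The section is cut off inside guess(), but its contract is already visible from the signature and docstring: map a filename to a content-type string, presumably via the standard mimetypes module it imports, and return None when nothing can be guessed. A small usage sketch under those assumptions:

from nucliadb_models.content_types import guess

print(guess("report.pdf"))        # typically "application/pdf" via mimetypes
print(guess("notes.unknownext"))  # None when the type cannot be guessed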