nucliadb-models 6.8.1.post4983__py3-none-any.whl → 6.9.5.post5452__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb-models might be problematic. Click here for more details.
- nucliadb_models/augment.py +85 -0
- nucliadb_models/common.py +9 -0
- nucliadb_models/configuration.py +1 -1
- nucliadb_models/conversation.py +7 -5
- nucliadb_models/external_index_providers.py +5 -19
- nucliadb_models/hydration.py +425 -0
- nucliadb_models/metadata.py +18 -3
- nucliadb_models/search.py +98 -34
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/METADATA +2 -3
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/RECORD +12 -10
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/WHEEL +0 -0
- {nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Copyright 2025 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
|
|
18
|
+
from nucliadb_models.common import FieldTypeName
|
|
19
|
+
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
|
20
|
+
from nucliadb_models.search import Image, ResourceProperties, SearchParamDefaults
|
|
21
|
+
|
|
22
|
+
ParagraphId = str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class AugmentedParagraph(BaseModel):
|
|
26
|
+
text: str | None = None
|
|
27
|
+
|
|
28
|
+
neighbours_before: dict[ParagraphId, str] | None = None
|
|
29
|
+
neighbours_after: dict[ParagraphId, str] | None = None
|
|
30
|
+
|
|
31
|
+
image: Image | None = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AugmentedField(BaseModel):
|
|
35
|
+
page_preview_image: Image | None = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class AugmentedResource(Resource):
    """A `Resource` that can be refreshed in place from another resource."""

    def updated_from(self, origin: Resource) -> None:
        """Copy every declared model field of `origin` onto this instance.

        Args:
            origin: resource whose field values are copied. Only the fields
                declared on `origin`'s model class are considered.
        """
        # Read `model_fields` from the class rather than the instance:
        # instance access is deprecated in recent Pydantic releases.
        for key in type(origin).model_fields:
            setattr(self, key, getattr(origin, key))
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class AugmentResources(BaseModel):
|
|
45
|
+
given: list[str]
|
|
46
|
+
|
|
47
|
+
show: list[ResourceProperties] = SearchParamDefaults.show.to_pydantic_field()
|
|
48
|
+
extracted: list[ExtractedDataTypeName] = SearchParamDefaults.extracted.to_pydantic_field()
|
|
49
|
+
field_type_filter: list[FieldTypeName] = SearchParamDefaults.field_type_filter.to_pydantic_field()
|
|
50
|
+
# TODO: field name filter, da field prefix filter
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class AugmentParagraph(BaseModel):
|
|
54
|
+
id: ParagraphId
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class AugmentParagraphs(BaseModel):
|
|
58
|
+
given: list[AugmentParagraph]
|
|
59
|
+
|
|
60
|
+
text: bool = True
|
|
61
|
+
|
|
62
|
+
neighbours_before: int = 0
|
|
63
|
+
neighbours_after: int = 0
|
|
64
|
+
|
|
65
|
+
# paragraph extracted from an image, return an image
|
|
66
|
+
source_image: bool = False
|
|
67
|
+
|
|
68
|
+
# paragraph extracted from a table, return table image
|
|
69
|
+
table_image: bool = False
|
|
70
|
+
|
|
71
|
+
# return page_preview instead of table image if table image enabled
|
|
72
|
+
table_prefers_page_preview: bool = False
|
|
73
|
+
|
|
74
|
+
# paragraph from a page, return page preview image
|
|
75
|
+
page_preview_image: bool = False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class AugmentRequest(BaseModel):
|
|
79
|
+
resources: AugmentResources
|
|
80
|
+
paragraphs: AugmentParagraphs
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class AugmentResponse(BaseModel):
|
|
84
|
+
resources: dict[str, AugmentedResource]
|
|
85
|
+
paragraphs: dict[str, AugmentedParagraph]
|
nucliadb_models/common.py
CHANGED
|
@@ -203,6 +203,15 @@ class FieldTypeName(str, Enum):
|
|
|
203
203
|
"a": FieldTypeName.GENERIC,
|
|
204
204
|
}[abbr]
|
|
205
205
|
|
|
206
|
+
def abbreviation(self) -> str:
    """Return the one-letter abbreviation associated with this field type.

    Raises:
        KeyError: if this member has no entry in the abbreviation table.
    """
    abbreviations = {
        FieldTypeName.TEXT: "t",
        FieldTypeName.FILE: "f",
        FieldTypeName.LINK: "u",
        FieldTypeName.CONVERSATION: "c",
        FieldTypeName.GENERIC: "a",
    }
    return abbreviations[self]
|
|
214
|
+
|
|
206
215
|
|
|
207
216
|
class FieldRef(BaseModel):
|
|
208
217
|
field_type: FieldTypeName
|
nucliadb_models/configuration.py
CHANGED
|
@@ -44,7 +44,7 @@ def _model_fields(model: type[BaseModel], skip: list[str]) -> dict[str, Any]:
|
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
# FindConfig is a
|
|
47
|
+
# FindConfig is a FindRequest without `search_configuration`
|
|
48
48
|
FindConfig = create_model("FindConfig", **_model_fields(FindRequest, skip=["search_configuration"]))
|
|
49
49
|
|
|
50
50
|
|
nucliadb_models/conversation.py
CHANGED
|
@@ -86,10 +86,10 @@ class FieldConversation(BaseModel):
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class InputMessageContent(BaseModel):
|
|
89
|
-
text: str
|
|
89
|
+
text: str = Field()
|
|
90
90
|
format: MessageFormat = MessageFormat.PLAIN
|
|
91
|
-
attachments: List[FileB64] = []
|
|
92
|
-
attachments_fields: List[FieldRef] = []
|
|
91
|
+
attachments: List[FileB64] = Field(default=[], max_length=50)
|
|
92
|
+
attachments_fields: List[FieldRef] = Field(default=[], max_length=50)
|
|
93
93
|
|
|
94
94
|
|
|
95
95
|
class InputMessage(BaseModel):
|
|
@@ -102,10 +102,12 @@ class InputMessage(BaseModel):
|
|
|
102
102
|
to: List[str] = Field(
|
|
103
103
|
default_factory=list,
|
|
104
104
|
description="List of recipients of the message, e.g. ['assistant'] or ['user']",
|
|
105
|
+
max_length=100,
|
|
105
106
|
)
|
|
106
107
|
content: InputMessageContent
|
|
107
108
|
ident: str = Field(
|
|
108
|
-
description="Unique identifier for the message. Must be unique within the conversation."
|
|
109
|
+
description="Unique identifier for the message. Must be unique within the conversation.",
|
|
110
|
+
max_length=128,
|
|
109
111
|
)
|
|
110
112
|
type_: Optional[MessageType] = Field(None, alias="type")
|
|
111
113
|
|
|
@@ -126,7 +128,7 @@ class InputMessage(BaseModel):
|
|
|
126
128
|
class InputConversationField(BaseModel):
|
|
127
129
|
messages: List[InputMessage] = Field(
|
|
128
130
|
default_factory=list,
|
|
129
|
-
description="List of messages in the conversation field. Each message must have a unique ident.",
|
|
131
|
+
description="List of messages in the conversation field. Each message must have a unique ident. A single conversation can contain up to 51,200 messages. You can add up to 2,048 messages per request.",
|
|
130
132
|
)
|
|
131
133
|
extract_strategy: Optional[str] = Field(
|
|
132
134
|
default=None,
|
|
@@ -22,32 +22,18 @@ from pydantic import BaseModel
|
|
|
22
22
|
class ExternalIndexProviderType(str, Enum):
|
|
23
23
|
"""
|
|
24
24
|
Enum for the different external index providers.
|
|
25
|
-
For now
|
|
25
|
+
For now none are supported, but we may add some in the future.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
|
|
28
|
+
UNSET = "unset"
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class ExternalIndexProviderBase(BaseModel):
|
|
32
32
|
type: ExternalIndexProviderType
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
class
|
|
36
|
-
|
|
37
|
-
List of cloud providers supported by Pinecone serverless vector database.
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
AWS_US_EAST_1 = "aws_us_east_1"
|
|
41
|
-
AWS_US_WEST_2 = "aws_us_west_2"
|
|
42
|
-
AWS_EU_WEST_1 = "aws_eu_west_1"
|
|
43
|
-
GCP_US_CENTRAL1 = "gcp_us_central1"
|
|
44
|
-
AZURE_EASTUS2 = "azure_eastus2"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class PineconeIndexProvider(ExternalIndexProviderBase):
|
|
48
|
-
type: ExternalIndexProviderType = ExternalIndexProviderType.PINECONE
|
|
49
|
-
api_key: str
|
|
50
|
-
serverless_cloud: PineconeServerlessCloud
|
|
35
|
+
class DummyIndexProvider(ExternalIndexProviderBase):
|
|
36
|
+
type: ExternalIndexProviderType = ExternalIndexProviderType.UNSET
|
|
51
37
|
|
|
52
38
|
|
|
53
|
-
ExternalIndexProvider = Union[
|
|
39
|
+
ExternalIndexProvider = Union[DummyIndexProvider,]
|
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
# Copyright 2025 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
from typing import Annotated, Optional, Union
|
|
16
|
+
|
|
17
|
+
from pydantic import BaseModel, Field, StringConstraints
|
|
18
|
+
|
|
19
|
+
from nucliadb_models.common import FieldTypeName
|
|
20
|
+
from nucliadb_models.metadata import Origin
|
|
21
|
+
from nucliadb_models.resource import FieldConversation, FieldFile, FieldLink, FieldText
|
|
22
|
+
from nucliadb_models.search import Image
|
|
23
|
+
from nucliadb_models.security import ResourceSecurity
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ResourceHydration(BaseModel, extra="forbid"):
|
|
27
|
+
title: bool = Field(
|
|
28
|
+
default=True,
|
|
29
|
+
description="Hydrate resource titles",
|
|
30
|
+
)
|
|
31
|
+
summary: bool = Field(
|
|
32
|
+
default=False,
|
|
33
|
+
description="Hydrate resource summaries",
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
origin: bool = Field(
|
|
37
|
+
default=False,
|
|
38
|
+
description="Hydrate resource origin",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
security: bool = Field(
|
|
42
|
+
default=False,
|
|
43
|
+
description="Hydrate resource security metadata",
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class TextFieldHydration(BaseModel, extra="forbid"):
|
|
48
|
+
value: bool = Field(
|
|
49
|
+
default=False,
|
|
50
|
+
description="Hydrate text field values. Field values are similar payloads to the ones used to create them",
|
|
51
|
+
)
|
|
52
|
+
extracted_text: bool = Field(
|
|
53
|
+
default=False,
|
|
54
|
+
description="Hydrate extracted text for text fields",
|
|
55
|
+
)
|
|
56
|
+
# TODO: what else should be interesting to add?
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class FileFieldHydration(BaseModel, extra="forbid"):
|
|
60
|
+
value: bool = Field(
|
|
61
|
+
default=False,
|
|
62
|
+
description="Hydrate file field values. Field values are similar payloads to the ones used to create them",
|
|
63
|
+
)
|
|
64
|
+
extracted_text: bool = Field(
|
|
65
|
+
default=False,
|
|
66
|
+
description="Hydrate extracted text for file fields",
|
|
67
|
+
)
|
|
68
|
+
# TODO: what else should be interesting to add?
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class LinkFieldHydration(BaseModel, extra="forbid"):
|
|
72
|
+
value: bool = Field(
|
|
73
|
+
default=False,
|
|
74
|
+
description="Hydrate link field values. Field values are similar payloads to the ones used to create them",
|
|
75
|
+
)
|
|
76
|
+
extracted_text: bool = Field(
|
|
77
|
+
default=False,
|
|
78
|
+
description="Hydrate extracted text for link fields",
|
|
79
|
+
)
|
|
80
|
+
# TODO: what else should be interesting to add?
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class ConversationFieldHydration(BaseModel, extra="forbid"):
|
|
84
|
+
value: bool = Field(
|
|
85
|
+
default=False,
|
|
86
|
+
description="Hydrate conversation field values. Field values are similar payloads to the ones used to create them",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# TODO: add fields to hydrate conversation fields. Think about how to handle
|
|
90
|
+
# splits and fulfill the conversational RAG strategies
|
|
91
|
+
|
|
92
|
+
# TODO: what else should be interesting to add?
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class GenericFieldHydration(BaseModel, extra="forbid"):
|
|
96
|
+
value: bool = Field(
|
|
97
|
+
default=False,
|
|
98
|
+
description="Hydrate generic field values. Field values are similar payloads to the ones used to create them",
|
|
99
|
+
)
|
|
100
|
+
extracted_text: bool = Field(
|
|
101
|
+
default=False,
|
|
102
|
+
description="Hydrate extracted text for generic fields",
|
|
103
|
+
)
|
|
104
|
+
# TODO: what else should be interesting to add?
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class FieldHydration(BaseModel, extra="forbid"):
|
|
108
|
+
text: Optional[TextFieldHydration] = Field(
|
|
109
|
+
default_factory=TextFieldHydration,
|
|
110
|
+
description="Text fields hydration options",
|
|
111
|
+
)
|
|
112
|
+
file: Optional[FileFieldHydration] = Field(
|
|
113
|
+
default_factory=FileFieldHydration,
|
|
114
|
+
description="File fields hydration options",
|
|
115
|
+
)
|
|
116
|
+
link: Optional[LinkFieldHydration] = Field(
|
|
117
|
+
default_factory=LinkFieldHydration,
|
|
118
|
+
description="Link fields hydration options",
|
|
119
|
+
)
|
|
120
|
+
conversation: Optional[ConversationFieldHydration] = Field(
|
|
121
|
+
default_factory=ConversationFieldHydration,
|
|
122
|
+
description="Conversation fields hydration options",
|
|
123
|
+
)
|
|
124
|
+
generic: Optional[GenericFieldHydration] = Field(
|
|
125
|
+
default_factory=GenericFieldHydration,
|
|
126
|
+
description="Generic fields hydration options",
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class NeighbourParagraphHydration(BaseModel, extra="forbid"):
|
|
131
|
+
before: int = Field(
|
|
132
|
+
default=2,
|
|
133
|
+
ge=0,
|
|
134
|
+
description="Number of previous paragraphs to hydrate",
|
|
135
|
+
)
|
|
136
|
+
after: int = Field(
|
|
137
|
+
default=2,
|
|
138
|
+
ge=0,
|
|
139
|
+
description="Number of following paragraphs to hydrate",
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class RelatedParagraphHydration(BaseModel, extra="forbid"):
|
|
144
|
+
neighbours: Optional[NeighbourParagraphHydration] = Field(
|
|
145
|
+
default=None,
|
|
146
|
+
description="Hydrate extra paragraphs that surround the original one",
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# TODO: FEATURE: implement related paragraphs by page
|
|
150
|
+
# page: bool = Field(
|
|
151
|
+
# default=False,
|
|
152
|
+
# description="Hydrate all paragraphs in the same page. This only applies to fields with pages",
|
|
153
|
+
# )
|
|
154
|
+
|
|
155
|
+
# TODO: description
|
|
156
|
+
# XXX: should we let users control the amount of elements?
|
|
157
|
+
parents: bool = False
|
|
158
|
+
# TODO: description
|
|
159
|
+
# XXX: should we let users control the amount of elements?
|
|
160
|
+
siblings: bool = False
|
|
161
|
+
# TODO: description
|
|
162
|
+
# XXX: should we let users control the amount of elements?
|
|
163
|
+
replacements: bool = False
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ImageParagraphHydration(BaseModel, extra="forbid"):
|
|
167
|
+
# The source image is also known as reference or reference_file in the
|
|
168
|
+
# paragraph context. The reference/reference_file is the filename of the
|
|
169
|
+
# source image from which the paragraph has been extracted
|
|
170
|
+
source_image: bool = Field(
|
|
171
|
+
default=False,
|
|
172
|
+
description=(
|
|
173
|
+
"When a paragraph has been extracted from an image (using OCR, inception...), "
|
|
174
|
+
"hydrate the image that represents it"
|
|
175
|
+
),
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class TableParagraphHydration(BaseModel, extra="forbid"):
|
|
180
|
+
# TODO: implement. ARAG uses the label "/k/table" to check whether a
|
|
181
|
+
# paragraph is a table or not. We can also use info on maindb
|
|
182
|
+
table_page_preview: bool = Field(
|
|
183
|
+
default=False,
|
|
184
|
+
description="Hydrate the page preview for the table. This will only hydrate fields with pages",
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class ParagraphPageHydration(BaseModel, extra="forbid"):
|
|
189
|
+
# For some field types (file and link) learning generates previews. A
|
|
190
|
+
# preview is a PDF file representing the content. For a docx, for example, it is
|
|
191
|
+
# the PDF equivalent. Depending on the field type, the preview can
|
|
192
|
+
# represent, for example, a page in a document or a portion of a webpage.
|
|
193
|
+
page_with_visual: bool = Field(
|
|
194
|
+
default=False,
|
|
195
|
+
description=(
|
|
196
|
+
"When a paragraph has been extracted from a page containing visual "
|
|
197
|
+
"content (images, tables...), hydrate the preview of the paragraph's "
|
|
198
|
+
"page as an image. Not all field types have previews nor visual content"
|
|
199
|
+
),
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class ParagraphHydration(BaseModel, extra="forbid"):
    """Options selecting which paragraph data should be hydrated.

    Unknown keys are rejected (`extra="forbid"`). Every nested option group
    defaults to `None`; only `text` is enabled by default.
    """

    text: bool = Field(
        default=True,
        description="Hydrate paragraph text",
    )
    image: Optional[ImageParagraphHydration] = Field(
        default=None,
        description="Hydrate options for paragraphs extracted from images (using OCR, inception...)",
    )
    table: Optional[TableParagraphHydration] = Field(
        default=None,
        description="Hydrate options for paragraphs extracted from tables",
    )

    # TODO: at some point, we should add hydration options for paragraphs from
    # audio and video

    page: Optional[ParagraphPageHydration] = Field(
        default=None,
        description="Hydrate options for paragraphs within a page. This applies to paragraphs in fields with pages",
    )

    related: Optional[RelatedParagraphHydration] = Field(
        default=None,
        description="Hydration options for related paragraphs. For example, neighbours or sibling paragraphs",
    )
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class Hydration(BaseModel, extra="forbid"):
|
|
232
|
+
resource: Optional[ResourceHydration] = Field(
|
|
233
|
+
default_factory=ResourceHydration,
|
|
234
|
+
description="Resource hydration options",
|
|
235
|
+
)
|
|
236
|
+
field: FieldHydration = Field(
|
|
237
|
+
default_factory=FieldHydration,
|
|
238
|
+
description="Field hydration options",
|
|
239
|
+
)
|
|
240
|
+
paragraph: ParagraphHydration = Field(
|
|
241
|
+
default_factory=ParagraphHydration,
|
|
242
|
+
description="Paragraph hydration options",
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
ParagraphId = Annotated[
|
|
247
|
+
str,
|
|
248
|
+
StringConstraints(
|
|
249
|
+
# resource-uuid/field-type/field-id/[split-id/]paragraph-id
|
|
250
|
+
pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+(/[^/]{1,128})?/[0-9]+-[0-9]+$",
|
|
251
|
+
min_length=32 + 1 + 1 + 1 + 1 + 0 + 0 + 1 + 3,
|
|
252
|
+
# max field id of 250 and 10 digit paragraphs. More than enough
|
|
253
|
+
max_length=32 + 1 + 1 + 1 + 250 + 1 + 128 + 1 + 21,
|
|
254
|
+
),
|
|
255
|
+
]
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
class HydrateRequest(BaseModel, extra="forbid"):
|
|
259
|
+
data: list[ParagraphId] = Field(
|
|
260
|
+
description="List of paragraph ids we want to hydrate",
|
|
261
|
+
max_length=50,
|
|
262
|
+
)
|
|
263
|
+
hydration: Hydration = Field(description="Description of how hydration must be performed")
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
### Response models
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
class HydratedResource(BaseModel, extra="forbid"):
|
|
270
|
+
id: str = Field(description="Unique resource id")
|
|
271
|
+
slug: str = Field(description="Resource slug")
|
|
272
|
+
|
|
273
|
+
title: Optional[str] = None
|
|
274
|
+
summary: Optional[str] = None
|
|
275
|
+
|
|
276
|
+
origin: Optional[Origin] = None
|
|
277
|
+
|
|
278
|
+
security: Optional[ResourceSecurity] = None
|
|
279
|
+
|
|
280
|
+
# TODO: add resource labels to hydrated resources
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class FieldExtractedData(BaseModel, extra="forbid"):
|
|
284
|
+
text: Optional[str] = None
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
class SplitFieldExtractedData(BaseModel, extra="forbid"):
|
|
288
|
+
texts: Optional[dict[str, str]] = None
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
class HydratedTextField(BaseModel, extra="forbid"):
    """Response model for a hydrated text field.

    `value` and `extracted` default to `None`.
    """

    # `description` must be passed by keyword: the first positional argument
    # of `Field` is the default value, not the description.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.TEXT

    value: Optional[FieldText] = None
    extracted: Optional[FieldExtractedData] = None
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class HydratedFileField(BaseModel, extra="forbid"):
    """Response model for a hydrated file field.

    `value`, `extracted` and `previews` default to `None`.
    """

    # `description` must be passed by keyword: the first positional argument
    # of `Field` is the default value, not the description.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.FILE

    value: Optional[FieldFile] = None
    extracted: Optional[FieldExtractedData] = None

    previews: Optional[dict[str, Image]] = Field(
        default=None,
        title="Previews of specific parts of the field",
        # Adjacent string literals are concatenated verbatim, so each
        # fragment carries its own trailing space.
        description=(
            "Previews for specific pages of this field. Previews are different "
            "depending on the file type. For example, for a PDF file, a preview "
            "will be an image of a single page. "
            "In this field, previews will be populated according to the hydration "
            "options requested."
        ),
    )
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
class HydratedLinkField(BaseModel, extra="forbid"):
    """Response model for a hydrated link field.

    `value` and `extracted` default to `None`.
    """

    # `description` must be passed by keyword: the first positional argument
    # of `Field` is the default value, not the description.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.LINK

    value: Optional[FieldLink] = None
    extracted: Optional[FieldExtractedData] = None
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
class HydratedConversationField(BaseModel, extra="forbid"):
    """Response model for a hydrated conversation field.

    `value` and `extracted` default to `None`.
    """

    # `description` must be passed by keyword: the first positional argument
    # of `Field` is the default value, not the description.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.CONVERSATION

    value: Optional[FieldConversation] = None
    extracted: Optional[FieldExtractedData] = None
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
class HydratedGenericField(BaseModel, extra="forbid"):
    """Response model for a hydrated generic field.

    `value` and `extracted` default to `None`.
    """

    # `description` must be passed by keyword: the first positional argument
    # of `Field` is the default value, not the description.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    # Generic fields report their own type; each sibling Hydrated*Field model
    # defaults to the FieldTypeName member matching its name.
    field_type: FieldTypeName = FieldTypeName.GENERIC

    value: Optional[str] = None
    extracted: Optional[FieldExtractedData] = None
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
class RelatedNeighbourParagraphRefs(BaseModel, extra="forbid"):
|
|
349
|
+
before: Optional[list[str]] = None
|
|
350
|
+
after: Optional[list[str]] = None
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
class RelatedParagraphRefs(BaseModel, extra="forbid"):
|
|
354
|
+
neighbours: Optional[RelatedNeighbourParagraphRefs] = None
|
|
355
|
+
parents: Optional[list[str]] = None
|
|
356
|
+
siblings: Optional[list[str]] = None
|
|
357
|
+
replacements: Optional[list[str]] = None
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
class HydratedParagraphImage(BaseModel, extra="forbid"):
|
|
361
|
+
source_image: Optional[Image] = Field(
|
|
362
|
+
default=None,
|
|
363
|
+
description=(
|
|
364
|
+
"Source image for this paragraph. This only applies to paragraphs "
|
|
365
|
+
"extracted from an image using OCR or inception, and if this "
|
|
366
|
+
"hydration option has been enabled in the request"
|
|
367
|
+
),
|
|
368
|
+
)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
class HydratedParagraphTable(BaseModel, extra="forbid"):
    """Table-specific data attached to a hydrated paragraph."""

    page_preview_ref: Optional[str] = Field(
        default=None,
        description=(
            "Reference to the page preview for this paragraph. The actual "
            "preview will be found in the previews of its field. This only "
            "applies to paragraphs generated from a table and if the "
            "corresponding hydration option has been enabled in the request"
        ),
    )
|
|
381
|
+
|
|
382
|
+
|
|
383
|
+
class HydratedParagraphPage(BaseModel, extra="forbid"):
|
|
384
|
+
page_preview_ref: Optional[str] = Field(
|
|
385
|
+
default=None,
|
|
386
|
+
description=(
|
|
387
|
+
"Reference to the page preview for this paragraph. The actual "
|
|
388
|
+
"preview will be found in the previews of its field. This only "
|
|
389
|
+
"applies to paragraphs extracted from a page containing visual "
|
|
390
|
+
"content and if the corresponding hydration option has been enabled "
|
|
391
|
+
"in the request"
|
|
392
|
+
),
|
|
393
|
+
)
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
class HydratedParagraph(BaseModel, extra="forbid"):
|
|
397
|
+
id: str = Field(description="Unique paragraph id")
|
|
398
|
+
field: str = Field(description="Paragraph field id")
|
|
399
|
+
resource: str = Field(description="Paragraph resource id")
|
|
400
|
+
|
|
401
|
+
text: Optional[str] = None
|
|
402
|
+
|
|
403
|
+
# TODO: add labels to hydrated paragraphs
|
|
404
|
+
# labels: Optional[list[str]] = None
|
|
405
|
+
|
|
406
|
+
related: Optional[RelatedParagraphRefs] = None
|
|
407
|
+
|
|
408
|
+
image: Optional[HydratedParagraphImage] = None
|
|
409
|
+
table: Optional[HydratedParagraphTable] = None
|
|
410
|
+
page: Optional[HydratedParagraphPage] = None
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
class Hydrated(BaseModel, extra="forbid"):
|
|
414
|
+
resources: dict[str, HydratedResource]
|
|
415
|
+
fields: dict[
|
|
416
|
+
str,
|
|
417
|
+
Union[
|
|
418
|
+
HydratedTextField,
|
|
419
|
+
HydratedFileField,
|
|
420
|
+
HydratedLinkField,
|
|
421
|
+
HydratedConversationField,
|
|
422
|
+
HydratedGenericField,
|
|
423
|
+
],
|
|
424
|
+
]
|
|
425
|
+
paragraphs: dict[str, HydratedParagraph]
|
nucliadb_models/metadata.py
CHANGED
|
@@ -17,7 +17,7 @@ from datetime import datetime
|
|
|
17
17
|
from enum import Enum
|
|
18
18
|
from typing import Any, Dict, List, Optional
|
|
19
19
|
|
|
20
|
-
from pydantic import BaseModel, Field, model_validator
|
|
20
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
21
21
|
from typing_extensions import Self
|
|
22
22
|
|
|
23
23
|
from nucliadb_models.utils import DateTime
|
|
@@ -231,15 +231,24 @@ class InputOrigin(BaseModel):
|
|
|
231
231
|
default=[],
|
|
232
232
|
title="Tags",
|
|
233
233
|
description="Resource tags about the origin system. It can later be used for filtering on search endpoints with '/origin.tags/{tag}'",
|
|
234
|
+
max_length=300,
|
|
234
235
|
)
|
|
235
|
-
collaborators: List[str] = []
|
|
236
|
+
collaborators: List[str] = Field(default=[], max_length=100)
|
|
236
237
|
filename: Optional[str] = None
|
|
237
|
-
related: List[str] = []
|
|
238
|
+
related: List[str] = Field(default=[], max_length=100)
|
|
238
239
|
path: Optional[str] = Field(
|
|
239
240
|
default=None,
|
|
240
241
|
description="Path of the original resource. Typically used to store folder structure information of the resource at the origin system. It can be later used for filtering on search endpoints with '/origin.path/{path}'",
|
|
242
|
+
max_length=2048,
|
|
241
243
|
)
|
|
242
244
|
|
|
245
|
+
@field_validator("tags")
@classmethod
def validate_tag_length(cls, tags):
    """Reject any tag longer than the maximum allowed length.

    Args:
        tags: the tag strings supplied for the `tags` field.

    Returns:
        The same `tags` value, unchanged, when every tag is short enough.

    Raises:
        ValueError: if any tag exceeds the maximum length.
    """
    # The enforced limit is kept as it was; the error message is built from
    # it so the reported number always matches the check.
    max_tag_length = 512
    for tag in tags:
        if len(tag) > max_tag_length:
            raise ValueError(f"Each tag must be at most {max_tag_length} characters long")
    return tags
|
|
251
|
+
|
|
243
252
|
|
|
244
253
|
class Origin(InputOrigin):
|
|
245
254
|
# Created and modified are redefined to
|
|
@@ -247,6 +256,12 @@ class Origin(InputOrigin):
|
|
|
247
256
|
created: Optional[datetime] = None
|
|
248
257
|
modified: Optional[datetime] = None
|
|
249
258
|
|
|
259
|
+
tags: List[str] = Field(
|
|
260
|
+
default=[],
|
|
261
|
+
title="Tags",
|
|
262
|
+
description="Resource tags about the origin system. It can later be used for filtering on search endpoints with '/origin.tags/{tag}'",
|
|
263
|
+
)
|
|
264
|
+
|
|
250
265
|
class Source(Enum):
|
|
251
266
|
WEB = "WEB"
|
|
252
267
|
DESKTOP = "DESKTOP"
|
nucliadb_models/search.py
CHANGED
|
@@ -79,8 +79,9 @@ ANSWER_JSON_SCHEMA_EXAMPLE = {
|
|
|
79
79
|
class ModelParamDefaults:
|
|
80
80
|
applied_autofilters = ParamDefault(
|
|
81
81
|
default=[],
|
|
82
|
-
title="
|
|
83
|
-
description="
|
|
82
|
+
title="Applied autofilters",
|
|
83
|
+
description="[deprecated] list of filters automatically applied to the search query",
|
|
84
|
+
deprecated=True,
|
|
84
85
|
)
|
|
85
86
|
|
|
86
87
|
|
|
@@ -296,6 +297,8 @@ class KnowledgeboxSearchResults(JsonBaseModel):
|
|
|
296
297
|
relations: Optional[Relations] = None
|
|
297
298
|
nodes: Optional[list[dict[str, str]]] = None
|
|
298
299
|
shards: Optional[list[str]] = None
|
|
300
|
+
|
|
301
|
+
# TODO: remove on a future major release
|
|
299
302
|
autofilters: list[str] = ModelParamDefaults.applied_autofilters.to_pydantic_field()
|
|
300
303
|
|
|
301
304
|
|
|
@@ -344,10 +347,12 @@ SortOrderMap = {
|
|
|
344
347
|
|
|
345
348
|
class SortOptions(BaseModel):
|
|
346
349
|
field: SortField
|
|
347
|
-
limit: Optional[int] = Field(None, gt=0)
|
|
348
350
|
order: SortOrder = SortOrder.DESC
|
|
349
351
|
|
|
350
352
|
|
|
353
|
+
MAX_RANK_FUSION_WINDOW = 500
|
|
354
|
+
|
|
355
|
+
|
|
351
356
|
class RankFusionName(str, Enum):
|
|
352
357
|
RECIPROCAL_RANK_FUSION = "rrf"
|
|
353
358
|
|
|
@@ -377,7 +382,7 @@ class ReciprocalRankFusion(_BaseRankFusion):
|
|
|
377
382
|
)
|
|
378
383
|
window: Optional[int] = Field(
|
|
379
384
|
default=None,
|
|
380
|
-
le=
|
|
385
|
+
le=MAX_RANK_FUSION_WINDOW,
|
|
381
386
|
title="RRF window",
|
|
382
387
|
description="Number of elements for retrieval to do RRF. Window must be greater or equal to top_k. Greater values will increase probability of multi match at cost of retrieval time", # noqa: E501
|
|
383
388
|
)
|
|
@@ -481,11 +486,6 @@ class SearchParamDefaults:
|
|
|
481
486
|
description="The list of facets to calculate. The facets follow the same syntax as filters: https://docs.nuclia.dev/docs/rag/advanced/search-filters", # noqa: E501
|
|
482
487
|
max_items=50,
|
|
483
488
|
)
|
|
484
|
-
autofilter = ParamDefault(
|
|
485
|
-
default=False,
|
|
486
|
-
title="Automatic search filtering",
|
|
487
|
-
description="If set to true, the search will automatically add filters to the query. For example, it will filter results containing the entities detected in the query", # noqa: E501
|
|
488
|
-
)
|
|
489
489
|
chat_query = ParamDefault(
|
|
490
490
|
default=...,
|
|
491
491
|
title="Query",
|
|
@@ -505,10 +505,18 @@ class SearchParamDefaults:
|
|
|
505
505
|
)
|
|
506
506
|
top_k = ParamDefault(
|
|
507
507
|
default=20,
|
|
508
|
+
gt=-1,
|
|
508
509
|
le=200,
|
|
509
510
|
title="Top k",
|
|
510
511
|
description="The number of results search should return. The maximum number of results allowed is 200.",
|
|
511
512
|
)
|
|
513
|
+
offset = ParamDefault(
|
|
514
|
+
default=0,
|
|
515
|
+
gt=-1,
|
|
516
|
+
le=1000,
|
|
517
|
+
title="Results offset",
|
|
518
|
+
description="The number of results to skip, starting from the beginning in sort order. Used for pagination. It can only be used with the keyword and fulltext indexes.",
|
|
519
|
+
)
|
|
512
520
|
highlight = ParamDefault(
|
|
513
521
|
default=False,
|
|
514
522
|
title="Highlight",
|
|
@@ -534,12 +542,6 @@ class SearchParamDefaults:
|
|
|
534
542
|
title="Sort order",
|
|
535
543
|
description="Order to sort results with",
|
|
536
544
|
)
|
|
537
|
-
sort_limit = ParamDefault(
|
|
538
|
-
default=None,
|
|
539
|
-
title="Sort limit",
|
|
540
|
-
description="",
|
|
541
|
-
gt=0,
|
|
542
|
-
)
|
|
543
545
|
sort_field = ParamDefault(
|
|
544
546
|
default=None,
|
|
545
547
|
title="Sort field",
|
|
@@ -876,7 +878,9 @@ class BaseSearchRequest(AuditMetadataBase):
|
|
|
876
878
|
vectorset: Optional[str] = SearchParamDefaults.vectorset.to_pydantic_field()
|
|
877
879
|
with_duplicates: bool = SearchParamDefaults.with_duplicates.to_pydantic_field()
|
|
878
880
|
with_synonyms: bool = SearchParamDefaults.with_synonyms.to_pydantic_field()
|
|
879
|
-
autofilter
|
|
881
|
+
# autofilter is deprecated and its logic was removed. We're just keeping it in the model definition to
|
|
882
|
+
# avoid breaking changes in the python sdks. Please remove on a future major release.
|
|
883
|
+
autofilter: SkipJsonSchema[bool] = False
|
|
880
884
|
resource_filters: list[str] = SearchParamDefaults.resource_filters.to_pydantic_field()
|
|
881
885
|
security: Optional[RequestSecurity] = SearchParamDefaults.security.to_pydantic_field()
|
|
882
886
|
show_hidden: bool = SearchParamDefaults.show_hidden.to_pydantic_field()
|
|
@@ -938,12 +942,32 @@ class SearchRequest(BaseSearchRequest):
|
|
|
938
942
|
)
|
|
939
943
|
faceted: list[str] = SearchParamDefaults.faceted.to_pydantic_field()
|
|
940
944
|
sort: Optional[SortOptions] = SearchParamDefaults.sort.to_pydantic_field()
|
|
945
|
+
offset: int = SearchParamDefaults.offset.to_pydantic_field()
|
|
941
946
|
|
|
942
947
|
@field_validator("faceted")
|
|
943
948
|
@classmethod
|
|
944
949
|
def nested_facets_not_supported(cls, facets):
|
|
945
950
|
return validate_facets(facets)
|
|
946
951
|
|
|
952
|
+
@model_validator(mode="after")
|
|
953
|
+
def offset_sort_only_on_keyword_indexes(self):
|
|
954
|
+
has_non_keyword_indexes = set(self.features) & {SearchOptions.SEMANTIC, SearchOptions.RELATIONS}
|
|
955
|
+
if has_non_keyword_indexes:
|
|
956
|
+
if self.offset > 0:
|
|
957
|
+
raise ValueError("offset cannot be used with the semantic or relations index")
|
|
958
|
+
if self.sort and self.sort.field != SortField.SCORE:
|
|
959
|
+
raise ValueError("sort by date cannot be used with the semantic or relations index")
|
|
960
|
+
|
|
961
|
+
return self
|
|
962
|
+
|
|
963
|
+
@field_validator("sort", mode="after")
|
|
964
|
+
@classmethod
|
|
965
|
+
def sorting_by_title_not_supported(cls, value: Optional[SortOptions]) -> Optional[SortOptions]:
|
|
966
|
+
if value and value.field == SortField.TITLE:
|
|
967
|
+
raise ValueError("sorting by title not supported in /search")
|
|
968
|
+
|
|
969
|
+
return value
|
|
970
|
+
|
|
947
971
|
|
|
948
972
|
class Author(str, Enum):
|
|
949
973
|
NUCLIA = "NUCLIA"
|
|
@@ -1008,6 +1032,12 @@ class Reasoning(BaseModel):
|
|
|
1008
1032
|
)
|
|
1009
1033
|
|
|
1010
1034
|
|
|
1035
|
+
class CitationsType(str, Enum):
|
|
1036
|
+
NONE = "none"
|
|
1037
|
+
DEFAULT = "default"
|
|
1038
|
+
LLM_FOOTNOTES = "llm_footnotes"
|
|
1039
|
+
|
|
1040
|
+
|
|
1011
1041
|
class ChatModel(BaseModel):
|
|
1012
1042
|
"""
|
|
1013
1043
|
This is the model for the predict request payload on the chat endpoint
|
|
@@ -1039,10 +1069,16 @@ class ChatModel(BaseModel):
|
|
|
1039
1069
|
user_prompt: Optional[UserPrompt] = Field(
|
|
1040
1070
|
default=None, description="Optional custom prompt input by the user"
|
|
1041
1071
|
)
|
|
1042
|
-
citations: bool = Field(
|
|
1072
|
+
citations: Union[bool, None, CitationsType] = Field(
|
|
1073
|
+
default=None,
|
|
1074
|
+
description="Whether to include citations in the response. "
|
|
1075
|
+
"If set to None or False, no citations will be computed. "
|
|
1076
|
+
"If set to True or 'default', citations will be computed after answer generation and send as a separate `CitationsGenerativeResponse` chunk. "
|
|
1077
|
+
"If set to 'llm_footnotes', citations will be included in the LLM's response as markdown-styled footnotes. A `FootnoteCitationsGenerativeResponse` chunk will also be sent to map footnote ids to context keys in the `query_context`.",
|
|
1078
|
+
)
|
|
1043
1079
|
citation_threshold: Optional[float] = Field(
|
|
1044
1080
|
default=None,
|
|
1045
|
-
description="If citations is True, this
|
|
1081
|
+
description="If citations is set to True or 'default', this will be the similarity threshold. Value between 0 and 1, lower values will produce more citations. If not set, it will be set to the optimized threshold found by Nuclia.",
|
|
1046
1082
|
ge=0.0,
|
|
1047
1083
|
le=1.0,
|
|
1048
1084
|
)
|
|
@@ -1158,7 +1194,7 @@ ALLOWED_FIELD_TYPES: dict[str, str] = {
|
|
|
1158
1194
|
"t": "text",
|
|
1159
1195
|
"f": "file",
|
|
1160
1196
|
"u": "link",
|
|
1161
|
-
"
|
|
1197
|
+
"c": "conversation",
|
|
1162
1198
|
"a": "generic",
|
|
1163
1199
|
}
|
|
1164
1200
|
|
|
@@ -1166,16 +1202,19 @@ ALLOWED_FIELD_TYPES: dict[str, str] = {
|
|
|
1166
1202
|
class FieldExtensionStrategy(RagStrategy):
|
|
1167
1203
|
name: Literal["field_extension"] = "field_extension"
|
|
1168
1204
|
fields: list[str] = Field(
|
|
1205
|
+
default=[],
|
|
1169
1206
|
title="Fields",
|
|
1170
|
-
description="List of field ids to extend the context with. It will try to extend the retrieval context with the specified fields in the matching resources. The field ids have to be in the format `{field_type}/{field_name}`, like 'a/title', 'a/summary' for title and summary fields or 't/amend' for a text field named 'amend'.",
|
|
1171
|
-
|
|
1207
|
+
description="List of field ids to extend the context with. It will try to extend the retrieval context with the specified fields in the matching resources. The field ids have to be in the format `{field_type}/{field_name}`, like 'a/title', 'a/summary' for title and summary fields or 't/amend' for a text field named 'amend'.",
|
|
1208
|
+
)
|
|
1209
|
+
data_augmentation_field_prefixes: list[str] = Field(
|
|
1210
|
+
default=[],
|
|
1211
|
+
description="List of prefixes for data augmentation added fields to extend the context with. For example, if the prefix is 'simpson', all fields that are a result of data augmentation with that prefix will be used to extend the context.",
|
|
1172
1212
|
)
|
|
1173
1213
|
|
|
1174
|
-
@
|
|
1175
|
-
|
|
1176
|
-
def fields_validator(cls, fields) -> Self:
|
|
1214
|
+
@model_validator(mode="after")
|
|
1215
|
+
def field_extension_strategy_validator(self) -> Self:
|
|
1177
1216
|
# Check that the fields are in the format {field_type}/{field_name}
|
|
1178
|
-
for field in fields:
|
|
1217
|
+
for field in self.fields:
|
|
1179
1218
|
try:
|
|
1180
1219
|
field_type, _ = field.strip("/").split("/")
|
|
1181
1220
|
except ValueError:
|
|
@@ -1188,8 +1227,7 @@ class FieldExtensionStrategy(RagStrategy):
|
|
|
1188
1227
|
f"Field '{field}' does not have a valid field type. "
|
|
1189
1228
|
f"Valid field types are: {allowed_field_types_part}."
|
|
1190
1229
|
)
|
|
1191
|
-
|
|
1192
|
-
return fields
|
|
1230
|
+
return self
|
|
1193
1231
|
|
|
1194
1232
|
|
|
1195
1233
|
class FullResourceApplyTo(BaseModel):
|
|
@@ -1227,6 +1265,7 @@ class HierarchyResourceStrategy(RagStrategy):
|
|
|
1227
1265
|
title="Count",
|
|
1228
1266
|
description="Number of extra characters that are added to each matching paragraph when adding to the context.",
|
|
1229
1267
|
ge=0,
|
|
1268
|
+
le=1024,
|
|
1230
1269
|
)
|
|
1231
1270
|
|
|
1232
1271
|
|
|
@@ -1455,7 +1494,7 @@ class PageImageStrategy(ImageRagStrategy):
|
|
|
1455
1494
|
count: Optional[int] = Field(
|
|
1456
1495
|
default=None,
|
|
1457
1496
|
title="Count",
|
|
1458
|
-
description="Maximum number of images to retrieve
|
|
1497
|
+
description="Maximum number of page images to retrieve. By default, at most 5 images are retrieved.",
|
|
1459
1498
|
)
|
|
1460
1499
|
|
|
1461
1500
|
|
|
@@ -1606,7 +1645,11 @@ class AskRequest(AuditMetadataBase):
|
|
|
1606
1645
|
description="Image that will be used together with the query text for retrieval and then sent to the LLM as part of the context. "
|
|
1607
1646
|
"If a query image is provided, the `extra_context_images` and `rag_images_strategies` will be disabled.",
|
|
1608
1647
|
)
|
|
1609
|
-
|
|
1648
|
+
|
|
1649
|
+
# autofilter is deprecated and its logic was removed. We're just keeping it in the model definition to
|
|
1650
|
+
# avoid breaking changes in the python sdks. Please remove on a future major release.
|
|
1651
|
+
autofilter: SkipJsonSchema[bool] = False
|
|
1652
|
+
|
|
1610
1653
|
highlight: bool = SearchParamDefaults.highlight.to_pydantic_field()
|
|
1611
1654
|
resource_filters: list[str] = SearchParamDefaults.resource_filters.to_pydantic_field()
|
|
1612
1655
|
prompt: Optional[Union[str, CustomPrompt]] = Field(
|
|
@@ -1616,13 +1659,16 @@ class AskRequest(AuditMetadataBase):
|
|
|
1616
1659
|
)
|
|
1617
1660
|
rank_fusion: Union[RankFusionName, RankFusion] = SearchParamDefaults.rank_fusion.to_pydantic_field()
|
|
1618
1661
|
reranker: Union[RerankerName, Reranker] = SearchParamDefaults.reranker.to_pydantic_field()
|
|
1619
|
-
citations: bool = Field(
|
|
1620
|
-
default=
|
|
1621
|
-
description="Whether to include
|
|
1662
|
+
citations: Union[bool, None, CitationsType] = Field(
|
|
1663
|
+
default=None,
|
|
1664
|
+
description="Whether to include citations in the response. "
|
|
1665
|
+
"If set to None or False, no citations will be computed. "
|
|
1666
|
+
"If set to True or 'default', citations will be computed after answer generation and send as a separate `CitationsGenerativeResponse` chunk. "
|
|
1667
|
+
"If set to 'llm_footnotes', citations will be included in the LLM's response as markdown-styled footnotes. A `FootnoteCitationsGenerativeResponse` chunk will also be sent to map footnote ids to context keys in the `query_context`.",
|
|
1622
1668
|
)
|
|
1623
1669
|
citation_threshold: Optional[float] = Field(
|
|
1624
1670
|
default=None,
|
|
1625
|
-
description="If citations is True, this
|
|
1671
|
+
description="If citations is set to True or 'default', this will be the similarity threshold. Value between 0 and 1, lower values will produce more citations. If not set, it will be set to the optimized threshold found by Nuclia.",
|
|
1626
1672
|
ge=0.0,
|
|
1627
1673
|
le=1.0,
|
|
1628
1674
|
)
|
|
@@ -2257,10 +2303,15 @@ class SyncAskResponse(BaseModel):
|
|
|
2257
2303
|
description="The detected relations of the answer",
|
|
2258
2304
|
)
|
|
2259
2305
|
citations: dict[str, Any] = Field(
|
|
2260
|
-
|
|
2306
|
+
default_factory=dict,
|
|
2261
2307
|
title="Citations",
|
|
2262
2308
|
description="The citations of the answer. List of references to the resources used to generate the answer.",
|
|
2263
2309
|
)
|
|
2310
|
+
citation_footnote_to_context: dict[str, str] = Field(
|
|
2311
|
+
default_factory=dict,
|
|
2312
|
+
title="Citation footnote to context",
|
|
2313
|
+
description="""Maps ids in the footnote citations to query_context keys (normally paragraph ids)""",
|
|
2314
|
+
)
|
|
2264
2315
|
augmented_context: Optional[AugmentedContext] = Field(
|
|
2265
2316
|
default=None,
|
|
2266
2317
|
description=(
|
|
@@ -2370,6 +2421,18 @@ class CitationsAskResponseItem(BaseModel):
|
|
|
2370
2421
|
citations: dict[str, Any]
|
|
2371
2422
|
|
|
2372
2423
|
|
|
2424
|
+
class FootnoteCitationsAskResponseItem(BaseModel):
|
|
2425
|
+
type: Literal["footnote_citations"] = "footnote_citations"
|
|
2426
|
+
footnote_to_context: dict[str, str] = Field(
|
|
2427
|
+
description="""Maps ids in the footnote citations to query_context keys (normally paragraph ids)
|
|
2428
|
+
e.g.,
|
|
2429
|
+
{ "block-AA": "f44f4e8acbfb1d48de3fd3c2fb04a885/f/f44f4e8acbfb1d48de3fd3c2fb04a885/73758-73972", ... }
|
|
2430
|
+
If the query_context is a list, it will map to 1-based indices as strings
|
|
2431
|
+
e.g., { "block-AA": "1", "block-AB": "2", ... }
|
|
2432
|
+
"""
|
|
2433
|
+
)
|
|
2434
|
+
|
|
2435
|
+
|
|
2373
2436
|
class StatusAskResponseItem(BaseModel):
|
|
2374
2437
|
type: Literal["status"] = "status"
|
|
2375
2438
|
code: str
|
|
@@ -2400,6 +2463,7 @@ AskResponseItemType = Union[
|
|
|
2400
2463
|
MetadataAskResponseItem,
|
|
2401
2464
|
AugmentedContextResponseItem,
|
|
2402
2465
|
CitationsAskResponseItem,
|
|
2466
|
+
FootnoteCitationsAskResponseItem,
|
|
2403
2467
|
StatusAskResponseItem,
|
|
2404
2468
|
ErrorAskResponseItem,
|
|
2405
2469
|
RetrievalAskResponseItem,
|
{nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/METADATA
RENAMED
|
@@ -1,19 +1,18 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nucliadb_models
|
|
3
|
-
Version: 6.
|
|
3
|
+
Version: 6.9.5.post5452
|
|
4
4
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Project-URL: Homepage, https://nuclia.com
|
|
7
7
|
Project-URL: Repository, https://github.com/nuclia/nucliadb
|
|
8
8
|
Classifier: Development Status :: 4 - Beta
|
|
9
9
|
Classifier: Programming Language :: Python
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
11
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
12
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
13
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
14
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
-
Requires-Python: <4,>=3.
|
|
15
|
+
Requires-Python: <4,>=3.10
|
|
17
16
|
Description-Content-Type: text/markdown
|
|
18
17
|
Requires-Dist: pydantic!=2.11.5,!=2.11.6,>=2.6
|
|
19
18
|
|
{nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/RECORD
RENAMED
|
@@ -1,22 +1,24 @@
|
|
|
1
1
|
nucliadb_models/__init__.py,sha256=3y8-htogKuCZcbhaUZdSjTeEjUSeec9aRWyL8AlKCyM,1077
|
|
2
|
-
nucliadb_models/
|
|
3
|
-
nucliadb_models/
|
|
2
|
+
nucliadb_models/augment.py,sha256=vAtFh4D4eC4nvfwaRTlfeuAMOL9Z9TFZnUNiRAMasss,2543
|
|
3
|
+
nucliadb_models/common.py,sha256=2dtKG4ZNi9p-yoNY76Uvyu1SlMeNYpH-MnuU3Q6w9Js,8169
|
|
4
|
+
nucliadb_models/configuration.py,sha256=BBrJsNjP324Cw_5J3dBrGwvpkHQYbXEo3TUaI9IqAOg,2449
|
|
4
5
|
nucliadb_models/content_types.py,sha256=36Ga-iGf4ivCqgtXC7imFgegrwHB117s9eqP62JtGv0,3456
|
|
5
|
-
nucliadb_models/conversation.py,sha256=
|
|
6
|
+
nucliadb_models/conversation.py,sha256=k9bKhkDiqhqmdrDfDPNoUfG7-2H_-KAyuOnETd8zV0E,5081
|
|
6
7
|
nucliadb_models/entities.py,sha256=i-7Y8qmFRRTih5zw0ajv1U_iiXexe66M3TK8hUikQZk,2356
|
|
7
8
|
nucliadb_models/export_import.py,sha256=mNm9IArOLnC6TLupkwqVFhxD5d08mpIVOVFneECv8UA,1073
|
|
8
|
-
nucliadb_models/external_index_providers.py,sha256=
|
|
9
|
+
nucliadb_models/external_index_providers.py,sha256=pL3leo4MkuJOnKlU1Sg6GT_mnK_VUBxGui-RPmDYVWU,1126
|
|
9
10
|
nucliadb_models/extracted.py,sha256=Owz7LC3le3Dvau3TtRiO8NY84meOf6IxN-RrOqqpMPs,5593
|
|
10
11
|
nucliadb_models/file.py,sha256=tXtgB9c7i2ADsnJ7HdbXyroAmXadGvOeA49htBh7BZo,2263
|
|
11
12
|
nucliadb_models/filters.py,sha256=NQI2-4AFzzJuZy8NeY3jXlTbbU5wxiwMCP-5DrD-7lE,14759
|
|
13
|
+
nucliadb_models/hydration.py,sha256=SlAzraJE6DX0uOpZWxu2k_9-ikYorsj0t8xwsWSBQZY,14363
|
|
12
14
|
nucliadb_models/labels.py,sha256=9zqRgkpZuX3kUPwsTTgCH7JyOWK7dM5pwyuHJR86YdU,3949
|
|
13
15
|
nucliadb_models/link.py,sha256=PF5hHLwdOed5TMBTxtokkgWtMh1bFnORZjybh0NwVCw,2526
|
|
14
|
-
nucliadb_models/metadata.py,sha256=
|
|
16
|
+
nucliadb_models/metadata.py,sha256=OOKGy_83NtlG1QKQZEwMuwu4wbVEe7P30Y2QvnGSDto,8933
|
|
15
17
|
nucliadb_models/notifications.py,sha256=mna8-AoD_29Wds0Thl0AF0zpERnJmYGLZX1w1fUopMY,4036
|
|
16
18
|
nucliadb_models/processing.py,sha256=nhKuHQjqCdb9zJVkYGPTLub23tK9e_lwL5OCDVymZjY,719
|
|
17
19
|
nucliadb_models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
20
|
nucliadb_models/resource.py,sha256=RzCos0QRgSMkaV-p7EoceSmt7UTzt9G9be5BKF-iGrQ,9021
|
|
19
|
-
nucliadb_models/search.py,sha256=
|
|
21
|
+
nucliadb_models/search.py,sha256=_vn3pDXcK4iwiCfim3BtlD5EaQAeXoxl2IfNDsrKesA,97514
|
|
20
22
|
nucliadb_models/security.py,sha256=opxaDLfvk3aU0sjesK0jGrYLx5h4YCwlKKN0moYs_ig,1150
|
|
21
23
|
nucliadb_models/synonyms.py,sha256=afbaVqSQSxGLwi2PusVaLSRpkOtA5AZmWOKd1f4nl2E,690
|
|
22
24
|
nucliadb_models/text.py,sha256=60bxZnOjRHnDdezR8VfR3AZsXTOwePFPs2BKB8wxBak,3414
|
|
@@ -32,7 +34,7 @@ nucliadb_models/graph/responses.py,sha256=Sdq8OgFAL1YT-1lJyLLrkqcScvj7YTEqAUwQ-k
|
|
|
32
34
|
nucliadb_models/internal/__init__.py,sha256=zG33bUz1rHFPtvqQPWn4rDwBJt3FJodGuQYD45quiQg,583
|
|
33
35
|
nucliadb_models/internal/predict.py,sha256=Pnx6MmLfK65eExe1XnVxqmSlvMwdowewwks9BOEoqMw,2029
|
|
34
36
|
nucliadb_models/internal/shards.py,sha256=__y1OZtWGiNcPQEWfSFOj8yw458WGi7mM4vZe0K-L1Y,1691
|
|
35
|
-
nucliadb_models-6.
|
|
36
|
-
nucliadb_models-6.
|
|
37
|
-
nucliadb_models-6.
|
|
38
|
-
nucliadb_models-6.
|
|
37
|
+
nucliadb_models-6.9.5.post5452.dist-info/METADATA,sha256=B4hcUdlQ5Uk9ik2FUjSkT6ZQHyby87Q7eYlGGMx8wkI,745
|
|
38
|
+
nucliadb_models-6.9.5.post5452.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
39
|
+
nucliadb_models-6.9.5.post5452.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
|
|
40
|
+
nucliadb_models-6.9.5.post5452.dist-info/RECORD,,
|
|
File without changes
|
{nucliadb_models-6.8.1.post4983.dist-info → nucliadb_models-6.9.5.post5452.dist-info}/top_level.txt
RENAMED
|
File without changes
|