nucliadb-models 6.6.1.post4642__py3-none-any.whl → 6.9.3.post5295__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb-models might be problematic. Click here for more details.
- nucliadb_models/common.py +9 -0
- nucliadb_models/configuration.py +1 -1
- nucliadb_models/content_types.py +1 -0
- nucliadb_models/conversation.py +42 -9
- nucliadb_models/external_index_providers.py +5 -19
- nucliadb_models/extracted.py +0 -25
- nucliadb_models/file.py +8 -0
- nucliadb_models/hydration.py +424 -0
- nucliadb_models/link.py +8 -0
- nucliadb_models/metadata.py +18 -3
- nucliadb_models/resource.py +1 -0
- nucliadb_models/search.py +131 -31
- nucliadb_models/text.py +8 -0
- nucliadb_models/writer.py +1 -1
- {nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/METADATA +2 -3
- {nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/RECORD +18 -17
- {nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/WHEEL +0 -0
- {nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/top_level.txt +0 -0
nucliadb_models/common.py
CHANGED
|
@@ -203,6 +203,15 @@ class FieldTypeName(str, Enum):
|
|
|
203
203
|
"a": FieldTypeName.GENERIC,
|
|
204
204
|
}[abbr]
|
|
205
205
|
|
|
206
|
+
def abbreviation(self) -> str:
|
|
207
|
+
return {
|
|
208
|
+
FieldTypeName.TEXT: "t",
|
|
209
|
+
FieldTypeName.FILE: "f",
|
|
210
|
+
FieldTypeName.LINK: "u",
|
|
211
|
+
FieldTypeName.CONVERSATION: "c",
|
|
212
|
+
FieldTypeName.GENERIC: "a",
|
|
213
|
+
}[self]
|
|
214
|
+
|
|
206
215
|
|
|
207
216
|
class FieldRef(BaseModel):
|
|
208
217
|
field_type: FieldTypeName
|
nucliadb_models/configuration.py
CHANGED
|
@@ -44,7 +44,7 @@ def _model_fields(model: type[BaseModel], skip: list[str]) -> dict[str, Any]:
|
|
|
44
44
|
}
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
# FindConfig is a
|
|
47
|
+
# FindConfig is a FindRequest without `search_configuration`
|
|
48
48
|
FindConfig = create_model("FindConfig", **_model_fields(FindRequest, skip=["search_configuration"]))
|
|
49
49
|
|
|
50
50
|
|
nucliadb_models/content_types.py
CHANGED
nucliadb_models/conversation.py
CHANGED
|
@@ -79,39 +79,72 @@ class FieldConversation(BaseModel):
|
|
|
79
79
|
size: Optional[int] = None
|
|
80
80
|
total: Optional[int] = None
|
|
81
81
|
extract_strategy: Optional[str] = None
|
|
82
|
+
split_strategy: Optional[str] = None
|
|
82
83
|
|
|
83
84
|
|
|
84
85
|
# Creation and update classes (Those used on writer endpoints)
|
|
85
86
|
|
|
86
87
|
|
|
87
88
|
class InputMessageContent(BaseModel):
|
|
88
|
-
text: str
|
|
89
|
+
text: str = Field()
|
|
89
90
|
format: MessageFormat = MessageFormat.PLAIN
|
|
90
|
-
attachments: List[FileB64] = []
|
|
91
|
-
attachments_fields: List[FieldRef] = []
|
|
91
|
+
attachments: List[FileB64] = Field(default=[], max_length=50)
|
|
92
|
+
attachments_fields: List[FieldRef] = Field(default=[], max_length=50)
|
|
92
93
|
|
|
93
94
|
|
|
94
95
|
class InputMessage(BaseModel):
|
|
95
|
-
timestamp: Optional[datetime] =
|
|
96
|
-
|
|
97
|
-
|
|
96
|
+
timestamp: Optional[datetime] = Field(
|
|
97
|
+
default=None, description="Time at which the message was sent, in ISO 8601 format."
|
|
98
|
+
)
|
|
99
|
+
who: Optional[str] = Field(
|
|
100
|
+
default=None, description="Sender of the message, e.g. 'user' or 'assistant'"
|
|
101
|
+
)
|
|
102
|
+
to: List[str] = Field(
|
|
103
|
+
default_factory=list,
|
|
104
|
+
description="List of recipients of the message, e.g. ['assistant'] or ['user']",
|
|
105
|
+
max_length=100,
|
|
106
|
+
)
|
|
98
107
|
content: InputMessageContent
|
|
99
|
-
ident: str
|
|
108
|
+
ident: str = Field(
|
|
109
|
+
description="Unique identifier for the message. Must be unique within the conversation.",
|
|
110
|
+
max_length=128,
|
|
111
|
+
)
|
|
100
112
|
type_: Optional[MessageType] = Field(None, alias="type")
|
|
101
113
|
|
|
102
114
|
@field_validator("ident", mode="after")
|
|
103
115
|
@classmethod
|
|
104
|
-
def
|
|
116
|
+
def validate_ident(cls, value: str) -> str:
|
|
105
117
|
# The split value "0" is reserved by learning
|
|
106
118
|
# Used to mark questions to override in the QA agent
|
|
107
119
|
if value == "0":
|
|
108
120
|
raise ValueError('Message ident cannot be "0"')
|
|
121
|
+
# Ident cannot contain "/" as it is used in the text
|
|
122
|
+
# block match ids (paragraph ids)
|
|
123
|
+
if "/" in value:
|
|
124
|
+
raise ValueError('Message ident cannot contain "/"')
|
|
109
125
|
return value
|
|
110
126
|
|
|
111
127
|
|
|
112
128
|
class InputConversationField(BaseModel):
|
|
113
|
-
messages: List[InputMessage] =
|
|
129
|
+
messages: List[InputMessage] = Field(
|
|
130
|
+
default_factory=list,
|
|
131
|
+
description="List of messages in the conversation field. Each message must have a unique ident. A single conversation can contain up to 51,200 messages. You can add up to 2,048 messages per request.",
|
|
132
|
+
)
|
|
114
133
|
extract_strategy: Optional[str] = Field(
|
|
115
134
|
default=None,
|
|
116
135
|
description="Id of the Nuclia extract strategy used at processing time. If not set, the default strategy was used. Extract strategies are defined at the learning configuration api.",
|
|
117
136
|
)
|
|
137
|
+
split_strategy: Optional[str] = Field(
|
|
138
|
+
default=None,
|
|
139
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
@field_validator("messages", mode="after")
|
|
143
|
+
@classmethod
|
|
144
|
+
def idents_are_unique(cls, value: List[InputMessage]) -> List[InputMessage]:
|
|
145
|
+
seen_idents = set()
|
|
146
|
+
for message in value:
|
|
147
|
+
if message.ident in seen_idents:
|
|
148
|
+
raise ValueError(f'Message ident "{message.ident}" is not unique')
|
|
149
|
+
seen_idents.add(message.ident)
|
|
150
|
+
return value
|
|
@@ -22,32 +22,18 @@ from pydantic import BaseModel
|
|
|
22
22
|
class ExternalIndexProviderType(str, Enum):
|
|
23
23
|
"""
|
|
24
24
|
Enum for the different external index providers.
|
|
25
|
-
For now
|
|
25
|
+
For now none are supported, but we may add some in the future.
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
|
-
|
|
28
|
+
UNSET = "unset"
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class ExternalIndexProviderBase(BaseModel):
|
|
32
32
|
type: ExternalIndexProviderType
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
class
|
|
36
|
-
|
|
37
|
-
List of cloud providers supported by Pinecone serverless vector database.
|
|
38
|
-
"""
|
|
39
|
-
|
|
40
|
-
AWS_US_EAST_1 = "aws_us_east_1"
|
|
41
|
-
AWS_US_WEST_2 = "aws_us_west_2"
|
|
42
|
-
AWS_EU_WEST_1 = "aws_eu_west_1"
|
|
43
|
-
GCP_US_CENTRAL1 = "gcp_us_central1"
|
|
44
|
-
AZURE_EASTUS2 = "azure_eastus2"
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
class PineconeIndexProvider(ExternalIndexProviderBase):
|
|
48
|
-
type: ExternalIndexProviderType = ExternalIndexProviderType.PINECONE
|
|
49
|
-
api_key: str
|
|
50
|
-
serverless_cloud: PineconeServerlessCloud
|
|
35
|
+
class DummyIndexProvider(ExternalIndexProviderBase):
|
|
36
|
+
type: ExternalIndexProviderType = ExternalIndexProviderType.UNSET
|
|
51
37
|
|
|
52
38
|
|
|
53
|
-
ExternalIndexProvider = Union[
|
|
39
|
+
ExternalIndexProvider = Union[DummyIndexProvider,]
|
nucliadb_models/extracted.py
CHANGED
|
@@ -19,9 +19,7 @@ from pydantic import BaseModel
|
|
|
19
19
|
|
|
20
20
|
from .common import (
|
|
21
21
|
Classification,
|
|
22
|
-
CloudFile,
|
|
23
22
|
CloudLink,
|
|
24
|
-
FieldID,
|
|
25
23
|
Paragraph,
|
|
26
24
|
QuestionAnswers,
|
|
27
25
|
)
|
|
@@ -34,12 +32,6 @@ class ExtractedText(BaseModel):
|
|
|
34
32
|
deleted_splits: Optional[List[str]] = None
|
|
35
33
|
|
|
36
34
|
|
|
37
|
-
class ExtractedTextWrapper(BaseModel):
|
|
38
|
-
body: Optional[ExtractedText] = None
|
|
39
|
-
file: Optional[CloudFile] = None
|
|
40
|
-
field: Optional[FieldID] = None
|
|
41
|
-
|
|
42
|
-
|
|
43
35
|
class Vector(BaseModel):
|
|
44
36
|
start: Optional[int] = None
|
|
45
37
|
end: Optional[int] = None
|
|
@@ -58,12 +50,6 @@ class VectorObject(BaseModel):
|
|
|
58
50
|
deleted_splits: Optional[List[str]] = None
|
|
59
51
|
|
|
60
52
|
|
|
61
|
-
class ExtractedVectorsWrapper(BaseModel):
|
|
62
|
-
vectors: Optional[VectorObject] = None
|
|
63
|
-
file: Optional[CloudFile] = None
|
|
64
|
-
field: Optional[FieldID] = None
|
|
65
|
-
|
|
66
|
-
|
|
67
53
|
class Position(BaseModel):
|
|
68
54
|
start: int
|
|
69
55
|
end: int
|
|
@@ -113,11 +99,6 @@ class FieldComputedMetadata(BaseModel):
|
|
|
113
99
|
deleted_splits: Optional[List[str]] = None
|
|
114
100
|
|
|
115
101
|
|
|
116
|
-
class FieldComputedMetadataWrapper(BaseModel):
|
|
117
|
-
metadata: Optional[FieldComputedMetadata] = None
|
|
118
|
-
field: Optional[FieldID] = None
|
|
119
|
-
|
|
120
|
-
|
|
121
102
|
class Entity(BaseModel):
|
|
122
103
|
token: Optional[str] = None
|
|
123
104
|
root: Optional[str] = None
|
|
@@ -135,12 +116,6 @@ class LargeComputedMetadata(BaseModel):
|
|
|
135
116
|
deleted_splits: Optional[List[str]] = None
|
|
136
117
|
|
|
137
118
|
|
|
138
|
-
class LargeComputedMetadataWrapper(BaseModel):
|
|
139
|
-
real: Optional[LargeComputedMetadata] = None
|
|
140
|
-
file: Optional[CloudFile] = None
|
|
141
|
-
field: Optional[FieldID] = None
|
|
142
|
-
|
|
143
|
-
|
|
144
119
|
class LinkExtractedData(BaseModel):
|
|
145
120
|
date: Optional[datetime] = None
|
|
146
121
|
language: Optional[str] = None
|
nucliadb_models/file.py
CHANGED
|
@@ -35,6 +35,10 @@ class FieldFile(BaseModel):
|
|
|
35
35
|
default=None,
|
|
36
36
|
description="Id of the Nuclia extract strategy used at processing time. If not set, the default strategy was used. Extract strategies are defined at the learning configuration api.",
|
|
37
37
|
)
|
|
38
|
+
split_strategy: Optional[str] = Field(
|
|
39
|
+
default=None,
|
|
40
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
41
|
+
)
|
|
38
42
|
|
|
39
43
|
|
|
40
44
|
# Creation and update classes (Those used on writer endpoints)
|
|
@@ -48,3 +52,7 @@ class FileField(BaseModel):
|
|
|
48
52
|
default=None,
|
|
49
53
|
description="Id of the Nuclia extract strategy to use at processing time. If not set, the default strategy will be used. Extract strategies are defined at the learning configuration api.",
|
|
50
54
|
)
|
|
55
|
+
split_strategy: Optional[str] = Field(
|
|
56
|
+
default=None,
|
|
57
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
58
|
+
)
|
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
# Copyright 2025 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
#
|
|
15
|
+
from typing import Annotated, Optional, Union
|
|
16
|
+
|
|
17
|
+
from pydantic import BaseModel, Field, StringConstraints
|
|
18
|
+
|
|
19
|
+
from nucliadb_models.common import FieldTypeName
|
|
20
|
+
from nucliadb_models.metadata import Origin
|
|
21
|
+
from nucliadb_models.resource import FieldConversation, FieldFile, FieldLink, FieldText
|
|
22
|
+
from nucliadb_models.search import Image
|
|
23
|
+
from nucliadb_models.security import ResourceSecurity
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ResourceHydration(BaseModel, extra="forbid"):
|
|
27
|
+
title: bool = Field(
|
|
28
|
+
default=True,
|
|
29
|
+
description="Hydrate resource titles",
|
|
30
|
+
)
|
|
31
|
+
summary: bool = Field(
|
|
32
|
+
default=False,
|
|
33
|
+
description="Hydrate resource summaries",
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
origin: bool = Field(
|
|
37
|
+
default=False,
|
|
38
|
+
description="Hydrate resource origin",
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
security: bool = Field(
|
|
42
|
+
default=False,
|
|
43
|
+
description="Hydrate resource security metadata",
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class TextFieldHydration(BaseModel, extra="forbid"):
|
|
48
|
+
value: bool = Field(
|
|
49
|
+
default=False,
|
|
50
|
+
description="Hydrate text field values. Field values are similar payloads to the ones used to create them",
|
|
51
|
+
)
|
|
52
|
+
extracted_text: bool = Field(
|
|
53
|
+
default=False,
|
|
54
|
+
description="Hydrate extracted text for text fields",
|
|
55
|
+
)
|
|
56
|
+
# TODO: what else should be interesting to add?
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class FileFieldHydration(BaseModel, extra="forbid"):
|
|
60
|
+
value: bool = Field(
|
|
61
|
+
default=False,
|
|
62
|
+
description="Hydrate file field values. Field values are similar payloads to the ones used to create them",
|
|
63
|
+
)
|
|
64
|
+
extracted_text: bool = Field(
|
|
65
|
+
default=False,
|
|
66
|
+
description="Hydrate extracted text for file fields",
|
|
67
|
+
)
|
|
68
|
+
# TODO: what else should be interesting to add?
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class LinkFieldHydration(BaseModel, extra="forbid"):
|
|
72
|
+
value: bool = Field(
|
|
73
|
+
default=False,
|
|
74
|
+
description="Hydrate link field values. Field values are similar payloads to the ones used to create them",
|
|
75
|
+
)
|
|
76
|
+
extracted_text: bool = Field(
|
|
77
|
+
default=False,
|
|
78
|
+
description="Hydrate extracted text for link fields",
|
|
79
|
+
)
|
|
80
|
+
# TODO: what else should be interesting to add?
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class ConversationFieldHydration(BaseModel, extra="forbid"):
|
|
84
|
+
value: bool = Field(
|
|
85
|
+
default=False,
|
|
86
|
+
description="Hydrate conversation field values. Field values are similar payloads to the ones used to create them",
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
# TODO: add fields to hydrate conversation fields. Think about how to handle
|
|
90
|
+
# splits and fulfill the conversational RAG strategies
|
|
91
|
+
|
|
92
|
+
# TODO: what else should be interesting to add?
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class GenericFieldHydration(BaseModel, extra="forbid"):
|
|
96
|
+
value: bool = Field(
|
|
97
|
+
default=False,
|
|
98
|
+
description="Hydrate generic field values. Field values are similar payloads to the ones used to create them",
|
|
99
|
+
)
|
|
100
|
+
extracted_text: bool = Field(
|
|
101
|
+
default=False,
|
|
102
|
+
description="Hydrate extracted text for generic fields",
|
|
103
|
+
)
|
|
104
|
+
# TODO: what else should be interesting to add?
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class FieldHydration(BaseModel, extra="forbid"):
|
|
108
|
+
text: Optional[TextFieldHydration] = Field(
|
|
109
|
+
default_factory=TextFieldHydration,
|
|
110
|
+
description="Text fields hydration options",
|
|
111
|
+
)
|
|
112
|
+
file: Optional[FileFieldHydration] = Field(
|
|
113
|
+
default_factory=FileFieldHydration,
|
|
114
|
+
description="File fields hydration options",
|
|
115
|
+
)
|
|
116
|
+
link: Optional[LinkFieldHydration] = Field(
|
|
117
|
+
default_factory=LinkFieldHydration,
|
|
118
|
+
description="Link fields hydration options",
|
|
119
|
+
)
|
|
120
|
+
conversation: Optional[ConversationFieldHydration] = Field(
|
|
121
|
+
default_factory=ConversationFieldHydration,
|
|
122
|
+
description="Conversation fields hydration options",
|
|
123
|
+
)
|
|
124
|
+
generic: Optional[GenericFieldHydration] = Field(
|
|
125
|
+
default_factory=GenericFieldHydration,
|
|
126
|
+
description="Generic fields hydration options",
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class NeighbourParagraphHydration(BaseModel, extra="forbid"):
|
|
131
|
+
before: int = Field(
|
|
132
|
+
default=2,
|
|
133
|
+
ge=0,
|
|
134
|
+
description="Number of previous paragraphs to hydrate",
|
|
135
|
+
)
|
|
136
|
+
after: int = Field(
|
|
137
|
+
default=2,
|
|
138
|
+
ge=0,
|
|
139
|
+
description="Number of following paragraphs to hydrate",
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class RelatedParagraphHydration(BaseModel, extra="forbid"):
|
|
144
|
+
neighbours: Optional[NeighbourParagraphHydration] = Field(
|
|
145
|
+
default=None,
|
|
146
|
+
description="Hydrate extra paragraphs that surround the original one",
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# TODO: FEATURE: implement related paragraphs by page
|
|
150
|
+
# page: bool = Field(
|
|
151
|
+
# default=False,
|
|
152
|
+
# description="Hydrate all paragraphs in the same page. This only applies to fields with pages",
|
|
153
|
+
# )
|
|
154
|
+
|
|
155
|
+
# TODO: description
|
|
156
|
+
# XXX: should we let users control the amount of elements?
|
|
157
|
+
parents: bool = False
|
|
158
|
+
# TODO: description
|
|
159
|
+
# XXX: should we let users control the amount of elements?
|
|
160
|
+
siblings: bool = False
|
|
161
|
+
# TODO: description
|
|
162
|
+
# XXX: should we let users control the amount of elements?
|
|
163
|
+
replacements: bool = False
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
class ImageParagraphHydration(BaseModel, extra="forbid"):
|
|
167
|
+
# The source image is also known as reference or reference_file in the
|
|
168
|
+
# paragraph context. The reference/reference_file is the filename of the
|
|
169
|
+
# source image from which the paragraph has been extracted
|
|
170
|
+
source_image: bool = Field(
|
|
171
|
+
default=False,
|
|
172
|
+
description=(
|
|
173
|
+
"When a paragraph has been extracted from an image (using OCR, inception...), "
|
|
174
|
+
"hydrate the image that represents it"
|
|
175
|
+
),
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class TableParagraphHydration(BaseModel, extra="forbid"):
|
|
180
|
+
# TODO: implement. ARAG uses the label "/k/table" to check whether a
|
|
181
|
+
# paragraph is a table or not. We can also use info on maindb
|
|
182
|
+
table_page_preview: bool = Field(
|
|
183
|
+
default=False,
|
|
184
|
+
description="Hydrate the page preview for the table. This will only hydrate fields with pages",
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
class ParagraphPageHydration(BaseModel, extra="forbid"):
|
|
189
|
+
# For some field types (file and link) learning generates previews. A
|
|
190
|
+
# preview is a PDF file representing the content. For a docx for example, is
|
|
191
|
+
# the PDF equivalent. Depending on the field type, the preview can
|
|
192
|
+
# represent, for example, a page in a document or a portion of a webpage.
|
|
193
|
+
page_with_visual: bool = Field(
|
|
194
|
+
default=False,
|
|
195
|
+
description=(
|
|
196
|
+
"When a paragraph has been extracted from a page containing visual "
|
|
197
|
+
"content (images, tables...), hydrate the preview of the paragraph's "
|
|
198
|
+
"page as an image. Not all field types have previews nor visual content"
|
|
199
|
+
),
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
class ParagraphHydration(BaseModel, extra="forbid"):
    # Request options selecting what to hydrate for each paragraph.
    # Unknown keys are rejected (extra="forbid").
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    text: bool = Field(
        default=True,
        description="Hydrate paragraph text",
    )
    image: Optional[ImageParagraphHydration] = Field(
        default=None,
        description="Hydrate options for paragraphs extracted from images (using OCR, inception...)",
    )
    table: Optional[TableParagraphHydration] = Field(
        default=None,
        description="Hydrate options for paragraphs extracted from tables",
    )

    # TODO: at some point, we should add hydration options for paragraphs from
    # audio and video

    page: Optional[ParagraphPageHydration] = Field(
        default=None,
        # Fixed typo ("Hydrte"): this text is emitted in the generated schema.
        description="Hydrate options for paragraphs within a page. This applies to paragraphs in fields with pages",
    )

    related: Optional[RelatedParagraphHydration] = Field(
        default=None,
        description="Hydration options for related paragraphs. For example, neighbours or sibling paragraphs",
    )
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
class Hydration(BaseModel, extra="forbid"):
|
|
232
|
+
resource: Optional[ResourceHydration] = Field(
|
|
233
|
+
default_factory=ResourceHydration,
|
|
234
|
+
description="Resource hydration options",
|
|
235
|
+
)
|
|
236
|
+
field: FieldHydration = Field(
|
|
237
|
+
default_factory=FieldHydration,
|
|
238
|
+
description="Field hydration options",
|
|
239
|
+
)
|
|
240
|
+
paragraph: ParagraphHydration = Field(
|
|
241
|
+
default_factory=ParagraphHydration,
|
|
242
|
+
description="Paragraph hydration options",
|
|
243
|
+
)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
ParagraphId = Annotated[
|
|
247
|
+
str,
|
|
248
|
+
StringConstraints(
|
|
249
|
+
pattern=r"^[0-9a-f]{32}/[acftu]/[a-zA-Z0-9:_-]+/[0-9]+-[0-9]+$",
|
|
250
|
+
min_length=32 + 1 + 1 + 1 + 1 + 1 + 3,
|
|
251
|
+
# max field id of 250 and 10 digit paragraphs. More than enough
|
|
252
|
+
max_length=32 + 1 + 1 + 1 + 250 + 1 + 21,
|
|
253
|
+
),
|
|
254
|
+
]
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
class HydrateRequest(BaseModel, extra="forbid"):
|
|
258
|
+
data: list[ParagraphId] = Field(
|
|
259
|
+
description="List of paragraph ids we want to hydrate",
|
|
260
|
+
max_length=50,
|
|
261
|
+
)
|
|
262
|
+
hydration: Hydration = Field(description="Description of how hydration must be performed")
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
### Response models
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class HydratedResource(BaseModel, extra="forbid"):
|
|
269
|
+
id: str = Field(description="Unique resource id")
|
|
270
|
+
slug: str = Field(description="Resource slug")
|
|
271
|
+
|
|
272
|
+
title: Optional[str] = None
|
|
273
|
+
summary: Optional[str] = None
|
|
274
|
+
|
|
275
|
+
origin: Optional[Origin] = None
|
|
276
|
+
|
|
277
|
+
security: Optional[ResourceSecurity] = None
|
|
278
|
+
|
|
279
|
+
# TODO: add resource labels to hydrated resources
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
class FieldExtractedData(BaseModel, extra="forbid"):
|
|
283
|
+
text: Optional[str] = None
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
class SplitFieldExtractedData(BaseModel, extra="forbid"):
|
|
287
|
+
texts: Optional[dict[str, str]] = None
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
class HydratedTextField(BaseModel, extra="forbid"):
    # Response model for a hydrated text field.
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    # The label must be passed as `description=`: the first positional
    # argument of `Field` is the default value, so `Field("Unique field id")`
    # made the field optional with that sentence as its value.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.TEXT

    # Both payloads are optional and default to None.
    value: Optional[FieldText] = None
    extracted: Optional[FieldExtractedData] = None
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
class HydratedFileField(BaseModel, extra="forbid"):
    # Response model for a hydrated file field.
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    # The label must be passed as `description=`: the first positional
    # argument of `Field` is the default value, so `Field("Unique field id")`
    # made the field optional with that sentence as its value.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.FILE

    # Both payloads are optional and default to None.
    value: Optional[FieldFile] = None
    extracted: Optional[FieldExtractedData] = None

    previews: Optional[dict[str, Image]] = Field(
        default=None,
        title="Previews of specific parts of the field",
        # Adjacent string literals are concatenated verbatim, so every piece
        # except the last ends with a space to keep the sentences readable.
        description=(
            "Previews for specific pages of this field. Previews are different "
            "depending on the file type. For example, for a PDF file, a preview "
            "will be an image of a single page. "
            "In this field, previews will be populated according to the hydration "
            "options requested."
        ),
    )
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
class HydratedLinkField(BaseModel, extra="forbid"):
    # Response model for a hydrated link field.
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    # The label must be passed as `description=`: the first positional
    # argument of `Field` is the default value, so `Field("Unique field id")`
    # made the field optional with that sentence as its value.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.LINK

    # Both payloads are optional and default to None.
    value: Optional[FieldLink] = None
    extracted: Optional[FieldExtractedData] = None
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
class HydratedConversationField(BaseModel, extra="forbid"):
    # Response model for a hydrated conversation field.
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    # The label must be passed as `description=`: the first positional
    # argument of `Field` is the default value, so `Field("Unique field id")`
    # made the field optional with that sentence as its value.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    field_type: FieldTypeName = FieldTypeName.CONVERSATION

    # Both payloads are optional and default to None.
    value: Optional[FieldConversation] = None
    extracted: Optional[FieldExtractedData] = None
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
class HydratedGenericField(BaseModel, extra="forbid"):
    # Response model for a hydrated generic field.
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    # The label must be passed as `description=`: the first positional
    # argument of `Field` is the default value, so `Field("Unique field id")`
    # made the field optional with that sentence as its value.
    id: str = Field(description="Unique field id")
    resource: str = Field(description="Field resource id")
    # Was FieldTypeName.TEXT (copy-paste from HydratedTextField), which made
    # generic fields report themselves as text fields by default.
    field_type: FieldTypeName = FieldTypeName.GENERIC

    # Unlike the other hydrated field models, the value here is a plain string.
    value: Optional[str] = None
    extracted: Optional[FieldExtractedData] = None
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
class RelatedNeighbourParagraphRefs(BaseModel, extra="forbid"):
|
|
348
|
+
before: Optional[list[str]] = None
|
|
349
|
+
after: Optional[list[str]] = None
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
class RelatedParagraphRefs(BaseModel, extra="forbid"):
|
|
353
|
+
neighbours: Optional[RelatedNeighbourParagraphRefs] = None
|
|
354
|
+
parents: Optional[list[str]] = None
|
|
355
|
+
siblings: Optional[list[str]] = None
|
|
356
|
+
replacements: Optional[list[str]] = None
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
class HydratedParagraphImage(BaseModel, extra="forbid"):
|
|
360
|
+
source_image: Optional[Image] = Field(
|
|
361
|
+
default=None,
|
|
362
|
+
description=(
|
|
363
|
+
"Source image for this paragraph. This only applies to paragraphs "
|
|
364
|
+
"extracted from an image using OCR or inception, and if this "
|
|
365
|
+
"hydration option has been enabled in the request"
|
|
366
|
+
),
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
class HydratedParagraphTable(BaseModel, extra="forbid"):
    # Table-specific data attached to a hydrated paragraph.
    # NOTE: kept as `#` comments rather than a docstring, since pydantic
    # publishes class docstrings as the model description in the JSON schema.

    page_preview_ref: Optional[str] = Field(
        default=None,
        # Fixed typo ("Referento to"): this text is emitted in the schema.
        description=(
            "Reference to the page preview for this paragraph. The actual "
            "preview will be found in the previews of its field. This only "
            "applies to paragraphs generated from a table and if the "
            "corresponding hydration option has been enabled in the request"
        ),
    )
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
class HydratedParagraphPage(BaseModel, extra="forbid"):
|
|
383
|
+
page_preview_ref: Optional[str] = Field(
|
|
384
|
+
default=None,
|
|
385
|
+
description=(
|
|
386
|
+
"Reference to the page preview for this paragraph. The actual "
|
|
387
|
+
"preview will be found in the previews of its field. This only "
|
|
388
|
+
"applies to paragraphs extracted from a page containing visual "
|
|
389
|
+
"content and if the corresponding hydration option has been enabled "
|
|
390
|
+
"in the request"
|
|
391
|
+
),
|
|
392
|
+
)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
class HydratedParagraph(BaseModel, extra="forbid"):
|
|
396
|
+
id: str = Field(description="Unique paragraph id")
|
|
397
|
+
field: str = Field(description="Paragraph field id")
|
|
398
|
+
resource: str = Field(description="Paragraph resource id")
|
|
399
|
+
|
|
400
|
+
text: Optional[str] = None
|
|
401
|
+
|
|
402
|
+
# TODO: add labels to hydrated paragraphs
|
|
403
|
+
# labels: Optional[list[str]] = None
|
|
404
|
+
|
|
405
|
+
related: Optional[RelatedParagraphRefs] = None
|
|
406
|
+
|
|
407
|
+
image: Optional[HydratedParagraphImage] = None
|
|
408
|
+
table: Optional[HydratedParagraphTable] = None
|
|
409
|
+
page: Optional[HydratedParagraphPage] = None
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
class Hydrated(BaseModel, extra="forbid"):
|
|
413
|
+
resources: dict[str, HydratedResource]
|
|
414
|
+
fields: dict[
|
|
415
|
+
str,
|
|
416
|
+
Union[
|
|
417
|
+
HydratedTextField,
|
|
418
|
+
HydratedFileField,
|
|
419
|
+
HydratedLinkField,
|
|
420
|
+
HydratedConversationField,
|
|
421
|
+
HydratedGenericField,
|
|
422
|
+
],
|
|
423
|
+
]
|
|
424
|
+
paragraphs: dict[str, HydratedParagraph]
|
nucliadb_models/link.py
CHANGED
|
@@ -37,6 +37,10 @@ class FieldLink(BaseModel):
|
|
|
37
37
|
default=None,
|
|
38
38
|
description="Id of the Nuclia extract strategy used at processing time. If not set, the default strategy was used. Extract strategies are defined at the learning configuration api.",
|
|
39
39
|
)
|
|
40
|
+
split_strategy: Optional[str] = Field(
|
|
41
|
+
default=None,
|
|
42
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
43
|
+
)
|
|
40
44
|
|
|
41
45
|
|
|
42
46
|
# Creation and update classes (Those used on writer endpoints)
|
|
@@ -54,3 +58,7 @@ class LinkField(BaseModel):
|
|
|
54
58
|
default=None,
|
|
55
59
|
description="Id of the Nuclia extract strategy to use at processing time. If not set, the default strategy will be used. Extract strategies are defined at the learning configuration api.",
|
|
56
60
|
)
|
|
61
|
+
split_strategy: Optional[str] = Field(
|
|
62
|
+
default=None,
|
|
63
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
64
|
+
)
|
nucliadb_models/metadata.py
CHANGED
|
@@ -17,7 +17,7 @@ from datetime import datetime
|
|
|
17
17
|
from enum import Enum
|
|
18
18
|
from typing import Any, Dict, List, Optional
|
|
19
19
|
|
|
20
|
-
from pydantic import BaseModel, Field, model_validator
|
|
20
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
21
21
|
from typing_extensions import Self
|
|
22
22
|
|
|
23
23
|
from nucliadb_models.utils import DateTime
|
|
@@ -231,15 +231,24 @@ class InputOrigin(BaseModel):
|
|
|
231
231
|
default=[],
|
|
232
232
|
title="Tags",
|
|
233
233
|
description="Resource tags about the origin system. It can later be used for filtering on search endpoints with '/origin.tags/{tag}'",
|
|
234
|
+
max_length=300,
|
|
234
235
|
)
|
|
235
|
-
collaborators: List[str] = []
|
|
236
|
+
collaborators: List[str] = Field(default=[], max_length=100)
|
|
236
237
|
filename: Optional[str] = None
|
|
237
|
-
related: List[str] = []
|
|
238
|
+
related: List[str] = Field(default=[], max_length=100)
|
|
238
239
|
path: Optional[str] = Field(
|
|
239
240
|
default=None,
|
|
240
241
|
description="Path of the original resource. Typically used to store folder structure information of the resource at the origin system. It can be later used for filtering on search endpoints with '/origin.path/{path}'",
|
|
242
|
+
max_length=2048,
|
|
241
243
|
)
|
|
242
244
|
|
|
245
|
+
@field_validator("tags")
@classmethod
def validate_tag_length(cls, tags: List[str]) -> List[str]:
    # Reject any origin tag longer than 512 characters.
    #
    # Returns the tags unchanged when all of them fit; raises ValueError
    # (surfaced by pydantic as a validation error) on the first tag that
    # exceeds the limit.
    for tag in tags:
        if len(tag) > 512:
            # The message previously claimed a 1024-character limit while the
            # check enforces 512; it now reports the limit actually applied.
            raise ValueError("Each tag must be at most 512 characters long")
    return tags
|
|
251
|
+
|
|
243
252
|
|
|
244
253
|
class Origin(InputOrigin):
|
|
245
254
|
# Created and modified are redefined to
|
|
@@ -247,6 +256,12 @@ class Origin(InputOrigin):
|
|
|
247
256
|
created: Optional[datetime] = None
|
|
248
257
|
modified: Optional[datetime] = None
|
|
249
258
|
|
|
259
|
+
tags: List[str] = Field(
|
|
260
|
+
default=[],
|
|
261
|
+
title="Tags",
|
|
262
|
+
description="Resource tags about the origin system. It can later be used for filtering on search endpoints with '/origin.tags/{tag}'",
|
|
263
|
+
)
|
|
264
|
+
|
|
250
265
|
class Source(Enum):
|
|
251
266
|
WEB = "WEB"
|
|
252
267
|
DESKTOP = "DESKTOP"
|
nucliadb_models/resource.py
CHANGED
nucliadb_models/search.py
CHANGED
|
@@ -79,8 +79,9 @@ ANSWER_JSON_SCHEMA_EXAMPLE = {
|
|
|
79
79
|
class ModelParamDefaults:
|
|
80
80
|
applied_autofilters = ParamDefault(
|
|
81
81
|
default=[],
|
|
82
|
-
title="
|
|
83
|
-
description="
|
|
82
|
+
title="Applied autofilters",
|
|
83
|
+
description="[deprecated] list of filters automatically applied to the search query",
|
|
84
|
+
deprecated=True,
|
|
84
85
|
)
|
|
85
86
|
|
|
86
87
|
|
|
@@ -296,6 +297,8 @@ class KnowledgeboxSearchResults(JsonBaseModel):
|
|
|
296
297
|
relations: Optional[Relations] = None
|
|
297
298
|
nodes: Optional[list[dict[str, str]]] = None
|
|
298
299
|
shards: Optional[list[str]] = None
|
|
300
|
+
|
|
301
|
+
# TODO: remove on a future major release
|
|
299
302
|
autofilters: list[str] = ModelParamDefaults.applied_autofilters.to_pydantic_field()
|
|
300
303
|
|
|
301
304
|
|
|
@@ -481,11 +484,6 @@ class SearchParamDefaults:
|
|
|
481
484
|
description="The list of facets to calculate. The facets follow the same syntax as filters: https://docs.nuclia.dev/docs/rag/advanced/search-filters", # noqa: E501
|
|
482
485
|
max_items=50,
|
|
483
486
|
)
|
|
484
|
-
autofilter = ParamDefault(
|
|
485
|
-
default=False,
|
|
486
|
-
title="Automatic search filtering",
|
|
487
|
-
description="If set to true, the search will automatically add filters to the query. For example, it will filter results containing the entities detected in the query", # noqa: E501
|
|
488
|
-
)
|
|
489
487
|
chat_query = ParamDefault(
|
|
490
488
|
default=...,
|
|
491
489
|
title="Query",
|
|
@@ -809,6 +807,11 @@ class MinScore(BaseModel):
|
|
|
809
807
|
)
|
|
810
808
|
|
|
811
809
|
|
|
810
|
+
class Image(BaseModel):
|
|
811
|
+
content_type: str
|
|
812
|
+
b64encoded: str
|
|
813
|
+
|
|
814
|
+
|
|
812
815
|
AUDIT_METADATA_MAX_BYTES = 1024 * 10 # 10KB
|
|
813
816
|
|
|
814
817
|
|
|
@@ -871,7 +874,9 @@ class BaseSearchRequest(AuditMetadataBase):
|
|
|
871
874
|
vectorset: Optional[str] = SearchParamDefaults.vectorset.to_pydantic_field()
|
|
872
875
|
with_duplicates: bool = SearchParamDefaults.with_duplicates.to_pydantic_field()
|
|
873
876
|
with_synonyms: bool = SearchParamDefaults.with_synonyms.to_pydantic_field()
|
|
874
|
-
autofilter
|
|
877
|
+
# autofilter is deprecated and its logic was removed. We're just keeping it in the model definition to
|
|
878
|
+
# avoid breaking changes in the python sdks. Please remove on a future major release.
|
|
879
|
+
autofilter: SkipJsonSchema[bool] = False
|
|
875
880
|
resource_filters: list[str] = SearchParamDefaults.resource_filters.to_pydantic_field()
|
|
876
881
|
security: Optional[RequestSecurity] = SearchParamDefaults.security.to_pydantic_field()
|
|
877
882
|
show_hidden: bool = SearchParamDefaults.show_hidden.to_pydantic_field()
|
|
@@ -902,6 +907,11 @@ Please return ONLY the question without any explanation. Just the rephrased ques
|
|
|
902
907
|
Please return ONLY the question without any explanation.""",
|
|
903
908
|
],
|
|
904
909
|
)
|
|
910
|
+
query_image: Optional[Image] = Field(
|
|
911
|
+
default=None,
|
|
912
|
+
title="Query image",
|
|
913
|
+
description="Image that will be used together with the query text for retrieval.",
|
|
914
|
+
)
|
|
905
915
|
|
|
906
916
|
@model_validator(mode="before")
|
|
907
917
|
@classmethod
|
|
@@ -953,11 +963,6 @@ class UserPrompt(BaseModel):
|
|
|
953
963
|
prompt: str
|
|
954
964
|
|
|
955
965
|
|
|
956
|
-
class Image(BaseModel):
|
|
957
|
-
content_type: str
|
|
958
|
-
b64encoded: str
|
|
959
|
-
|
|
960
|
-
|
|
961
966
|
class MaxTokens(BaseModel):
|
|
962
967
|
context: Optional[int] = Field(
|
|
963
968
|
default=None,
|
|
@@ -980,6 +985,35 @@ def parse_max_tokens(max_tokens: Optional[Union[int, MaxTokens]]) -> Optional[Ma
|
|
|
980
985
|
return max_tokens
|
|
981
986
|
|
|
982
987
|
|
|
988
|
+
class Reasoning(BaseModel):
|
|
989
|
+
display: bool = Field(
|
|
990
|
+
default=True,
|
|
991
|
+
description="Whether to display the reasoning steps in the response.",
|
|
992
|
+
)
|
|
993
|
+
effort: Literal["low", "medium", "high"] = Field(
|
|
994
|
+
default="medium",
|
|
995
|
+
description=(
|
|
996
|
+
"Level of reasoning effort. Used by OpenAI models to control the depth of reasoning. "
|
|
997
|
+
"This parameter will be automatically mapped to budget_tokens "
|
|
998
|
+
"if the chosen model does not support effort."
|
|
999
|
+
),
|
|
1000
|
+
)
|
|
1001
|
+
budget_tokens: int = Field(
|
|
1002
|
+
default=15_000,
|
|
1003
|
+
description=(
|
|
1004
|
+
"Token budget for reasoning. Used by Anthropic or Google models to limit the number of "
|
|
1005
|
+
"tokens used for reasoning. This parameter will be automatically mapped to effort "
|
|
1006
|
+
"if the chosen model does not support budget_tokens."
|
|
1007
|
+
),
|
|
1008
|
+
)
|
|
1009
|
+
|
|
1010
|
+
|
|
1011
|
+
class CitationsType(str, Enum):
|
|
1012
|
+
NONE = "none"
|
|
1013
|
+
DEFAULT = "default"
|
|
1014
|
+
LLM_FOOTNOTES = "llm_footnotes"
|
|
1015
|
+
|
|
1016
|
+
|
|
983
1017
|
class ChatModel(BaseModel):
|
|
984
1018
|
"""
|
|
985
1019
|
This is the model for the predict request payload on the chat endpoint
|
|
@@ -1011,10 +1045,16 @@ class ChatModel(BaseModel):
|
|
|
1011
1045
|
user_prompt: Optional[UserPrompt] = Field(
|
|
1012
1046
|
default=None, description="Optional custom prompt input by the user"
|
|
1013
1047
|
)
|
|
1014
|
-
citations: bool = Field(
|
|
1048
|
+
citations: Union[bool, None, CitationsType] = Field(
|
|
1049
|
+
default=None,
|
|
1050
|
+
description="Whether to include citations in the response. "
|
|
1051
|
+
"If set to None or False, no citations will be computed. "
|
|
1052
|
+
"If set to True or 'default', citations will be computed after answer generation and send as a separate `CitationsGenerativeResponse` chunk. "
|
|
1053
|
+
"If set to 'llm_footnotes', citations will be included in the LLM's response as markdown-styled footnotes. A `FootnoteCitationsGenerativeResponse` chunk will also be sent to map footnote ids to context keys in the `query_context`.",
|
|
1054
|
+
)
|
|
1015
1055
|
citation_threshold: Optional[float] = Field(
|
|
1016
1056
|
default=None,
|
|
1017
|
-
description="If citations is True, this
|
|
1057
|
+
description="If citations is set to True or 'default', this will be the similarity threshold. Value between 0 and 1, lower values will produce more citations. If not set, it will be set to the optimized threshold found by Nuclia.",
|
|
1018
1058
|
ge=0.0,
|
|
1019
1059
|
le=1.0,
|
|
1020
1060
|
)
|
|
@@ -1053,6 +1093,13 @@ class ChatModel(BaseModel):
|
|
|
1053
1093
|
default=None,
|
|
1054
1094
|
description="Seed use for the generative model for a deterministic output.",
|
|
1055
1095
|
)
|
|
1096
|
+
reasoning: Union[Reasoning, bool] = Field(
|
|
1097
|
+
default=False,
|
|
1098
|
+
description=(
|
|
1099
|
+
"Reasoning options for the generative model. "
|
|
1100
|
+
"Set to True to enable default reasoning, False to disable, or provide a Reasoning object for custom options."
|
|
1101
|
+
),
|
|
1102
|
+
)
|
|
1056
1103
|
|
|
1057
1104
|
|
|
1058
1105
|
class RephraseModel(BaseModel):
|
|
@@ -1123,7 +1170,7 @@ ALLOWED_FIELD_TYPES: dict[str, str] = {
|
|
|
1123
1170
|
"t": "text",
|
|
1124
1171
|
"f": "file",
|
|
1125
1172
|
"u": "link",
|
|
1126
|
-
"
|
|
1173
|
+
"c": "conversation",
|
|
1127
1174
|
"a": "generic",
|
|
1128
1175
|
}
|
|
1129
1176
|
|
|
@@ -1131,16 +1178,19 @@ ALLOWED_FIELD_TYPES: dict[str, str] = {
|
|
|
1131
1178
|
class FieldExtensionStrategy(RagStrategy):
|
|
1132
1179
|
name: Literal["field_extension"] = "field_extension"
|
|
1133
1180
|
fields: list[str] = Field(
|
|
1181
|
+
default=[],
|
|
1134
1182
|
title="Fields",
|
|
1135
|
-
description="List of field ids to extend the context with. It will try to extend the retrieval context with the specified fields in the matching resources. The field ids have to be in the format `{field_type}/{field_name}`, like 'a/title', 'a/summary' for title and summary fields or 't/amend' for a text field named 'amend'.",
|
|
1136
|
-
|
|
1183
|
+
description="List of field ids to extend the context with. It will try to extend the retrieval context with the specified fields in the matching resources. The field ids have to be in the format `{field_type}/{field_name}`, like 'a/title', 'a/summary' for title and summary fields or 't/amend' for a text field named 'amend'.",
|
|
1184
|
+
)
|
|
1185
|
+
data_augmentation_field_prefixes: list[str] = Field(
|
|
1186
|
+
default=[],
|
|
1187
|
+
description="List of prefixes for data augmentation added fields to extend the context with. For example, if the prefix is 'simpson', all fields that are a result of data augmentation with that prefix will be used to extend the context.",
|
|
1137
1188
|
)
|
|
1138
1189
|
|
|
1139
|
-
@
|
|
1140
|
-
|
|
1141
|
-
def fields_validator(cls, fields) -> Self:
|
|
1190
|
+
@model_validator(mode="after")
|
|
1191
|
+
def field_extension_strategy_validator(self) -> Self:
|
|
1142
1192
|
# Check that the fields are in the format {field_type}/{field_name}
|
|
1143
|
-
for field in fields:
|
|
1193
|
+
for field in self.fields:
|
|
1144
1194
|
try:
|
|
1145
1195
|
field_type, _ = field.strip("/").split("/")
|
|
1146
1196
|
except ValueError:
|
|
@@ -1153,8 +1203,7 @@ class FieldExtensionStrategy(RagStrategy):
|
|
|
1153
1203
|
f"Field '{field}' does not have a valid field type. "
|
|
1154
1204
|
f"Valid field types are: {allowed_field_types_part}."
|
|
1155
1205
|
)
|
|
1156
|
-
|
|
1157
|
-
return fields
|
|
1206
|
+
return self
|
|
1158
1207
|
|
|
1159
1208
|
|
|
1160
1209
|
class FullResourceApplyTo(BaseModel):
|
|
@@ -1192,6 +1241,7 @@ class HierarchyResourceStrategy(RagStrategy):
|
|
|
1192
1241
|
title="Count",
|
|
1193
1242
|
description="Number of extra characters that are added to each matching paragraph when adding to the context.",
|
|
1194
1243
|
ge=0,
|
|
1244
|
+
le=1024,
|
|
1195
1245
|
)
|
|
1196
1246
|
|
|
1197
1247
|
|
|
@@ -1420,7 +1470,7 @@ class PageImageStrategy(ImageRagStrategy):
|
|
|
1420
1470
|
count: Optional[int] = Field(
|
|
1421
1471
|
default=None,
|
|
1422
1472
|
title="Count",
|
|
1423
|
-
description="Maximum number of images to retrieve
|
|
1473
|
+
description="Maximum number of page images to retrieve. By default, at most 5 images are retrieved.",
|
|
1424
1474
|
)
|
|
1425
1475
|
|
|
1426
1476
|
|
|
@@ -1565,7 +1615,17 @@ class AskRequest(AuditMetadataBase):
|
|
|
1565
1615
|
description="""Additional images added to the retrieval context sent to the LLM."
|
|
1566
1616
|
It allows extending the chat feature with content that may not be in the Knowledge Box.""",
|
|
1567
1617
|
)
|
|
1568
|
-
|
|
1618
|
+
query_image: Optional[Image] = Field(
|
|
1619
|
+
default=None,
|
|
1620
|
+
title="Query image",
|
|
1621
|
+
description="Image that will be used together with the query text for retrieval and then sent to the LLM as part of the context. "
|
|
1622
|
+
"If a query image is provided, the `extra_context_images` and `rag_images_strategies` will be disabled.",
|
|
1623
|
+
)
|
|
1624
|
+
|
|
1625
|
+
# autofilter is deprecated and its logic was removed. We're just keeping it in the model definition to
|
|
1626
|
+
# avoid breaking changes in the python sdks. Please remove on a future major release.
|
|
1627
|
+
autofilter: SkipJsonSchema[bool] = False
|
|
1628
|
+
|
|
1569
1629
|
highlight: bool = SearchParamDefaults.highlight.to_pydantic_field()
|
|
1570
1630
|
resource_filters: list[str] = SearchParamDefaults.resource_filters.to_pydantic_field()
|
|
1571
1631
|
prompt: Optional[Union[str, CustomPrompt]] = Field(
|
|
@@ -1575,13 +1635,16 @@ class AskRequest(AuditMetadataBase):
|
|
|
1575
1635
|
)
|
|
1576
1636
|
rank_fusion: Union[RankFusionName, RankFusion] = SearchParamDefaults.rank_fusion.to_pydantic_field()
|
|
1577
1637
|
reranker: Union[RerankerName, Reranker] = SearchParamDefaults.reranker.to_pydantic_field()
|
|
1578
|
-
citations: bool = Field(
|
|
1579
|
-
default=
|
|
1580
|
-
description="Whether to include
|
|
1638
|
+
citations: Union[bool, None, CitationsType] = Field(
|
|
1639
|
+
default=None,
|
|
1640
|
+
description="Whether to include citations in the response. "
|
|
1641
|
+
"If set to None or False, no citations will be computed. "
|
|
1642
|
+
"If set to True or 'default', citations will be computed after answer generation and send as a separate `CitationsGenerativeResponse` chunk. "
|
|
1643
|
+
"If set to 'llm_footnotes', citations will be included in the LLM's response as markdown-styled footnotes. A `FootnoteCitationsGenerativeResponse` chunk will also be sent to map footnote ids to context keys in the `query_context`.",
|
|
1581
1644
|
)
|
|
1582
1645
|
citation_threshold: Optional[float] = Field(
|
|
1583
1646
|
default=None,
|
|
1584
|
-
description="If citations is True, this
|
|
1647
|
+
description="If citations is set to True or 'default', this will be the similarity threshold. Value between 0 and 1, lower values will produce more citations. If not set, it will be set to the optimized threshold found by Nuclia.",
|
|
1585
1648
|
ge=0.0,
|
|
1586
1649
|
le=1.0,
|
|
1587
1650
|
)
|
|
@@ -1712,6 +1775,14 @@ Using this feature also disables the `citations` parameter. For maximal accuracy
|
|
|
1712
1775
|
description="Load ask parameters from this configuration. Parameters in the request override parameters from the configuration.",
|
|
1713
1776
|
)
|
|
1714
1777
|
|
|
1778
|
+
reasoning: Union[Reasoning, bool] = Field(
|
|
1779
|
+
default=False,
|
|
1780
|
+
description=(
|
|
1781
|
+
"Reasoning options for the generative model. "
|
|
1782
|
+
"Set to True to enable default reasoning, False to disable, or provide a Reasoning object for custom options."
|
|
1783
|
+
),
|
|
1784
|
+
)
|
|
1785
|
+
|
|
1715
1786
|
@field_validator("rag_strategies", mode="before")
|
|
1716
1787
|
@classmethod
|
|
1717
1788
|
def validate_rag_strategies(cls, rag_strategies: list[RagStrategies]) -> list[RagStrategies]:
|
|
@@ -2169,6 +2240,11 @@ class SyncAskResponse(BaseModel):
|
|
|
2169
2240
|
title="Answer",
|
|
2170
2241
|
description="The generative answer to the query",
|
|
2171
2242
|
)
|
|
2243
|
+
reasoning: Optional[str] = Field(
|
|
2244
|
+
default=None,
|
|
2245
|
+
title="Reasoning",
|
|
2246
|
+
description="The reasoning steps followed by the LLM to generate the answer. This is returned only if the reasoning feature is enabled in the request.", # noqa: E501
|
|
2247
|
+
)
|
|
2172
2248
|
answer_json: Optional[dict[str, Any]] = Field(
|
|
2173
2249
|
default=None,
|
|
2174
2250
|
title="Answer JSON",
|
|
@@ -2203,10 +2279,15 @@ class SyncAskResponse(BaseModel):
|
|
|
2203
2279
|
description="The detected relations of the answer",
|
|
2204
2280
|
)
|
|
2205
2281
|
citations: dict[str, Any] = Field(
|
|
2206
|
-
|
|
2282
|
+
default_factory=dict,
|
|
2207
2283
|
title="Citations",
|
|
2208
2284
|
description="The citations of the answer. List of references to the resources used to generate the answer.",
|
|
2209
2285
|
)
|
|
2286
|
+
citation_footnote_to_context: dict[str, str] = Field(
|
|
2287
|
+
default_factory=dict,
|
|
2288
|
+
title="Citation footnote to context",
|
|
2289
|
+
description="""Maps ids in the footnote citations to query_context keys (normally paragraph ids)""",
|
|
2290
|
+
)
|
|
2210
2291
|
augmented_context: Optional[AugmentedContext] = Field(
|
|
2211
2292
|
default=None,
|
|
2212
2293
|
description=(
|
|
@@ -2273,6 +2354,11 @@ class AnswerAskResponseItem(BaseModel):
|
|
|
2273
2354
|
text: str
|
|
2274
2355
|
|
|
2275
2356
|
|
|
2357
|
+
class ReasoningAskResponseItem(BaseModel):
|
|
2358
|
+
type: Literal["reasoning"] = "reasoning"
|
|
2359
|
+
text: str
|
|
2360
|
+
|
|
2361
|
+
|
|
2276
2362
|
class JSONAskResponseItem(BaseModel):
|
|
2277
2363
|
type: Literal["answer_json"] = "answer_json"
|
|
2278
2364
|
object: dict[str, Any]
|
|
@@ -2311,6 +2397,18 @@ class CitationsAskResponseItem(BaseModel):
|
|
|
2311
2397
|
citations: dict[str, Any]
|
|
2312
2398
|
|
|
2313
2399
|
|
|
2400
|
+
class FootnoteCitationsAskResponseItem(BaseModel):
|
|
2401
|
+
type: Literal["footnote_citations"] = "footnote_citations"
|
|
2402
|
+
footnote_to_context: dict[str, str] = Field(
|
|
2403
|
+
description="""Maps ids in the footnote citations to query_context keys (normally paragraph ids)
|
|
2404
|
+
e.g.,
|
|
2405
|
+
{ "block-AA": "f44f4e8acbfb1d48de3fd3c2fb04a885/f/f44f4e8acbfb1d48de3fd3c2fb04a885/73758-73972", ... }
|
|
2406
|
+
If the query_context is a list, it will map to 1-based indices as strings
|
|
2407
|
+
e.g., { "block-AA": "1", "block-AB": "2", ... }
|
|
2408
|
+
"""
|
|
2409
|
+
)
|
|
2410
|
+
|
|
2411
|
+
|
|
2314
2412
|
class StatusAskResponseItem(BaseModel):
|
|
2315
2413
|
type: Literal["status"] = "status"
|
|
2316
2414
|
code: str
|
|
@@ -2336,10 +2434,12 @@ class DebugAskResponseItem(BaseModel):
|
|
|
2336
2434
|
|
|
2337
2435
|
AskResponseItemType = Union[
|
|
2338
2436
|
AnswerAskResponseItem,
|
|
2437
|
+
ReasoningAskResponseItem,
|
|
2339
2438
|
JSONAskResponseItem,
|
|
2340
2439
|
MetadataAskResponseItem,
|
|
2341
2440
|
AugmentedContextResponseItem,
|
|
2342
2441
|
CitationsAskResponseItem,
|
|
2442
|
+
FootnoteCitationsAskResponseItem,
|
|
2343
2443
|
StatusAskResponseItem,
|
|
2344
2444
|
ErrorAskResponseItem,
|
|
2345
2445
|
RetrievalAskResponseItem,
|
nucliadb_models/text.py
CHANGED
|
@@ -59,6 +59,10 @@ class FieldText(BaseModel):
|
|
|
59
59
|
default=None,
|
|
60
60
|
description="Id of the Nuclia extract strategy used at processing time. If not set, the default strategy was used. Extract strategies are defined at the learning configuration api.",
|
|
61
61
|
)
|
|
62
|
+
split_strategy: Optional[str] = Field(
|
|
63
|
+
default=None,
|
|
64
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
65
|
+
)
|
|
62
66
|
|
|
63
67
|
|
|
64
68
|
# Creation and update classes (Those used on writer endpoints)
|
|
@@ -80,6 +84,10 @@ If you need to store more text, consider using a file field instead or splitting
|
|
|
80
84
|
default=None,
|
|
81
85
|
description="Id of the Nuclia extract strategy to use at processing time. If not set, the default strategy will be used. Extract strategies are defined at the learning configuration api.",
|
|
82
86
|
)
|
|
87
|
+
split_strategy: Optional[str] = Field(
|
|
88
|
+
default=None,
|
|
89
|
+
description="Id of the Nuclia split strategy used at processing time. If not set, the default strategy was used. Split strategies are defined at the learning configuration api.",
|
|
90
|
+
)
|
|
83
91
|
|
|
84
92
|
@model_validator(mode="after")
|
|
85
93
|
def check_text_format(self) -> Self:
|
nucliadb_models/writer.py
CHANGED
|
@@ -36,7 +36,7 @@ from nucliadb_models.utils import FieldIdPattern, FieldIdString, SlugString
|
|
|
36
36
|
|
|
37
37
|
|
|
38
38
|
class FieldDefaults:
|
|
39
|
-
title = Field(None, title="Title")
|
|
39
|
+
title = Field(None, title="Title", max_length=2048)
|
|
40
40
|
summary = Field(None, title="Summary")
|
|
41
41
|
slug = Field(
|
|
42
42
|
None,
|
{nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/METADATA
RENAMED
|
@@ -1,19 +1,18 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nucliadb_models
|
|
3
|
-
Version: 6.
|
|
3
|
+
Version: 6.9.3.post5295
|
|
4
4
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
|
5
5
|
License-Expression: Apache-2.0
|
|
6
6
|
Project-URL: Homepage, https://nuclia.com
|
|
7
7
|
Project-URL: Repository, https://github.com/nuclia/nucliadb
|
|
8
8
|
Classifier: Development Status :: 4 - Beta
|
|
9
9
|
Classifier: Programming Language :: Python
|
|
10
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
11
10
|
Classifier: Programming Language :: Python :: 3.10
|
|
12
11
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
13
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
14
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
-
Requires-Python: <4,>=3.
|
|
15
|
+
Requires-Python: <4,>=3.10
|
|
17
16
|
Description-Content-Type: text/markdown
|
|
18
17
|
Requires-Dist: pydantic!=2.11.5,!=2.11.6,>=2.6
|
|
19
18
|
|
{nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/RECORD
RENAMED
|
@@ -1,30 +1,31 @@
|
|
|
1
1
|
nucliadb_models/__init__.py,sha256=3y8-htogKuCZcbhaUZdSjTeEjUSeec9aRWyL8AlKCyM,1077
|
|
2
|
-
nucliadb_models/common.py,sha256=
|
|
3
|
-
nucliadb_models/configuration.py,sha256=
|
|
4
|
-
nucliadb_models/content_types.py,sha256=
|
|
5
|
-
nucliadb_models/conversation.py,sha256=
|
|
2
|
+
nucliadb_models/common.py,sha256=2dtKG4ZNi9p-yoNY76Uvyu1SlMeNYpH-MnuU3Q6w9Js,8169
|
|
3
|
+
nucliadb_models/configuration.py,sha256=BBrJsNjP324Cw_5J3dBrGwvpkHQYbXEo3TUaI9IqAOg,2449
|
|
4
|
+
nucliadb_models/content_types.py,sha256=36Ga-iGf4ivCqgtXC7imFgegrwHB117s9eqP62JtGv0,3456
|
|
5
|
+
nucliadb_models/conversation.py,sha256=k9bKhkDiqhqmdrDfDPNoUfG7-2H_-KAyuOnETd8zV0E,5081
|
|
6
6
|
nucliadb_models/entities.py,sha256=i-7Y8qmFRRTih5zw0ajv1U_iiXexe66M3TK8hUikQZk,2356
|
|
7
7
|
nucliadb_models/export_import.py,sha256=mNm9IArOLnC6TLupkwqVFhxD5d08mpIVOVFneECv8UA,1073
|
|
8
|
-
nucliadb_models/external_index_providers.py,sha256=
|
|
9
|
-
nucliadb_models/extracted.py,sha256=
|
|
10
|
-
nucliadb_models/file.py,sha256=
|
|
8
|
+
nucliadb_models/external_index_providers.py,sha256=pL3leo4MkuJOnKlU1Sg6GT_mnK_VUBxGui-RPmDYVWU,1126
|
|
9
|
+
nucliadb_models/extracted.py,sha256=Owz7LC3le3Dvau3TtRiO8NY84meOf6IxN-RrOqqpMPs,5593
|
|
10
|
+
nucliadb_models/file.py,sha256=tXtgB9c7i2ADsnJ7HdbXyroAmXadGvOeA49htBh7BZo,2263
|
|
11
11
|
nucliadb_models/filters.py,sha256=NQI2-4AFzzJuZy8NeY3jXlTbbU5wxiwMCP-5DrD-7lE,14759
|
|
12
|
+
nucliadb_models/hydration.py,sha256=7SFnAcTQRE9etVccpph6aA1AUqsHVwkzT4YF6Uzl0Gs,14262
|
|
12
13
|
nucliadb_models/labels.py,sha256=9zqRgkpZuX3kUPwsTTgCH7JyOWK7dM5pwyuHJR86YdU,3949
|
|
13
|
-
nucliadb_models/link.py,sha256=
|
|
14
|
-
nucliadb_models/metadata.py,sha256=
|
|
14
|
+
nucliadb_models/link.py,sha256=PF5hHLwdOed5TMBTxtokkgWtMh1bFnORZjybh0NwVCw,2526
|
|
15
|
+
nucliadb_models/metadata.py,sha256=OOKGy_83NtlG1QKQZEwMuwu4wbVEe7P30Y2QvnGSDto,8933
|
|
15
16
|
nucliadb_models/notifications.py,sha256=mna8-AoD_29Wds0Thl0AF0zpERnJmYGLZX1w1fUopMY,4036
|
|
16
17
|
nucliadb_models/processing.py,sha256=nhKuHQjqCdb9zJVkYGPTLub23tK9e_lwL5OCDVymZjY,719
|
|
17
18
|
nucliadb_models/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
18
|
-
nucliadb_models/resource.py,sha256=
|
|
19
|
-
nucliadb_models/search.py,sha256=
|
|
19
|
+
nucliadb_models/resource.py,sha256=RzCos0QRgSMkaV-p7EoceSmt7UTzt9G9be5BKF-iGrQ,9021
|
|
20
|
+
nucliadb_models/search.py,sha256=gQEXJ9bXXcxswr7aOzvBeGIQlrq5TgRWIqTxKEbSoCE,96409
|
|
20
21
|
nucliadb_models/security.py,sha256=opxaDLfvk3aU0sjesK0jGrYLx5h4YCwlKKN0moYs_ig,1150
|
|
21
22
|
nucliadb_models/synonyms.py,sha256=afbaVqSQSxGLwi2PusVaLSRpkOtA5AZmWOKd1f4nl2E,690
|
|
22
|
-
nucliadb_models/text.py,sha256=
|
|
23
|
+
nucliadb_models/text.py,sha256=60bxZnOjRHnDdezR8VfR3AZsXTOwePFPs2BKB8wxBak,3414
|
|
23
24
|
nucliadb_models/trainset.py,sha256=BgUfgdClpwhk6UoOq5x6mbpOopgSmqg8he2bBzEzGqg,2406
|
|
24
25
|
nucliadb_models/utils.py,sha256=OnWaDwZGwja8Spd_gpryuUpAMGIMhh-DNDGpoUYyb-A,2460
|
|
25
26
|
nucliadb_models/vectors.py,sha256=_Z157PojPIwoeF5LStO0gz8IwxKy2styHjhdBkLd_44,1329
|
|
26
27
|
nucliadb_models/vectorsets.py,sha256=XAgg9DfdfLYpfLh9OepJ_KPH0_RqRQNpVZJr74UnNh0,788
|
|
27
|
-
nucliadb_models/writer.py,sha256=
|
|
28
|
+
nucliadb_models/writer.py,sha256=6hBH32XLsXUqeNWVQlzZ6X-0dLFVgkbxaMSf_s2Cga4,8237
|
|
28
29
|
nucliadb_models/agents/ingestion.py,sha256=W9cJ0dQT_1vPcjeJ4_Fjb8DylnhQ6qqZrY4v8x1RqUs,3093
|
|
29
30
|
nucliadb_models/graph/__init__.py,sha256=X538kZPZnndmQeEtnzzPv1hYVGUTDe9U1O7UmAqqxXU,645
|
|
30
31
|
nucliadb_models/graph/requests.py,sha256=ppQ7cOnybvrw1wGC7qDps-182PfmicWU6-4vLRfK16w,7169
|
|
@@ -32,7 +33,7 @@ nucliadb_models/graph/responses.py,sha256=Sdq8OgFAL1YT-1lJyLLrkqcScvj7YTEqAUwQ-k
|
|
|
32
33
|
nucliadb_models/internal/__init__.py,sha256=zG33bUz1rHFPtvqQPWn4rDwBJt3FJodGuQYD45quiQg,583
|
|
33
34
|
nucliadb_models/internal/predict.py,sha256=Pnx6MmLfK65eExe1XnVxqmSlvMwdowewwks9BOEoqMw,2029
|
|
34
35
|
nucliadb_models/internal/shards.py,sha256=__y1OZtWGiNcPQEWfSFOj8yw458WGi7mM4vZe0K-L1Y,1691
|
|
35
|
-
nucliadb_models-6.
|
|
36
|
-
nucliadb_models-6.
|
|
37
|
-
nucliadb_models-6.
|
|
38
|
-
nucliadb_models-6.
|
|
36
|
+
nucliadb_models-6.9.3.post5295.dist-info/METADATA,sha256=TFQ9w9VYyh6hIZ_pTdkoxYvLyf8Y5bl7B_l372N63uM,745
|
|
37
|
+
nucliadb_models-6.9.3.post5295.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
38
|
+
nucliadb_models-6.9.3.post5295.dist-info/top_level.txt,sha256=UrY1I8oeovIRwkXLYplssTrxQdUjhSEFDFbnwaIV3tA,16
|
|
39
|
+
nucliadb_models-6.9.3.post5295.dist-info/RECORD,,
|
|
File without changes
|
{nucliadb_models-6.6.1.post4642.dist-info → nucliadb_models-6.9.3.post5295.dist-info}/top_level.txt
RENAMED
|
File without changes
|