nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,614 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from collections.abc import Callable
21
+ from dataclasses import dataclass
22
+ from typing import Annotated, Any, Literal
23
+
24
+ from pydantic import BaseModel, Discriminator, Field, Tag, model_validator
25
+ from typing_extensions import Self
26
+
27
+ import nucliadb_models
28
+ from nucliadb.common.external_index_providers.base import TextBlockMatch
29
+ from nucliadb.common.ids import FieldId, ParagraphId
30
+ from nucliadb_models import filters
31
+ from nucliadb_models.augment import ResourceId
32
+ from nucliadb_models.common import FieldTypeName
33
+ from nucliadb_models.conversation import FieldConversation
34
+ from nucliadb_models.file import FieldFile
35
+ from nucliadb_models.link import FieldLink
36
+ from nucliadb_models.metadata import Extra, Origin
37
+ from nucliadb_models.resource import ExtractedDataTypeName, Resource
38
+ from nucliadb_models.search import (
39
+ ResourceProperties,
40
+ SearchParamDefaults,
41
+ TextPosition,
42
+ )
43
+ from nucliadb_protos import resources_pb2
44
+
45
+
46
+ class SelectProp(BaseModel):
47
+ prop: Any
48
+
49
+ @model_validator(mode="after")
50
+ def set_discriminator(self) -> Self:
51
+ # Ensure discriminator is explicitly set so it's always serialized
52
+ self.prop = self.prop
53
+ return self
54
+
55
+
56
+ def discriminator(name: str) -> Callable[[Any], str | None]:
57
+ def _inner(v: Any) -> str | None:
58
+ if isinstance(v, dict):
59
+ return v.get(name, None)
60
+ else:
61
+ return getattr(v, name, None)
62
+
63
+ return _inner
64
+
65
+
66
+ prop_discriminator = discriminator(name="prop")
67
+ from_discriminator = discriminator(name="from")
68
+ name_discriminator = discriminator(name="name")
69
+
70
+ # Complex ids
71
+
72
+
73
+ class Metadata(BaseModel):
74
+ is_an_image: bool
75
+ is_a_table: bool
76
+
77
+ # for paragraphs extracted from visual content (ocr, inception, tables)
78
+ source_file: str | None
79
+
80
+ # for documents (pdf, docx...) only
81
+ page: int | None
82
+ in_page_with_visual: bool | None
83
+
84
+ @classmethod
85
+ def from_text_block_match(cls, text_block: TextBlockMatch) -> Self:
86
+ return cls(
87
+ is_an_image=text_block.is_an_image,
88
+ is_a_table=text_block.is_a_table,
89
+ source_file=text_block.representation_file,
90
+ page=text_block.position.page_number,
91
+ in_page_with_visual=text_block.page_with_visual,
92
+ )
93
+
94
+ @classmethod
95
+ def from_db_paragraph(cls, paragraph: resources_pb2.Paragraph) -> Self:
96
+ is_an_image = paragraph.kind in (
97
+ resources_pb2.Paragraph.TypeParagraph.OCR,
98
+ resources_pb2.Paragraph.TypeParagraph.INCEPTION,
99
+ )
100
+ # REVIEW(decoupled-ask): can a paragraph be of a different type and still be a table?
101
+ is_a_table = (
102
+ paragraph.kind == resources_pb2.Paragraph.TypeParagraph.TABLE
103
+ or paragraph.representation.is_a_table
104
+ )
105
+
106
+ if paragraph.representation.reference_file:
107
+ source_file = paragraph.representation.reference_file
108
+ else:
109
+ source_file = None
110
+
111
+ if paragraph.HasField("page"):
112
+ page = paragraph.page.page
113
+ in_page_with_visual = paragraph.page.page_with_visual
114
+ else:
115
+ page = None
116
+ in_page_with_visual = None
117
+
118
+ return cls(
119
+ is_an_image=is_an_image,
120
+ is_a_table=is_a_table,
121
+ source_file=source_file,
122
+ page=page,
123
+ in_page_with_visual=in_page_with_visual,
124
+ )
125
+
126
+
127
+ class Paragraph(BaseModel):
128
+ id: ParagraphId
129
+ metadata: Metadata | None = None
130
+
131
+ @classmethod
132
+ def from_text_block_match(cls, text_block: TextBlockMatch) -> Self:
133
+ return cls(
134
+ id=text_block.paragraph_id,
135
+ metadata=Metadata.from_text_block_match(text_block),
136
+ )
137
+
138
+ @classmethod
139
+ def from_db_paragraph(cls, id: ParagraphId, paragraph: resources_pb2.Paragraph) -> Self:
140
+ return cls(
141
+ id=id,
142
+ metadata=Metadata.from_db_paragraph(paragraph),
143
+ )
144
+
145
+
146
+ # SELECT props
147
+
148
+
149
+ class ParagraphText(SelectProp):
150
+ prop: Literal["text"] = "text"
151
+
152
+
153
+ class ParagraphPosition(SelectProp):
154
+ prop: Literal["position"] = "position"
155
+
156
+
157
+ class ParagraphImage(SelectProp):
158
+ prop: Literal["image"] = "image"
159
+
160
+
161
+ class ParagraphTable(SelectProp):
162
+ prop: Literal["table"] = "table"
163
+
164
+ # sometimes, due to an imperfect extraction, it is better to use the page
165
+ # preview instead of the table image for context. This option lets users
166
+ # choose
167
+ prefer_page_preview: bool = False
168
+
169
+
170
+ class ParagraphPage(SelectProp):
171
+ prop: Literal["page"] = "page"
172
+ preview: bool = True
173
+
174
+
175
+ class RelatedParagraphs(SelectProp):
176
+ prop: Literal["related"] = "related"
177
+ neighbours_before: int = Field(ge=0, description="Number of previous paragraphs to hydrate")
178
+ neighbours_after: int = Field(ge=0, description="Number of following paragraphs to hydrate")
179
+
180
+
181
+ ParagraphProp = Annotated[
182
+ (
183
+ Annotated[ParagraphText, Tag("text")]
184
+ | Annotated[ParagraphPosition, Tag("position")]
185
+ | Annotated[ParagraphImage, Tag("image")]
186
+ | Annotated[ParagraphTable, Tag("table")]
187
+ | Annotated[ParagraphPage, Tag("page")]
188
+ | Annotated[RelatedParagraphs, Tag("related")]
189
+ ),
190
+ Discriminator(prop_discriminator),
191
+ ]
192
+
193
+
194
+ class FieldText(SelectProp):
195
+ prop: Literal["text"] = "text"
196
+
197
+
198
+ class FieldValue(SelectProp):
199
+ prop: Literal["value"] = "value"
200
+
201
+
202
+ class FieldClassificationLabels(SelectProp):
203
+ prop: Literal["classification_labels"] = "classification_labels"
204
+
205
+
206
+ class FieldEntities(SelectProp):
207
+ """Same as MetadataExtensionStrategy asking for ners"""
208
+
209
+ prop: Literal["entities"] = "entities"
210
+
211
+
212
+ FieldProp = Annotated[
213
+ (
214
+ Annotated[FieldText, Tag("text")]
215
+ | Annotated[FieldValue, Tag("value")]
216
+ | Annotated[FieldClassificationLabels, Tag("classification_labels")]
217
+ | Annotated[FieldEntities, Tag("entities")]
218
+ ),
219
+ Discriminator(prop_discriminator),
220
+ ]
221
+
222
+
223
+ class FileThumbnail(SelectProp):
224
+ """File field thumbnail image"""
225
+
226
+ prop: Literal["thumbnail"] = "thumbnail"
227
+
228
+
229
+ FileProp = Annotated[
230
+ (
231
+ Annotated[FieldText, Tag("text")]
232
+ | Annotated[FieldValue, Tag("value")]
233
+ | Annotated[FieldClassificationLabels, Tag("classification_labels")]
234
+ | Annotated[FieldEntities, Tag("entities")]
235
+ | Annotated[FileThumbnail, Tag("thumbnail")]
236
+ ),
237
+ Discriminator(prop_discriminator),
238
+ ]
239
+
240
+
241
+ class MessageSelector(BaseModel):
242
+ """Selects the message specified by the field id."""
243
+
244
+ name: Literal["message"] = "message"
245
+
246
+ id: str | None = None
247
+ index: Literal["first"] | Literal["last"] | int | None = Field(
248
+ default=None,
249
+ description="Index of the message in the conversation. Indexing starts at 0",
250
+ )
251
+
252
+ @model_validator(mode="after")
253
+ def id_or_index(self) -> Self:
254
+ if self.id is not None and self.index is not None:
255
+ raise ValueError("Can't define both `id` and `index`")
256
+ return self
257
+
258
+
259
+ class PageSelector(BaseModel):
260
+ """Selects all messages from the page of the message specified by the field
261
+ id.
262
+
263
+ """
264
+
265
+ name: Literal["page"] = "page"
266
+
267
+
268
+ class NeighboursSelector(BaseModel):
269
+ """Selects a bunch of messages preceding or following the one specified by
270
+ the field id.
271
+
272
+ """
273
+
274
+ name: Literal["neighbours"] = "neighbours"
275
+ after: int = Field(ge=1)
276
+
277
+
278
+ class WindowSelector(BaseModel):
279
+ """Selects a window of certain size around the message specified by the
280
+ field id.
281
+
282
+ If size=1, this behaves as MessageSelector.
283
+
284
+ If, for example, size=5 and there are 2 messages preceding and 2 following,
285
+ it behaves as a NeighboursSelector(before=2, after=2). However, if there
286
+ aren't enough messages before/after, the window will be offset. For example, if
287
+ the selected message is the first in the conversation and size=5, it'll
288
+ select the first 5 messages of the conversation.
289
+
290
+ """
291
+
292
+ name: Literal["window"] = "window"
293
+ size: int = Field(ge=1)
294
+
295
+
296
+ class AnswerSelector(BaseModel):
297
+ """Search for the next message of type ANSWER. For ids containing the split,
298
+ search starts from that message rather than the beginning of the
299
+ conversation.
300
+
301
+ """
302
+
303
+ name: Literal["answer"] = "answer"
304
+
305
+
306
+ class FullSelector(BaseModel):
307
+ """Selects the whole conversation"""
308
+
309
+ name: Literal["full"] = "full"
310
+
311
+
312
+ ConversationSelector = Annotated[
313
+ (
314
+ Annotated[MessageSelector, Tag("message")]
315
+ | Annotated[PageSelector, Tag("page")]
316
+ | Annotated[NeighboursSelector, Tag("neighbours")]
317
+ | Annotated[WindowSelector, Tag("window")]
318
+ | Annotated[AnswerSelector, Tag("answer")]
319
+ | Annotated[FullSelector, Tag("full")]
320
+ ),
321
+ Discriminator(name_discriminator),
322
+ ]
323
+
324
+
325
+ class ConversationText(FieldText):
326
+ prop: Literal["text"] = "text"
327
+ selector: ConversationSelector
328
+
329
+
330
+ class ConversationAttachments(SelectProp):
331
+ prop: Literal["attachments"] = "attachments"
332
+ selector: ConversationSelector = Field(default_factory=FullSelector)
333
+
334
+
335
+ class ConversationAnswerOrAfter(SelectProp):
336
+ """Hacky conversation prop that given a conversation message (paragraph or
337
+ split), if it's type QUESTION, searches an answer and otherwise provides a
338
+ fixed window of messages after.
339
+
340
+ This was originally used in the /ask endpoint for conversation matches if no
341
+ strategy was selected, however, many bugs around it made it not really used.
342
+ Thus, the value provided by this is not clear and further evaluation should
343
+ be performed.
344
+
345
+ """
346
+
347
+ prop: Literal["answer_or_after"] = "answer_or_after"
348
+
349
+
350
+ ConversationProp = Annotated[
351
+ (
352
+ Annotated[ConversationText, Tag("text")]
353
+ | Annotated[FieldText, Tag("text")]
354
+ | Annotated[FieldValue, Tag("value")]
355
+ | Annotated[FieldClassificationLabels, Tag("classification_labels")]
356
+ | Annotated[FieldEntities, Tag("entities")]
357
+ | Annotated[ConversationAttachments, Tag("attachments")]
358
+ | Annotated[ConversationAnswerOrAfter, Tag("answer_or_after")]
359
+ ),
360
+ Discriminator(prop_discriminator),
361
+ ]
362
+
363
+
364
+ class ResourceTitle(SelectProp):
365
+ prop: Literal["title"] = "title"
366
+
367
+
368
+ class ResourceSummary(SelectProp):
369
+ prop: Literal["summary"] = "summary"
370
+
371
+
372
+ class ResourceOrigin(SelectProp):
373
+ """Same as show=["origin"] using GET resource or search endpoints"""
374
+
375
+ prop: Literal["origin"] = "origin"
376
+
377
+
378
+ class ResourceExtra(SelectProp):
379
+ """Same as show=["extra"] and MetadataExtensionStrategy asking for
380
+ extra_metadata
381
+
382
+ """
383
+
384
+ prop: Literal["extra"] = "extra"
385
+
386
+
387
+ class ResourceSecurity(SelectProp):
388
+ """Same as show=["security"] using GET resource or search endpoints"""
389
+
390
+ prop: Literal["security"] = "security"
391
+
392
+
393
+ class ResourceClassificationLabels(SelectProp):
394
+ """Same as MetadataExtensionStrategy asking for classification_labels"""
395
+
396
+ prop: Literal["classification_labels"] = "classification_labels"
397
+
398
+
399
+ class ResourceFieldsFilter(BaseModel):
400
+ ids: list[str]
401
+
402
+
403
+ ResourceProp = Annotated[
404
+ (
405
+ Annotated[ResourceTitle, Tag("title")]
406
+ | Annotated[ResourceSummary, Tag("summary")]
407
+ | Annotated[ResourceOrigin, Tag("origin")]
408
+ | Annotated[ResourceExtra, Tag("extra")]
409
+ | Annotated[ResourceSecurity, Tag("security")]
410
+ | Annotated[ResourceClassificationLabels, Tag("classification_labels")]
411
+ ),
412
+ Discriminator(prop_discriminator),
413
+ ]
414
+
415
+
416
+ # Augmentations
417
+
418
+
419
+ class ResourceAugment(BaseModel, extra="forbid"):
420
+ given: list[ResourceId | FieldId | ParagraphId]
421
+ select: list[ResourceProp]
422
+ from_: Literal["resources"] = Field(default="resources", alias="from")
423
+
424
+
425
+ class DeepResourceAugment(BaseModel, extra="forbid"):
426
+ given: list[ResourceId]
427
+
428
+ # old style serialization parameters
429
+ show: list[ResourceProperties] = SearchParamDefaults.show.to_pydantic_field()
430
+ extracted: list[ExtractedDataTypeName] = SearchParamDefaults.extracted.to_pydantic_field()
431
+ field_type_filter: list[FieldTypeName] = SearchParamDefaults.field_type_filter.to_pydantic_field()
432
+
433
+ from_: Literal["resources.deep"] = Field(default="resources.deep", alias="from")
434
+
435
+
436
+ class FileAugment(BaseModel, extra="forbid"):
437
+ given: list[FieldId | ParagraphId]
438
+ select: list[FileProp]
439
+ from_: Literal["files"] = Field(default="files", alias="from")
440
+
441
+
442
+ class ConversationAugmentLimits(BaseModel):
443
+ max_messages: int | None = Field(default=15, ge=0)
444
+
445
+
446
+ class ConversationAugment(BaseModel, extra="forbid"):
447
+ given: list[FieldId | ParagraphId]
448
+ select: list[ConversationProp]
449
+ from_: Literal["conversations"] = Field(default="conversations", alias="from")
450
+ # TODO(decoupled-storage): remove?
451
+ limits: ConversationAugmentLimits | None = Field(default_factory=ConversationAugmentLimits)
452
+
453
+
454
+ FieldFilter = Annotated[
455
+ (Annotated[filters.Field, Tag("field")] | Annotated[filters.Generated, Tag("generated")]),
456
+ Discriminator(prop_discriminator),
457
+ ]
458
+
459
+
460
+ class FieldAugment(BaseModel, extra="forbid"):
461
+ given: list[ResourceId] | list[FieldId] | list[ParagraphId]
462
+ select: list[FieldProp]
463
+ from_: Literal["fields"] = Field(default="fields", alias="from")
464
+ filter: list[FieldFilter] | None = None
465
+
466
+
467
+ class ParagraphAugment(BaseModel, extra="forbid"):
468
+ given: list[Paragraph]
469
+ select: list[ParagraphProp]
470
+ from_: Literal["paragraphs"] = Field(default="paragraphs", alias="from")
471
+
472
+
473
+ class AugmentationLimits(BaseModel, extra="forbid"):
474
+ # TODO(decoupled-ask): global augmentation limits (max chars, images, image size...)
475
+ ...
476
+
477
+
478
+ Augment = Annotated[
479
+ (
480
+ Annotated[ResourceAugment, Tag("resources")]
481
+ | Annotated[DeepResourceAugment, Tag("resources.deep")]
482
+ | Annotated[FieldAugment, Tag("fields")]
483
+ | Annotated[FileAugment, Tag("files")]
484
+ | Annotated[ConversationAugment, Tag("conversations")]
485
+ | Annotated[ParagraphAugment, Tag("paragraphs")]
486
+ ),
487
+ Discriminator(from_discriminator),
488
+ ]
489
+
490
+
491
+ class AugmentRequest(BaseModel, extra="forbid"):
492
+ augmentations: list[Augment] = Field(
493
+ default_factory=list,
494
+ description="List of augmentations to be performed",
495
+ )
496
+
497
+ limits: AugmentationLimits | None = Field(
498
+ default=None,
499
+ description="Global hydration limits applied to the whole request",
500
+ )
501
+
502
+
503
+ # Augmented data models
504
+
505
+
506
+ @dataclass
507
+ class AugmentedRelatedParagraphs:
508
+ neighbours_before: list[ParagraphId]
509
+ neighbours_after: list[ParagraphId]
510
+
511
+
512
+ @dataclass
513
+ class AugmentedParagraph:
514
+ id: ParagraphId
515
+
516
+ # textual representation of the paragraph
517
+ text: str | None
518
+
519
+ position: TextPosition | None
520
+
521
+ # original image for the paragraph when it has been extracted from an image
522
+ # or a table. This value is the path to be used in the download endpoint
523
+ source_image_path: str | None
524
+
525
+ # image extracted from the table. It can be just from the table or the page,
526
+ # depending on the augment parameters
527
+ table_image_path: str | None
528
+
529
+ # if the paragraph comes from a page, this is the path for the download
530
+ # endpoint to get the page preview image
531
+ page_preview_path: str | None
532
+
533
+ related: AugmentedRelatedParagraphs | None
534
+
535
+
536
+ @dataclass
537
+ class BaseAugmentedField:
538
+ id: FieldId
539
+
540
+ classification_labels: dict[str, set[str]] | None = None
541
+ entities: dict[str, set[str]] | None = None
542
+
543
+
544
+ @dataclass
545
+ class AugmentedTextField(BaseAugmentedField):
546
+ value: nucliadb_models.text.FieldText | None = None
547
+
548
+ text: str | None = None
549
+
550
+
551
+ @dataclass
552
+ class AugmentedFileField(BaseAugmentedField):
553
+ value: FieldFile | None = None
554
+
555
+ text: str | None = None
556
+ thumbnail_path: str | None = None
557
+
558
+
559
+ @dataclass
560
+ class AugmentedLinkField(BaseAugmentedField):
561
+ value: FieldLink | None = None
562
+
563
+ text: str | None = None
564
+
565
+
566
+ @dataclass
567
+ class AugmentedConversationMessage:
568
+ ident: str
569
+ text: str | None = None
570
+ attachments: list[FieldId] | None = None
571
+
572
+
573
+ @dataclass
574
+ class AugmentedConversationField(BaseAugmentedField):
575
+ value: FieldConversation | None = None
576
+ messages: list[AugmentedConversationMessage] | None = None
577
+
578
+
579
+ @dataclass
580
+ class AugmentedGenericField(BaseAugmentedField):
581
+ value: str | None = None
582
+ text: str | None = None
583
+
584
+
585
+ AugmentedField = (
586
+ BaseAugmentedField
587
+ | AugmentedTextField
588
+ | AugmentedFileField
589
+ | AugmentedLinkField
590
+ | AugmentedConversationField
591
+ | AugmentedGenericField
592
+ )
593
+
594
+
595
+ @dataclass
596
+ class AugmentedResource:
597
+ id: str
598
+
599
+ title: str | None
600
+ summary: str | None
601
+
602
+ origin: Origin | None
603
+ extra: Extra | None
604
+ security: nucliadb_models.security.ResourceSecurity | None
605
+
606
+ classification_labels: dict[str, set[str]] | None
607
+
608
+
609
+ @dataclass
610
+ class Augmented:
611
+ resources: dict[str, AugmentedResource]
612
+ resources_deep: dict[str, Resource]
613
+ fields: dict[FieldId, AugmentedField]
614
+ paragraphs: dict[ParagraphId, AugmentedParagraph]
@@ -24,7 +24,7 @@
24
24
 
25
25
  from datetime import datetime
26
26
  from enum import Enum
27
- from typing import TYPE_CHECKING, Optional
27
+ from typing import TYPE_CHECKING
28
28
 
29
29
  from pydantic import BaseModel, Field
30
30
 
@@ -65,8 +65,8 @@ class PushTextFormat(int, Enum):
65
65
  class Text(BaseModel):
66
66
  body: str
67
67
  format: PushTextFormat
68
- extract_strategy: Optional[str] = None
69
- split_strategy: Optional[str] = None
68
+ extract_strategy: str | None = None
69
+ split_strategy: str | None = None
70
70
  classification_labels: list[ClassificationLabel] = []
71
71
 
72
72
 
@@ -75,18 +75,18 @@ class LinkUpload(BaseModel):
75
75
  headers: dict[str, str] = {}
76
76
  cookies: dict[str, str] = {}
77
77
  localstorage: dict[str, str] = {}
78
- css_selector: Optional[str] = Field(
78
+ css_selector: str | None = Field(
79
79
  None,
80
80
  title="Css selector",
81
81
  description="Css selector to parse the link",
82
82
  )
83
- xpath: Optional[str] = Field(
83
+ xpath: str | None = Field(
84
84
  None,
85
85
  title="Xpath",
86
86
  description="Xpath to parse the link",
87
87
  )
88
- extract_strategy: Optional[str] = None
89
- split_strategy: Optional[str] = None
88
+ extract_strategy: str | None = None
89
+ split_strategy: str | None = None
90
90
  classification_labels: list[ClassificationLabel] = []
91
91
 
92
92
 
@@ -99,14 +99,14 @@ class PushMessageFormat(int, Enum):
99
99
 
100
100
 
101
101
  class PushMessageContent(BaseModel):
102
- text: Optional[str] = None
102
+ text: str | None = None
103
103
  format: PushMessageFormat
104
104
  attachments: list[str] = []
105
105
 
106
106
 
107
107
  class PushMessage(BaseModel):
108
- timestamp: Optional[datetime] = None
109
- who: Optional[str] = None
108
+ timestamp: datetime | None = None
109
+ who: str | None = None
110
110
  to: list[str] = []
111
111
  content: PushMessageContent
112
112
  ident: str
@@ -114,8 +114,8 @@ class PushMessage(BaseModel):
114
114
 
115
115
  class PushConversation(BaseModel):
116
116
  messages: list[PushMessage] = []
117
- extract_strategy: Optional[str] = None
118
- split_strategy: Optional[str] = None
117
+ extract_strategy: str | None = None
118
+ split_strategy: str | None = None
119
119
  classification_labels: list[ClassificationLabel] = []
120
120
 
121
121
 
@@ -125,19 +125,19 @@ class Source(SourceValue, Enum): # type: ignore
125
125
 
126
126
 
127
127
  class ProcessingInfo(BaseModel):
128
- seqid: Optional[int] = None
129
- account_seq: Optional[int] = None
130
- queue: Optional[QueueType] = None
128
+ seqid: int | None = None
129
+ account_seq: int | None = None
130
+ queue: QueueType | None = None
131
131
 
132
132
 
133
133
  class PushPayload(BaseModel):
134
134
  uuid: str
135
- slug: Optional[str] = None
135
+ slug: str | None = None
136
136
  kbid: str
137
- source: Optional[Source] = None
137
+ source: Source | None = None
138
138
  userid: str
139
139
 
140
- title: Optional[str] = None
140
+ title: str | None = None
141
141
 
142
142
  genericfield: dict[str, Text] = {}
143
143
 
@@ -160,4 +160,4 @@ class PushPayload(BaseModel):
160
160
  partition: int
161
161
 
162
162
  # List of available processing options (with default values)
163
- processing_options: Optional[PushProcessingOptions] = Field(default_factory=PushProcessingOptions)
163
+ processing_options: PushProcessingOptions | None = Field(default_factory=PushProcessingOptions)