nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/search/api/v1/augment.py (new file)
@@ -0,0 +1,585 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+import asyncio
+from typing import cast
+
+from fastapi import Header, Request
+from fastapi_versioning import version
+
+from nucliadb.common.ids import FieldId, ParagraphId
+from nucliadb.models.internal import augment as internal_augment
+from nucliadb.models.internal.augment import (
+    Augment,
+    Augmented,
+    ConversationAnswerOrAfter,
+    ConversationAttachments,
+    ConversationAugment,
+    ConversationProp,
+    ConversationSelector,
+    ConversationText,
+    DeepResourceAugment,
+    FieldAugment,
+    FieldClassificationLabels,
+    FieldEntities,
+    FieldProp,
+    FieldText,
+    FileAugment,
+    FileProp,
+    FileThumbnail,
+    FullSelector,
+    MessageSelector,
+    Metadata,
+    Paragraph,
+    ParagraphAugment,
+    ParagraphImage,
+    ParagraphPage,
+    ParagraphPosition,
+    ParagraphProp,
+    ParagraphTable,
+    ParagraphText,
+    RelatedParagraphs,
+    ResourceAugment,
+    ResourceClassificationLabels,
+    ResourceProp,
+    ResourceSummary,
+    ResourceTitle,
+    WindowSelector,
+)
+from nucliadb.search.api.v1.router import KB_PREFIX, api
+from nucliadb.search.augmentor import augmentor
+from nucliadb.search.search.cache import request_caches
+from nucliadb_models.augment import (
+    AugmentedConversationField,
+    AugmentedConversationMessage,
+    AugmentedField,
+    AugmentedFileField,
+    AugmentedParagraph,
+    AugmentedResource,
+    AugmentParagraphs,
+    AugmentRequest,
+    AugmentResources,
+    AugmentResponse,
+)
+from nucliadb_models.common import FieldTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
+from nucliadb_models.search import NucliaDBClientType, ResourceProperties
+from nucliadb_utils.authentication import requires
+
+
+@api.post(
+    f"/{KB_PREFIX}/{{kbid}}/augment",
+    status_code=200,
+    description="Augment data on a Knowledge Box",
+    include_in_schema=False,
+    tags=["Augment"],
+)
+@requires(NucliaDBRoles.READER)
+@version(1)
+async def _augment_endpoint(
+    request: Request,
+    kbid: str,
+    item: AugmentRequest,
+    x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
+    x_nucliadb_user: str = Header(""),
+    x_forwarded_for: str = Header(""),
+) -> AugmentResponse:
+    return await augment_endpoint(kbid, item)
+
+
+async def augment_endpoint(kbid: str, item: AugmentRequest) -> AugmentResponse:
+    augmentations = parse_first_augments(item)
+
+    if len(augmentations) == 0:
+        return AugmentResponse(resources={}, fields={}, paragraphs={})
+
+    with request_caches():
+        max_ops = asyncio.Semaphore(50)
+
+        first_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
+        response = build_augment_response(item, first_augmented)
+
+        # 2nd round trip to augmentor
+        #
+        # There are some augmentations that require some augmented content to be
+        # able to keep augmenting, such as neighbour paragraphs.
+        #
+        # However, as much of the data is already cached (when using the cache),
+        # this second round should be orders of magnitude faster than the first.
+        #
+        augmentations = parse_second_augments(item, first_augmented)
+        if len(augmentations) > 0:
+            second_augmented = await augmentor.augment(kbid, augmentations, concurrency_control=max_ops)
+            merge_second_augment(item, response, second_augmented)
+
+    return response
+
+
+def parse_first_augments(item: AugmentRequest) -> list[Augment]:
+    """Parse an augment request and return a list of internal augments to
+    fulfill as much of the requested information as it can.
+
+    Notice there are augments that will require a 2nd round trip to the
+    augmentor, e.g., neighbouring paragraphs. This makes the code a bit more
+    convoluted but avoids synchronization between augments, as many paragraphs
+    could lead to the same neighbours.
+
+    """
+    augmentations: list[Augment] = []
+
+    if item.resources is not None:
+        for resource_augment in item.resources:
+            show, extracted, resource_select = parse_deep_resource_augment(resource_augment)
+            if resource_augment.field_type_filter is None:
+                field_type_filter = list(FieldTypeName)
+            else:
+                field_type_filter = resource_augment.field_type_filter
+
+            if show:
+                augmentations.append(
+                    DeepResourceAugment(
+                        given=resource_augment.given,
+                        show=show,
+                        extracted=extracted,
+                        field_type_filter=field_type_filter,
+                    )
+                )
+            if resource_select:
+                augmentations.append(
+                    ResourceAugment(
+                        given=resource_augment.given,  # type: ignore[arg-type]
+                        select=resource_select,
+                    )
+                )
+
+            if resource_augment.fields is not None:
+                # Augment resource fields with an optional field filter
+                field_select: list[FieldProp] = []
+                if resource_augment.fields.text:
+                    field_select.append(FieldText())
+                if resource_augment.fields.classification_labels:
+                    field_select.append(FieldClassificationLabels())
+
+                augmentations.append(
+                    FieldAugment(
+                        given=resource_augment.given,  # type: ignore[arg-type]
+                        select=field_select,  # type: ignore[arg-type]
+                        filter=resource_augment.fields.filters,
+                    )
+                )
+
+    if item.fields is not None:
+        for field_augment in item.fields:
+            given = [FieldId.from_string(id) for id in field_augment.given]
+            select: list[FieldProp] = []
+            if field_augment.text:
+                select.append(FieldText())
+            if field_augment.entities:
+                select.append(FieldEntities())
+            if field_augment.classification_labels:
+                select.append(FieldClassificationLabels())
+
+            if len(select) > 0:
+                augmentations.append(
+                    FieldAugment(
+                        given=given,
+                        select=select,
+                    )
+                )
+
+            file_select: list[FileProp] = []
+            if field_augment.file_thumbnail:
+                file_select.append(FileThumbnail())
+
+            if len(file_select) > 0:
+                augmentations.append(
+                    FileAugment(
+                        given=given,  # type: ignore
+                        select=file_select,
+                    )
+                )
+
+            conversation_select: list[ConversationProp] = []
+            selector: ConversationSelector
+
+            if field_augment.full_conversation:
+                selector = FullSelector()
+                conversation_select.append(ConversationText(selector=selector))
+                if (
+                    field_augment.conversation_text_attachments
+                    or field_augment.conversation_image_attachments
+                ):
+                    conversation_select.append(ConversationAttachments(selector=selector))
+
+            elif field_augment.max_conversation_messages is not None:
+                # we want to always get the first conversation message and the
+                # window requested by the user
+                first_selector = MessageSelector(index="first")
+                window_selector = WindowSelector(size=field_augment.max_conversation_messages)
+                conversation_select.append(ConversationText(selector=first_selector))
+                conversation_select.append(ConversationText(selector=window_selector))
+                if (
+                    field_augment.conversation_text_attachments
+                    or field_augment.conversation_image_attachments
+                ):
+                    conversation_select.append(ConversationAttachments(selector=first_selector))
+                    conversation_select.append(ConversationAttachments(selector=window_selector))
+
+            if field_augment.conversation_answer_or_messages_after:
+                conversation_select.append(ConversationAnswerOrAfter())
+
+            if len(conversation_select) > 0:
+                augmentations.append(
+                    ConversationAugment(
+                        given=given,  # type: ignore
+                        select=conversation_select,
+                    )
+                )
+
+    if item.paragraphs is not None:
+        for paragraph_augment in item.paragraphs:
+            paragraphs_to_augment, paragraph_selector = parse_paragraph_augment(paragraph_augment)
+            augmentations.append(
+                ParagraphAugment(
+                    given=paragraphs_to_augment,
+                    select=paragraph_selector,
+                )
+            )
+
+    return augmentations
+
+
+def parse_deep_resource_augment(
+    item: AugmentResources,
+) -> tuple[list[ResourceProperties], list[ExtractedDataTypeName], list[ResourceProp]]:
+    show = []
+    if item.basic:
+        show.append(ResourceProperties.BASIC)
+    if item.origin:
+        show.append(ResourceProperties.ORIGIN)
+    if item.extra:
+        show.append(ResourceProperties.EXTRA)
+    if item.relations:
+        show.append(ResourceProperties.RELATIONS)
+    if item.values:
+        show.append(ResourceProperties.VALUES)
+    if item.errors:
+        show.append(ResourceProperties.ERRORS)
+    if item.security:
+        show.append(ResourceProperties.SECURITY)
+
+    extracted = []
+    if item.extracted_text:
+        extracted.append(ExtractedDataTypeName.TEXT)
+    if item.extracted_metadata:
+        extracted.append(ExtractedDataTypeName.METADATA)
+    if item.extracted_shortened_metadata:
+        extracted.append(ExtractedDataTypeName.SHORTENED_METADATA)
+    if item.extracted_large_metadata:
+        extracted.append(ExtractedDataTypeName.LARGE_METADATA)
+    if item.extracted_vector:
+        extracted.append(ExtractedDataTypeName.VECTOR)
+    if item.extracted_link:
+        extracted.append(ExtractedDataTypeName.LINK)
+    if item.extracted_file:
+        extracted.append(ExtractedDataTypeName.FILE)
+    if item.extracted_qa:
+        extracted.append(ExtractedDataTypeName.QA)
+
+    if len(extracted) > 0:
+        show.append(ResourceProperties.EXTRACTED)
+
+    select: list[ResourceProp] = []
+    if item.title:
+        select.append(ResourceTitle())
+    if item.summary:
+        select.append(ResourceSummary())
+    if item.classification_labels:
+        select.append(ResourceClassificationLabels())
+
+    return (
+        show,
+        extracted,
+        select,
+    )
+
+
+def parse_paragraph_augment(item: AugmentParagraphs) -> tuple[list[Paragraph], list[ParagraphProp]]:
+    paragraphs_to_augment = []
+    for paragraph in item.given:
+        try:
+            paragraph_id = ParagraphId.from_string(paragraph.id)
+        except ValueError:
+            # invalid paragraph id, skipping
+            continue
+
+        if paragraph.metadata is None:
+            metadata = None
+        else:
+            metadata = Metadata(
+                is_an_image=paragraph.metadata.is_an_image,
+                is_a_table=paragraph.metadata.is_a_table,
+                source_file=paragraph.metadata.source_file,
+                page=paragraph.metadata.page,
+                in_page_with_visual=paragraph.metadata.in_page_with_visual,
+            )
+
+        paragraphs_to_augment.append(Paragraph(id=paragraph_id, metadata=metadata))
+
+    selector: list[ParagraphProp] = []
+    if item.text:
+        selector.append(ParagraphText())
+    if item.neighbours_before or item.neighbours_after:
+        selector.append(
+            RelatedParagraphs(
+                neighbours_before=item.neighbours_before or 0,
+                neighbours_after=item.neighbours_after or 0,
+            )
+        )
+    if item.source_image:
+        selector.append(ParagraphImage())
+    if item.table_image:
+        selector.append(ParagraphTable(prefer_page_preview=item.table_prefers_page_preview))
+    if item.page_preview_image:
+        selector.append(ParagraphPage(preview=True))
+
+    return paragraphs_to_augment, selector
+
+
+def build_augment_response(item: AugmentRequest, augmented: Augmented) -> AugmentResponse:
+    response = AugmentResponse(
+        resources={},
+        fields={},
+        paragraphs={},
+    )
+
+    # start with deep resources, as they return a Resource object we can merge
+    # with the augmented model
+    for rid, resource_deep in augmented.resources_deep.items():
+        if resource_deep is None:
+            continue
+
+        augmented_resource = AugmentedResource(id=rid)
+        augmented_resource.updated_from(resource_deep)
+        response.resources[rid] = augmented_resource
+
+    # now we can cherry-pick properties from the augmented resources and merge
+    # them with the deep ones
+    for rid, resource in augmented.resources.items():
+        if resource is None:
+            continue
+
+        augmented_resource = response.resources.setdefault(rid, AugmentedResource(id=rid))
+
+        # merge resource with deep resources without overwriting
+        augmented_resource.title = augmented_resource.title or resource.title
+        augmented_resource.summary = augmented_resource.summary or resource.summary
+
+        # properties original to the augmented resources (not in deep resource augment)
+        if resource.classification_labels is not None:
+            augmented_resource.classification_labels = {
+                labelset: list(labels) for labelset, labels in resource.classification_labels.items()
+            }
+
+    for field_id, field in augmented.fields.items():
+        if field is None:
+            continue
+
+        # common augments for all fields
+
+        if field.classification_labels is None:
+            classification_labels = None
+        else:
+            classification_labels = {
+                labelset: list(labels) for labelset, labels in field.classification_labels.items()
+            }
+
+        if field.entities is None:
+            entities = None
+        else:
+            entities = {family: list(entity) for family, entity in field.entities.items()}
+
+        if field_id.type in (
+            FieldTypeName.TEXT.abbreviation(),
+            FieldTypeName.LINK.abbreviation(),
+            FieldTypeName.GENERIC.abbreviation(),
+        ):
+            response.fields[field_id.full()] = AugmentedField(
+                text=field.text,  # type: ignore # field is an instance of one of the above and has the text property
+                classification_labels=classification_labels,
+                entities=entities,
+            )
+
+        elif field_id.type == FieldTypeName.FILE.abbreviation():
+            field = cast(internal_augment.AugmentedFileField, field)
+            response.fields[field_id.full()] = AugmentedFileField(
+                text=field.text,  # type: ignore # field is an instance of one of the above and has the text property
+                classification_labels=classification_labels,
+                entities=entities,
+                thumbnail_image=field.thumbnail_path,
+            )
+
+        elif field_id.type == FieldTypeName.CONVERSATION.abbreviation():
+            field = cast(internal_augment.AugmentedConversationField, field)
+            conversation = AugmentedConversationField(
+                classification_labels=classification_labels,
+                entities=entities,
+            )
+
+            if field.messages is not None:
+                conversation.messages = []
+                for m in field.messages:
+                    if m.attachments is None:
+                        attachments = None
+                    else:
+                        attachments = []
+                        for f in m.attachments:
+                            attachments.append(f.full())
+
+                    conversation.messages.append(
+                        AugmentedConversationMessage(
+                            ident=m.ident,
+                            text=m.text,
+                            attachments=attachments,
+                        )
+                    )
+
+            response.fields[field_id.full()] = conversation
+
+        else:  # pragma: no cover
+            assert False, f"unknown field type: {field_id.type}"
+
+    for paragraph_id, paragraph in augmented.paragraphs.items():
+        if paragraph is None:
+            continue
+
+        augmented_paragraph = AugmentedParagraph()
+        augmented_paragraph.text = paragraph.text
+        if paragraph.related is not None:
+            augmented_paragraph.neighbours_before = list(
+                map(lambda x: x.full(), paragraph.related.neighbours_before)
+            )
+            augmented_paragraph.neighbours_after = list(
+                map(lambda x: x.full(), paragraph.related.neighbours_after)
+            )
+        augmented_paragraph.source_image = paragraph.source_image_path
+        augmented_paragraph.table_image = paragraph.table_image_path
+        augmented_paragraph.page_preview_image = paragraph.page_preview_path
+        response.paragraphs[paragraph_id.full()] = augmented_paragraph
+
+    return response
+
+
+def parse_second_augments(item: AugmentRequest, augmented: Augmented) -> list[Augment]:
+    """Given an augment request and a first augmentation, return a list of
+    augments required to fulfill the requested data.
+
+    """
+    augmentations: list[Augment] = []
+
+    for paragraph_augment in item.paragraphs or []:
+        if paragraph_augment.neighbours_before or paragraph_augment.neighbours_after:
+            neighbours = []
+            for paragraph_id, paragraph in augmented.paragraphs.items():
+                if paragraph.related is not None:
+                    for neighbour_before in paragraph.related.neighbours_before:
+                        neighbours.append(Paragraph(id=neighbour_before, metadata=None))
+                    for neighbour_after in paragraph.related.neighbours_after:
+                        neighbours.append(Paragraph(id=neighbour_after, metadata=None))
+
+            if neighbours:
+                augmentations.append(
+                    ParagraphAugment(
+                        given=neighbours,
+                        select=[
+                            ParagraphText(),
+                            ParagraphPosition(),
+                        ],
+                    )
+                )
+
+    return augmentations
+
+
+def merge_second_augment(item: AugmentRequest, response: AugmentResponse, augmented: Augmented):
+    """Merge in-place augmented data with an existing augment response."""
+
+    if any(
+        (
+            paragraph_augment.neighbours_before or paragraph_augment.neighbours_after
+            for paragraph_augment in item.paragraphs or []
+        )
+    ):
+        # neighbour paragraphs
+
+        new_paragraphs = {}
+        for paragraph_id_str, augmented_paragraph in response.paragraphs.items():
+            before_refs = []
+            for before_id_str in augmented_paragraph.neighbours_before or []:
+                before_id = ParagraphId.from_string(before_id_str)
+
+                if before_id not in augmented.paragraphs:
+                    continue
+                neighbour = augmented.paragraphs[before_id]
+
+                if before_id_str not in response.paragraphs:
+                    if not neighbour.text and not neighbour.position:
+                        continue
+                    # create a new paragraph for the neighbour
+                    new_paragraphs[before_id_str] = AugmentedParagraph(
+                        text=neighbour.text, position=neighbour.position
+                    )
+
+                else:
+                    # merge neighbour with existing paragraph
+                    if not response.paragraphs[before_id_str].text:
+                        response.paragraphs[before_id_str].text = neighbour.text
+
+                before_refs.append(before_id_str)
+
+            after_refs = []
+            for after_id_str in augmented_paragraph.neighbours_after or []:
+                after_id = ParagraphId.from_string(after_id_str)
+
+                if after_id not in augmented.paragraphs:
+                    continue
+                neighbour = augmented.paragraphs[after_id]
+
+                if after_id_str not in response.paragraphs:
+                    if not neighbour.text and not neighbour.position:
+                        continue
+                    # create a new paragraph for the neighbour
+                    new_paragraphs[after_id_str] = AugmentedParagraph(
+                        text=neighbour.text, position=neighbour.position
+                    )
+
+                else:
+                    # merge neighbour with existing paragraph
+                    if not response.paragraphs[after_id_str].text:
+                        response.paragraphs[after_id_str].text = neighbour.text
+
+                after_refs.append(after_id_str)
+
+            # update references to contain only the neighbours that existed in
+            # the response or that we added
+            augmented_paragraph.neighbours_before = before_refs
+            augmented_paragraph.neighbours_after = after_refs
+
+        response.paragraphs.update(new_paragraphs)
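
Usage note: the endpoint above is mounted at POST /v1/kb/{kbid}/augment (hidden from the OpenAPI schema via include_in_schema=False) and requires the READER role. The following client-side sketch is not part of the diff: the payload keys are inferred from parse_first_augments() above, the authoritative schemas are nucliadb_models.augment.AugmentRequest / AugmentResponse, and the base URL, knowledge box id, and id strings are placeholders.

# Hypothetical sketch of calling the new augment endpoint on a standalone
# NucliaDB; payload keys are inferred from the parsing code in the diff above.
import httpx

payload = {
    # Paragraph augments: hydrate paragraph text plus one neighbour on each
    # side (neighbours trigger the second augmentor round trip described above).
    "paragraphs": [
        {
            "given": [{"id": "<rid>/t/text1/0-100"}],  # ParagraphId as string
            "text": True,
            "neighbours_before": 1,
            "neighbours_after": 1,
        }
    ],
    # Field augments: hydrate extracted text for a specific field.
    "fields": [{"given": ["<rid>/t/text1"], "text": True}],  # FieldId as string
}

resp = httpx.post(
    "http://localhost:8080/api/v1/kb/<kbid>/augment",
    json=payload,
    headers={"X-NUCLIADB-ROLES": "READER"},  # standalone-style role header
)
resp.raise_for_status()
# AugmentResponse shape: {"resources": {...}, "fields": {...}, "paragraphs": {...}}
data = resp.json()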
nucliadb/search/api/v1/catalog.py
@@ -19,12 +19,12 @@
 #
 import json
 from time import time
-from typing import Optional, Union
 
 from fastapi import Request, Response
 from fastapi_versioning import version
 from pydantic import ValidationError
 
+from nucliadb.common.catalog import catalog_facets, catalog_search
 from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
 from nucliadb.common.exceptions import InvalidQueryError
 from nucliadb.models.responses import HTTPClientError
@@ -33,7 +33,6 @@ from nucliadb.search.api.v1.router import KB_PREFIX, api
 from nucliadb.search.api.v1.utils import fastapi_query
 from nucliadb.search.search import cache
 from nucliadb.search.search.merge import fetch_resources
-from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
 from nucliadb.search.search.query_parser.parsers import parse_catalog
 from nucliadb.search.search.utils import (
     maybe_log_request_payload,
@@ -75,31 +74,28 @@ async def catalog_get(
     response: Response,
     kbid: str,
     query: str = fastapi_query(SearchParamDefaults.query),
-    filter_expression: Optional[str] = fastapi_query(SearchParamDefaults.catalog_filter_expression),
+    filter_expression: str | None = fastapi_query(SearchParamDefaults.catalog_filter_expression),
     filters: list[str] = fastapi_query(SearchParamDefaults.filters),
     faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
     sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
-    sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
     sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
     page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
     page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
-    with_status: Optional[ResourceProcessingStatus] = fastapi_query(
+    with_status: ResourceProcessingStatus | None = fastapi_query(
         SearchParamDefaults.with_status, deprecated="Use filters instead"
     ),
     debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
-    range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
-    range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
-    range_modification_start: Optional[DateTime] = fastapi_query(
+    range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
+    range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
+    range_modification_start: DateTime | None = fastapi_query(
         SearchParamDefaults.range_modification_start
     ),
-    range_modification_end: Optional[DateTime] = fastapi_query(
-        SearchParamDefaults.range_modification_end
-    ),
-    hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
+    range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
+    hidden: bool | None = fastapi_query(SearchParamDefaults.hidden),
     show: list[ResourceProperties] = fastapi_query(
         SearchParamDefaults.show, default=[ResourceProperties.BASIC, ResourceProperties.ERRORS]
     ),
-) -> Union[CatalogResponse, HTTPClientError]:
+) -> CatalogResponse | HTTPClientError:
     try:
         expr = (
             CatalogFilterExpression.model_validate_json(filter_expression) if filter_expression else None
@@ -125,7 +121,7 @@ async def catalog_get(
         show=show,
     )
     if sort_field:
-        item.sort = SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
+        item.sort = SortOptions(field=sort_field, order=sort_order)
     return await catalog(kbid, item)
 
 
@@ -144,14 +140,14 @@ async def catalog_post(
     request: Request,
     kbid: str,
     item: CatalogRequest,
-) -> Union[CatalogResponse, HTTPClientError]:
+) -> CatalogResponse | HTTPClientError:
     return await catalog(kbid, item)
 
 
 async def catalog(
     kbid: str,
     item: CatalogRequest,
-) -> Union[HTTPClientError, CatalogResponse]:
+) -> HTTPClientError | CatalogResponse:
     """
     Catalog endpoint is a simplified version of the search endpoint, it only
     returns bm25 results on titles and it does not support vector search.
@@ -164,7 +160,7 @@ async def catalog(
            query_parser = await parse_catalog(kbid, item)
 
            catalog_results = CatalogResponse()
-           catalog_results.fulltext = await pgcatalog_search(query_parser)
+           catalog_results.fulltext = await catalog_search(query_parser)
            catalog_results.resources = await fetch_resources(
                resources=[r.rid for r in catalog_results.fulltext.results],
                kbid=kbid,
@@ -205,7 +201,7 @@
 )
 @requires(NucliaDBRoles.READER)
 @version(1)
-async def catalog_facets(
+async def catalog_facets_endpoint(
     request: Request, kbid: str, item: CatalogFacetsRequest
 ) -> CatalogFacetsResponse:
-    return CatalogFacetsResponse(facets=await pgcatalog_facets(kbid, item))
+    return CatalogFacetsResponse(facets=await catalog_facets(kbid, item))
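
Aside from the Optional/Union to PEP 604 (X | None) cleanup and the removal of the deprecated sort_limit parameter, the substantive change here is the move from the Postgres-specific helpers in nucliadb/search/search/pgcatalog.py to the backend-agnostic facade added under nucliadb/common/catalog/ (files 20 to 24 in the list above); the handler is also renamed to catalog_facets_endpoint so it no longer shadows the imported catalog_facets function. A rough sketch of what this migration looks like for any other caller, assuming the facade keeps the call shapes shown in the hunks:

# Before (6.7.x): callers imported the Postgres-backed implementation directly.
# from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search

# After (6.10.x): callers use the facade, which presumably dispatches to a
# concrete backend (common/catalog/pg.py, or common/catalog/dummy.py) behind
# the interface defined in common/catalog/interface.py.
from nucliadb.common.catalog import catalog_facets, catalog_search


async def list_catalog(kbid, query_parser, facets_request):
    fulltext = await catalog_search(query_parser)  # same call shape as before
    facets = await catalog_facets(kbid, facets_request)
    return fulltext, facets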