nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246)
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import json
21
- from typing import Optional, Union
22
21
 
23
22
  from fastapi import Body, Header, Query, Request, Response
24
23
  from fastapi.openapi.models import Example
@@ -46,7 +45,6 @@ from nucliadb_models.search import (
46
45
  KnowledgeboxFindResults,
47
46
  NucliaDBClientType,
48
47
  RankFusionName,
49
- Reranker,
50
48
  RerankerName,
51
49
  ResourceProperties,
52
50
  SearchParamDefaults,
@@ -84,33 +82,31 @@ async def find_knowledgebox(
84
82
  response: Response,
85
83
  kbid: str,
86
84
  query: str = fastapi_query(SearchParamDefaults.query),
87
- filter_expression: Optional[str] = fastapi_query(SearchParamDefaults.filter_expression),
85
+ filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
88
86
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
89
87
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
90
- top_k: Optional[int] = fastapi_query(SearchParamDefaults.top_k),
91
- min_score: Optional[float] = Query(
88
+ top_k: int | None = fastapi_query(SearchParamDefaults.top_k),
89
+ min_score: float | None = Query(
92
90
  default=None,
93
- description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
91
+ description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
94
92
  deprecated=True,
95
93
  ),
96
- min_score_semantic: Optional[float] = Query(
94
+ min_score_semantic: float | None = Query(
97
95
  default=None,
98
- description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
96
+ description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score",
99
97
  ),
100
98
  min_score_bm25: float = Query(
101
99
  default=0,
102
100
  description="Minimum bm25 score to filter paragraph and document index results",
103
101
  ge=0,
104
102
  ),
105
- vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
106
- range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
107
- range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
108
- range_modification_start: Optional[DateTime] = fastapi_query(
103
+ vectorset: str | None = fastapi_query(SearchParamDefaults.vectorset),
104
+ range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
105
+ range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
106
+ range_modification_start: DateTime | None = fastapi_query(
109
107
  SearchParamDefaults.range_modification_start
110
108
  ),
111
- range_modification_end: Optional[DateTime] = fastapi_query(
112
- SearchParamDefaults.range_modification_end
113
- ),
109
+ range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
114
110
  features: list[FindOptions] = fastapi_query(
115
111
  SearchParamDefaults.search_features,
116
112
  default=[
@@ -127,19 +123,18 @@ async def find_knowledgebox(
127
123
  extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
128
124
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
129
125
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
130
- autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
131
126
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
132
127
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
133
128
  rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
134
- reranker: Union[RerankerName, Reranker] = fastapi_query(SearchParamDefaults.reranker),
135
- search_configuration: Optional[str] = Query(
129
+ reranker: RerankerName = fastapi_query(SearchParamDefaults.reranker),
130
+ search_configuration: str | None = Query(
136
131
  default=None,
137
132
  description="Load find parameters from this configuration. Parameters in the request override parameters from the configuration.",
138
133
  ),
139
134
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
140
135
  x_nucliadb_user: str = Header(""),
141
136
  x_forwarded_for: str = Header(""),
142
- ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
137
+ ) -> KnowledgeboxFindResults | HTTPClientError:
143
138
  try:
144
139
  expr = FilterExpression.model_validate_json(filter_expression) if filter_expression else None
145
140
 
@@ -166,7 +161,6 @@ async def find_knowledgebox(
166
161
  extracted=extracted,
167
162
  with_duplicates=with_duplicates,
168
163
  with_synonyms=with_synonyms,
169
- autofilter=autofilter,
170
164
  security=security,
171
165
  show_hidden=show_hidden,
172
166
  rank_fusion=rank_fusion,
@@ -198,7 +192,7 @@ async def find_post_knowledgebox(
198
192
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
199
193
  x_nucliadb_user: str = Header(""),
200
194
  x_forwarded_for: str = Header(""),
201
- ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
195
+ ) -> KnowledgeboxFindResults | HTTPClientError:
202
196
  return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
203
197
 
204
198
 
@@ -209,7 +203,7 @@ async def _find_endpoint(
209
203
  x_ndb_client: NucliaDBClientType,
210
204
  x_nucliadb_user: str,
211
205
  x_forwarded_for: str,
212
- ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
206
+ ) -> KnowledgeboxFindResults | HTTPClientError:
213
207
  if item.search_configuration is not None:
214
208
  search_config = await datamanagers.atomic.search_configurations.get(
215
209
  kbid=kbid, name=item.search_configuration
@@ -0,0 +1,328 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from collections.abc import Awaitable
22
+
23
+ from async_lru import alru_cache
24
+ from fastapi import Request, Response
25
+ from fastapi_versioning import version
26
+
27
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
28
+ from nucliadb.ingest.fields.base import Field
29
+ from nucliadb.search.api.v1.router import KB_PREFIX, api
30
+ from nucliadb.search.search import cache
31
+ from nucliadb.search.search.cache import request_caches
32
+ from nucliadb.search.search.hydrator.fields import hydrate_field, page_preview_id
33
+ from nucliadb.search.search.hydrator.images import (
34
+ download_page_preview,
35
+ )
36
+ from nucliadb.search.search.hydrator.paragraphs import ParagraphIndex, hydrate_paragraph
37
+ from nucliadb.search.search.hydrator.resources import hydrate_resource
38
+ from nucliadb_models.hydration import (
39
+ Hydrated,
40
+ HydratedConversationField,
41
+ HydratedFileField,
42
+ HydratedGenericField,
43
+ HydratedLinkField,
44
+ HydratedParagraph,
45
+ HydratedResource,
46
+ HydratedTextField,
47
+ HydrateRequest,
48
+ Hydration,
49
+ ParagraphHydration,
50
+ )
51
+ from nucliadb_models.resource import NucliaDBRoles
52
+ from nucliadb_models.search import Image
53
+ from nucliadb_utils.authentication import requires
54
+
55
+
56
+ @api.post(
57
+ f"/{KB_PREFIX}/{{kbid}}/hydrate",
58
+ status_code=200,
59
+ summary="Hydrate a set of paragraphs",
60
+ description="Internal API endpoint to hydrate a set of paragraphs",
61
+ include_in_schema=False,
62
+ response_model_exclude_unset=True,
63
+ tags=["Hydration"],
64
+ )
65
+ @requires(NucliaDBRoles.READER)
66
+ @version(1)
67
+ async def hydrate_endpoint(
68
+ request: Request,
69
+ response: Response,
70
+ kbid: str,
71
+ item: HydrateRequest,
72
+ ) -> Hydrated:
73
+ with request_caches():
74
+ return await Hydrator(kbid, item.hydration).hydrate(item.data)
75
+
76
+
77
+ class HydratedBuilder:
78
+ """Builder class to construct a Hydrated payload."""
79
+
80
+ def __init__(self) -> None:
81
+ self._resources: dict[str, HydratedResource] = {}
82
+ self._fields: dict[
83
+ str,
84
+ (
85
+ HydratedTextField
86
+ | HydratedFileField
87
+ | HydratedLinkField
88
+ | HydratedConversationField
89
+ | HydratedGenericField
90
+ ),
91
+ ] = {}
92
+ self._paragraphs: dict[str, HydratedParagraph] = {}
93
+
94
+ @property
95
+ def resources(self) -> dict[str, HydratedResource]:
96
+ return self._resources
97
+
98
+ @property
99
+ def fields(
100
+ self,
101
+ ) -> dict[
102
+ str,
103
+ (
104
+ HydratedTextField
105
+ | HydratedFileField
106
+ | HydratedLinkField
107
+ | HydratedConversationField
108
+ | HydratedGenericField
109
+ ),
110
+ ]:
111
+ return self._fields
112
+
113
+ @property
114
+ def paragraphs(self) -> dict[str, HydratedParagraph]:
115
+ return self._paragraphs
116
+
117
+ def build(self) -> Hydrated:
118
+ return Hydrated(
119
+ resources=self._resources,
120
+ fields=self._fields,
121
+ paragraphs=self._paragraphs,
122
+ )
123
+
124
+ def add_resource(self, rid: str, resource: HydratedResource):
125
+ self._resources[rid] = resource
126
+
127
+ def add_field(
128
+ self,
129
+ field_id: FieldId,
130
+ field: (
131
+ HydratedTextField
132
+ | HydratedFileField
133
+ | HydratedLinkField
134
+ | HydratedConversationField
135
+ | HydratedGenericField
136
+ ),
137
+ ):
138
+ self._fields[field_id.full()] = field
139
+
140
+ def has_field(self, field_id: FieldId) -> bool:
141
+ return field_id.full() in self._fields
142
+
143
+ def add_paragraph(self, paragraph_id: ParagraphId, paragraph: HydratedParagraph):
144
+ self._paragraphs[paragraph_id.full()] = paragraph
145
+
146
+ def add_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
147
+ field_id = paragraph_id.field_id
148
+ field = self._fields[field_id.full()]
149
+
150
+ if not isinstance(field, HydratedFileField):
151
+ # Other field types have no page preview concept
152
+ return
153
+
154
+ if field.previews is None:
155
+ field.previews = {}
156
+
157
+ preview_id = page_preview_id(page)
158
+ field.previews[preview_id] = image
159
+
160
+ paragraph = self._paragraphs[paragraph_id.full()]
161
+ assert paragraph.page is not None, "should already be set"
162
+ paragraph.page.page_preview_ref = preview_id
163
+
164
+ def add_table_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
165
+ field_id = paragraph_id.field_id
166
+ field = self._fields[field_id.full()]
167
+
168
+ if not isinstance(field, HydratedFileField):
169
+ # Other field types have no page preview concept
170
+ return
171
+
172
+ if field.previews is None:
173
+ field.previews = {}
174
+
175
+ preview_id = page_preview_id(page)
176
+ field.previews[preview_id] = image
177
+
178
+ paragraph = self._paragraphs[paragraph_id.full()]
179
+ assert paragraph.table is not None, "should already be set"
180
+ paragraph.table.page_preview_ref = preview_id
181
+
182
+
183
+ class Hydrator:
184
+ def __init__(self, kbid: str, config: Hydration):
185
+ self.kbid = kbid
186
+ self.config = config
187
+ self.hydrated = HydratedBuilder()
188
+
189
+ # cached paragraphs per field
190
+ self.field_paragraphs: dict[FieldId, ParagraphIndex] = {}
191
+
192
+ self.max_ops = asyncio.Semaphore(50)
193
+
194
+ async def hydrate(self, paragraph_ids: list[str]) -> Hydrated:
195
+ paragraph_tasks = {}
196
+ field_tasks = {}
197
+ resource_tasks = {}
198
+
199
+ unique_paragraph_ids = set(paragraph_ids)
200
+ for user_paragraph_id in unique_paragraph_ids:
201
+ try:
202
+ paragraph_id = ParagraphId.from_string(user_paragraph_id)
203
+ except ValueError:
204
+ # skip paragraphs with invalid format
205
+ continue
206
+
207
+ field_id = paragraph_id.field_id
208
+ rid = paragraph_id.rid
209
+
210
+ resource = await cache.get_resource(self.kbid, rid)
211
+ if resource is None:
212
+ # skip resources that aren't in the DB
213
+ continue
214
+
215
+ field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
216
+ if not (await resource.field_exists(field_type_pb, field_id.key)):
217
+ # skip fields that aren't in the DB
218
+ continue
219
+ field = await resource.get_field(field_id.key, field_id.pb_type)
220
+
221
+ if field_id not in self.field_paragraphs:
222
+ field_paragraphs_index = ParagraphIndex(field_id)
223
+ self.field_paragraphs[field_id] = field_paragraphs_index
224
+ field_paragraphs_index = self.field_paragraphs[field_id]
225
+
226
+ paragraph_tasks[paragraph_id] = asyncio.create_task(
227
+ self._limited_concurrency(
228
+ hydrate_paragraph(
229
+ resource, field, paragraph_id, self.config.paragraph, field_paragraphs_index
230
+ ),
231
+ )
232
+ )
233
+
234
+ if field_id not in field_tasks:
235
+ field_tasks[field_id] = asyncio.create_task(
236
+ self._limited_concurrency(hydrate_field(field, field_id, self.config.field))
237
+ )
238
+
239
+ if rid not in resource_tasks:
240
+ if self.config.resource is not None:
241
+ resource_tasks[rid] = asyncio.create_task(
242
+ self._limited_concurrency(hydrate_resource(resource, rid, self.config.resource))
243
+ )
244
+
245
+ ops = [
246
+ *paragraph_tasks.values(),
247
+ *field_tasks.values(),
248
+ *resource_tasks.values(),
249
+ ]
250
+ results = await asyncio.gather(*ops)
251
+ hydrated_paragraphs = results[: len(paragraph_tasks)]
252
+ hydrated_fields = results[len(paragraph_tasks) : len(paragraph_tasks) + len(field_tasks)]
253
+ hydrated_resources = results[
254
+ len(paragraph_tasks) + len(field_tasks) : len(paragraph_tasks)
255
+ + len(field_tasks)
256
+ + len(resource_tasks)
257
+ ]
258
+
259
+ for rid, hydrated_resource in zip(resource_tasks.keys(), hydrated_resources):
260
+ self.hydrated.add_resource(rid, hydrated_resource)
261
+
262
+ for field_id, hydrated_field in zip(field_tasks.keys(), hydrated_fields):
263
+ if hydrated_field is not None:
264
+ self.hydrated.add_field(field_id, hydrated_field)
265
+
266
+ for paragraph_id, (hydrated_paragraph, extra) in zip(
267
+ paragraph_tasks.keys(), hydrated_paragraphs
268
+ ):
269
+ self.hydrated.add_paragraph(paragraph_id, hydrated_paragraph)
270
+
271
+ for related_paragraph_id in extra.related_paragraph_ids:
272
+ field_id = related_paragraph_id.field_id
273
+ rid = related_paragraph_id.rid
274
+
275
+ resource = await cache.get_resource(self.kbid, rid)
276
+ if resource is None:
277
+ # skip resources that aren't in the DB
278
+ continue
279
+
280
+ field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
281
+ if not (await resource.field_exists(field_type_pb, field_id.key)):
282
+ # skip fields that aren't in the DB
283
+ continue
284
+ field = await resource.get_field(field_id.key, field_id.pb_type)
285
+
286
+ if field_id not in self.field_paragraphs:
287
+ field_paragraphs_index = ParagraphIndex(field_id)
288
+ self.field_paragraphs[field_id] = field_paragraphs_index
289
+ field_paragraphs_index = self.field_paragraphs[field_id]
290
+
291
+ (hydrated_paragraph, _) = await hydrate_paragraph(
292
+ resource,
293
+ field,
294
+ related_paragraph_id,
295
+ ParagraphHydration(
296
+ text=self.config.paragraph.text, image=None, table=None, page=None, related=None
297
+ ),
298
+ field_paragraphs_index,
299
+ )
300
+ self.hydrated.add_paragraph(related_paragraph_id, hydrated_paragraph)
301
+
302
+ if self.hydrated.has_field(field_id):
303
+ # we only hydrate page and table previews for fields whose hydration
304
+ # the user allowed, skipping fields with explicitly disabled
305
+ # hydration
306
+
307
+ if extra.field_page is not None:
308
+ page_number = extra.field_page
309
+ preview = await self.cached_download_page_preview(field, page_number)
310
+ if preview is not None:
311
+ self.hydrated.add_page_preview(paragraph_id, page_number, preview)
312
+
313
+ if extra.field_table_page is not None:
314
+ page_number = extra.field_table_page
315
+ preview = await self.cached_download_page_preview(field, page_number)
316
+ if preview is not None:
317
+ self.hydrated.add_table_page_preview(paragraph_id, page_number, preview)
318
+
319
+ return self.hydrated.build()
320
+
321
+ # TODO: proper typing
322
+ async def _limited_concurrency(self, aw: Awaitable):
323
+ async with self.max_ops:
324
+ return await aw
325
+
326
+ @alru_cache(maxsize=50)
327
+ async def cached_download_page_preview(self, field: Field, page: int) -> Image | None:
328
+ return await download_page_preview(field, page)
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
- from typing import Optional
22
21
 
23
22
  from fastapi import HTTPException, Request
24
23
  from fastapi_versioning import version
@@ -178,7 +177,7 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
178
177
  )
179
178
 
180
179
  try:
181
- results: Optional[list[Shard]] = await asyncio.wait_for(
180
+ results: list[Shard] | None = await asyncio.wait_for(
182
181
  asyncio.gather(*ops, return_exceptions=True), # type: ignore
183
182
  timeout=settings.search_timeout,
184
183
  )
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import json
21
- from typing import Union
22
21
 
23
22
  from fastapi import Header, Request
24
23
  from fastapi.responses import Response, StreamingResponse
@@ -68,7 +67,7 @@ async def predict_proxy_endpoint(
68
67
  x_nucliadb_user: str = Header(""),
69
68
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
70
69
  x_forwarded_for: str = Header(""),
71
- ) -> Union[Response, StreamingResponse, HTTPClientError]:
70
+ ) -> Response | StreamingResponse | HTTPClientError:
72
71
  try:
73
72
  payload = await request.json()
74
73
  except json.JSONDecodeError:
@@ -17,18 +17,19 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Union
20
+ from uuid import UUID
21
21
 
22
22
  from fastapi import Header, Request, Response
23
23
  from fastapi_versioning import version
24
24
  from starlette.responses import StreamingResponse
25
25
 
26
+ from nucliadb.common import datamanagers
26
27
  from nucliadb.models.responses import HTTPClientError
27
- from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
28
28
  from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
29
29
  from nucliadb_models.resource import NucliaDBRoles
30
30
  from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
31
- from nucliadb_utils.authentication import requires
31
+ from nucliadb_models.security import RequestSecurity
32
+ from nucliadb_utils.authentication import NucliaUser, requires
32
33
 
33
34
  from ..ask import create_ask_response
34
35
 
@@ -46,7 +47,7 @@ from ..ask import create_ask_response
46
47
  async def resource_ask_endpoint_by_uuid(
47
48
  request: Request,
48
49
  kbid: str,
49
- rid: str,
50
+ rid: UUID,
50
51
  item: AskRequest,
51
52
  x_show_consumption: bool = Header(default=False),
52
53
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
@@ -57,7 +58,16 @@ async def resource_ask_endpoint_by_uuid(
57
58
  description="When set to true, outputs response as JSON in a non-streaming way. "
58
59
  "This is slower and requires waiting for entire answer to be ready.",
59
60
  ),
60
- ) -> Union[StreamingResponse, HTTPClientError, Response]:
61
+ ) -> StreamingResponse | HTTPClientError | Response:
62
+ current_user: NucliaUser = request.user
63
+ # If present, security groups from AuthorizationBackend overrides any
64
+ # security group of the payload
65
+ if current_user.security_groups:
66
+ if item.security is None:
67
+ item.security = RequestSecurity(groups=current_user.security_groups)
68
+ else:
69
+ item.security.groups = current_user.security_groups
70
+
61
71
  return await create_ask_response(
62
72
  kbid=kbid,
63
73
  ask_request=item,
@@ -65,7 +75,7 @@ async def resource_ask_endpoint_by_uuid(
65
75
  client_type=x_ndb_client,
66
76
  origin=x_forwarded_for,
67
77
  x_synchronous=x_synchronous,
68
- resource=rid,
78
+ resource=str(rid),
69
79
  extra_predict_headers={"X-Show-Consumption": str(x_show_consumption).lower()},
70
80
  )
71
81
 
@@ -94,10 +104,20 @@ async def resource_ask_endpoint_by_slug(
94
104
  description="When set to true, outputs response as JSON in a non-streaming way. "
95
105
  "This is slower and requires waiting for entire answer to be ready.",
96
106
  ),
97
- ) -> Union[StreamingResponse, HTTPClientError, Response]:
98
- resource_id = await get_resource_uuid_by_slug(kbid, slug)
107
+ ) -> StreamingResponse | HTTPClientError | Response:
108
+ resource_id = await datamanagers.atomic.resources.get_resource_uuid_from_slug(kbid=kbid, slug=slug)
99
109
  if resource_id is None:
100
110
  return HTTPClientError(status_code=404, detail="Resource not found")
111
+
112
+ current_user: NucliaUser = request.user
113
+ # If present, security groups from AuthorizationBackend overrides any
114
+ # security group of the payload
115
+ if current_user.security_groups:
116
+ if item.security is None:
117
+ item.security = RequestSecurity(groups=current_user.security_groups)
118
+ else:
119
+ item.security.groups = current_user.security_groups
120
+
101
121
  return await create_ask_response(
102
122
  kbid=kbid,
103
123
  ask_request=item,
@@ -17,14 +17,13 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Union
21
20
 
22
21
  from fastapi import Header, Request, Response
23
22
  from fastapi_versioning import version
24
23
 
24
+ from nucliadb.common import datamanagers
25
25
  from nucliadb.common.models_utils import from_proto
26
26
  from nucliadb.models.responses import HTTPClientError
27
- from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
28
27
  from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RESOURCE_SLUG_PREFIX, api
29
28
  from nucliadb.search.predict_models import AugmentedField, RunAgentsResponse
30
29
  from nucliadb.search.search.exceptions import ResourceNotFoundError
@@ -58,7 +57,7 @@ async def run_agents_by_uuid(
58
57
  rid: str,
59
58
  item: ResourceAgentsRequest,
60
59
  x_nucliadb_user: str = Header(""),
61
- ) -> Union[ResourceAgentsResponse, HTTPClientError]:
60
+ ) -> ResourceAgentsResponse | HTTPClientError:
62
61
  return await _run_agents_endpoint(kbid, rid, x_nucliadb_user, item)
63
62
 
64
63
 
@@ -80,8 +79,8 @@ async def run_agents_by_slug(
80
79
  slug: str,
81
80
  item: ResourceAgentsRequest,
82
81
  x_nucliadb_user: str = Header(""),
83
- ) -> Union[ResourceAgentsResponse, HTTPClientError]:
84
- resource_id = await get_resource_uuid_by_slug(kbid, slug)
82
+ ) -> ResourceAgentsResponse | HTTPClientError:
83
+ resource_id = await datamanagers.atomic.resources.get_resource_uuid_from_slug(kbid=kbid, slug=slug)
85
84
  if resource_id is None:
86
85
  return HTTPClientError(status_code=404, detail="Resource not found")
87
86
  return await _run_agents_endpoint(kbid, resource_id, x_nucliadb_user, item)
@@ -89,7 +88,7 @@ async def run_agents_by_slug(
89
88
 
90
89
  async def _run_agents_endpoint(
91
90
  kbid: str, resource_id: str, user_id: str, item: ResourceAgentsRequest
92
- ) -> Union[ResourceAgentsResponse, HTTPClientError]:
91
+ ) -> ResourceAgentsResponse | HTTPClientError:
93
92
  try:
94
93
  run_agents_response: RunAgentsResponse = await run_agents(
95
94
  kbid,
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import json
21
- from typing import Optional, Union, cast
21
+ from typing import cast
22
22
 
23
23
  from fastapi import Header, Request, Response
24
24
  from fastapi_versioning import version
@@ -62,25 +62,23 @@ async def resource_search(
62
62
  kbid: str,
63
63
  query: str,
64
64
  rid: str,
65
- filter_expression: Optional[str] = fastapi_query(SearchParamDefaults.filter_expression),
65
+ filter_expression: str | None = fastapi_query(SearchParamDefaults.filter_expression),
66
66
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
67
67
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
68
68
  faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
69
- sort: Optional[SortField] = fastapi_query(SearchParamDefaults.sort_field, alias="sort_field"),
69
+ sort: SortField | None = fastapi_query(SearchParamDefaults.sort_field, alias="sort_field"),
70
70
  sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
71
- top_k: Optional[int] = fastapi_query(SearchParamDefaults.top_k),
72
- range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
73
- range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
74
- range_modification_start: Optional[DateTime] = fastapi_query(
71
+ top_k: int | None = fastapi_query(SearchParamDefaults.top_k),
72
+ range_creation_start: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_start),
73
+ range_creation_end: DateTime | None = fastapi_query(SearchParamDefaults.range_creation_end),
74
+ range_modification_start: DateTime | None = fastapi_query(
75
75
  SearchParamDefaults.range_modification_start
76
76
  ),
77
- range_modification_end: Optional[DateTime] = fastapi_query(
78
- SearchParamDefaults.range_modification_end
79
- ),
77
+ range_modification_end: DateTime | None = fastapi_query(SearchParamDefaults.range_modification_end),
80
78
  highlight: bool = fastapi_query(SearchParamDefaults.highlight),
81
79
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
82
80
  debug: bool = fastapi_query(SearchParamDefaults.debug),
83
- ) -> Union[ResourceSearchResults, HTTPClientError]:
81
+ ) -> ResourceSearchResults | HTTPClientError:
84
82
  top_k = top_k or SearchParamDefaults.top_k # type: ignore
85
83
  top_k = cast(int, top_k)
86
84