nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,7 @@
19
19
  #
20
20
  from typing import Dict
21
21
 
22
- from fastapi import Request
22
+ from fastapi import Header, Request
23
23
  from fastapi_versioning import version
24
24
  from nuclia_models.config.proto import ExtractConfig, SplitConfiguration
25
25
 
@@ -60,15 +60,11 @@ async def download_model(
60
60
  )
61
61
  @requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
62
62
  @version(1)
63
- async def get_configuration(
64
- request: Request,
65
- kbid: str,
66
- ):
63
+ async def get_configuration(request: Request, kbid: str):
67
64
  return await learning_config_proxy(
68
65
  request,
69
66
  "GET",
70
67
  f"/config/{kbid}",
71
- extra_headers={"X-STF-USER": request.headers.get("X-NUCLIADB-USER", "")},
72
68
  )
73
69
 
74
70
 
@@ -108,7 +104,6 @@ async def get_model(
108
104
  request,
109
105
  "GET",
110
106
  f"/models/{kbid}/model/{model_id}",
111
- extra_headers={"X-STF-USER": request.headers.get("X-NUCLIADB-USER", "")},
112
107
  )
113
108
 
114
109
 
@@ -123,10 +118,35 @@ async def get_model(
123
118
  @requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
124
119
  @version(1)
125
120
  async def get_schema_for_configuration_updates(
126
- request: Request,
127
- kbid: str,
121
+ request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
128
122
  ):
129
- return await learning_config_proxy(request, "GET", f"/schema/{kbid}")
123
+ return await learning_config_proxy(
124
+ request,
125
+ "GET",
126
+ f"/schema/{kbid}",
127
+ headers={"account-id": x_nucliadb_account},
128
+ )
129
+
130
+
131
@api.get(
    path=f"/{KB_PREFIX}/{{kbid}}/generative_providers",
    status_code=200,
    summary="Available models for a knowledge box",
    description="Get all available models for a knowledge box grouped by provider",
    response_model=None,
    tags=["Models"],
)
@requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
@version(1)
async def get_models_group_by_providers(
    request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
):
    """Proxy a GET for the generative providers of a knowledge box.

    The request is forwarded through ``learning_config_proxy`` to
    ``/generative_providers/{kbid}``. The value of the
    ``x_nucliadb_account`` header parameter (empty string when the header
    is absent) is passed along as the ``account-id`` header.
    """
    upstream_path = f"/generative_providers/{kbid}"
    account_headers = {"account-id": x_nucliadb_account}
    return await learning_config_proxy(
        request,
        "GET",
        upstream_path,
        headers=account_headers,
    )
130
150
 
131
151
 
132
152
  @api.get(
@@ -77,7 +77,7 @@ async def list_resources(
77
77
 
78
78
  # Get counters from maindb
79
79
  driver = get_driver()
80
- async with driver.transaction(read_only=True) as txn:
80
+ async with driver.ro_transaction() as txn:
81
81
  # Filter parameters for serializer
82
82
  show: list[ResourceProperties] = [ResourceProperties.BASIC]
83
83
  field_types: list[FieldTypeName] = []
@@ -335,7 +335,7 @@ async def _get_resource_field(
335
335
  storage = await get_storage(service_name=SERVICE_NAME)
336
336
  driver = get_driver()
337
337
  pb_field_id = to_proto.field_type_name(field_type)
338
- async with driver.transaction(read_only=True) as txn:
338
+ async with driver.ro_transaction() as txn:
339
339
  kb = ORMKnowledgeBox(txn, storage, kbid)
340
340
 
341
341
  if rid is None:
@@ -287,7 +287,7 @@ async def processing_status(
287
287
  storage = await get_storage(service_name=SERVICE_NAME)
288
288
  driver = get_driver()
289
289
 
290
- async with driver.transaction(read_only=True) as txn:
290
+ async with driver.ro_transaction() as txn:
291
291
  kb = KnowledgeBox(txn, storage, kbid)
292
292
 
293
293
  max_simultaneous = asyncio.Semaphore(10)
@@ -201,7 +201,7 @@ async def get_resource_title_cached(
201
201
 
202
202
 
203
203
  async def get_resource_title(kv_driver: Driver, kbid: str, resource_uuid: str) -> Optional[str]:
204
- async with kv_driver.transaction(read_only=True) as txn:
204
+ async with kv_driver.ro_transaction() as txn:
205
205
  basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=resource_uuid)
206
206
  if basic is None:
207
207
  return None
@@ -23,6 +23,7 @@ from . import ( # noqa: F401
23
23
  feedback,
24
24
  find,
25
25
  graph,
26
+ hydrate,
26
27
  knowledgebox,
27
28
  predict_proxy,
28
29
  search,
@@ -25,6 +25,7 @@ from fastapi import Request, Response
25
25
  from fastapi_versioning import version
26
26
  from pydantic import ValidationError
27
27
 
28
+ from nucliadb.common.catalog import catalog_facets, catalog_search
28
29
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
29
30
  from nucliadb.common.exceptions import InvalidQueryError
30
31
  from nucliadb.models.responses import HTTPClientError
@@ -33,7 +34,6 @@ from nucliadb.search.api.v1.router import KB_PREFIX, api
33
34
  from nucliadb.search.api.v1.utils import fastapi_query
34
35
  from nucliadb.search.search import cache
35
36
  from nucliadb.search.search.merge import fetch_resources
36
- from nucliadb.search.search.pgcatalog import pgcatalog_facets, pgcatalog_search
37
37
  from nucliadb.search.search.query_parser.parsers import parse_catalog
38
38
  from nucliadb.search.search.utils import (
39
39
  maybe_log_request_payload,
@@ -164,7 +164,7 @@ async def catalog(
164
164
  query_parser = await parse_catalog(kbid, item)
165
165
 
166
166
  catalog_results = CatalogResponse()
167
- catalog_results.fulltext = await pgcatalog_search(query_parser)
167
+ catalog_results.fulltext = await catalog_search(query_parser)
168
168
  catalog_results.resources = await fetch_resources(
169
169
  resources=[r.rid for r in catalog_results.fulltext.results],
170
170
  kbid=kbid,
@@ -205,7 +205,7 @@ async def catalog(
205
205
  )
206
206
  @requires(NucliaDBRoles.READER)
207
207
  @version(1)
208
- async def catalog_facets(
208
+ async def catalog_facets_endpoint(
209
209
  request: Request, kbid: str, item: CatalogFacetsRequest
210
210
  ) -> CatalogFacetsResponse:
211
- return CatalogFacetsResponse(facets=await pgcatalog_facets(kbid, item))
211
+ return CatalogFacetsResponse(facets=await catalog_facets(kbid, item))
@@ -46,7 +46,6 @@ from nucliadb_models.search import (
46
46
  KnowledgeboxFindResults,
47
47
  NucliaDBClientType,
48
48
  RankFusionName,
49
- Reranker,
50
49
  RerankerName,
51
50
  ResourceProperties,
52
51
  SearchParamDefaults,
@@ -127,11 +126,10 @@ async def find_knowledgebox(
127
126
  extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
128
127
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
129
128
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
130
- autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
131
129
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
132
130
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
133
131
  rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
134
- reranker: Union[RerankerName, Reranker] = fastapi_query(SearchParamDefaults.reranker),
132
+ reranker: RerankerName = fastapi_query(SearchParamDefaults.reranker),
135
133
  search_configuration: Optional[str] = Query(
136
134
  default=None,
137
135
  description="Load find parameters from this configuration. Parameters in the request override parameters from the configuration.",
@@ -166,7 +164,6 @@ async def find_knowledgebox(
166
164
  extracted=extracted,
167
165
  with_duplicates=with_duplicates,
168
166
  with_synonyms=with_synonyms,
169
- autofilter=autofilter,
170
167
  security=security,
171
168
  show_hidden=show_hidden,
172
169
  rank_fusion=rank_fusion,
@@ -0,0 +1,328 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ from typing import Awaitable, Optional, Union
22
+
23
+ from async_lru import alru_cache
24
+ from fastapi import Request, Response
25
+ from fastapi_versioning import version
26
+
27
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
28
+ from nucliadb.ingest.fields.base import Field
29
+ from nucliadb.search.api.v1.router import KB_PREFIX, api
30
+ from nucliadb.search.search import cache
31
+ from nucliadb.search.search.cache import request_caches
32
+ from nucliadb.search.search.hydrator.fields import hydrate_field, page_preview_id
33
+ from nucliadb.search.search.hydrator.images import (
34
+ download_page_preview,
35
+ )
36
+ from nucliadb.search.search.hydrator.paragraphs import ParagraphIndex, hydrate_paragraph
37
+ from nucliadb.search.search.hydrator.resources import hydrate_resource
38
+ from nucliadb_models.hydration import (
39
+ Hydrated,
40
+ HydratedConversationField,
41
+ HydratedFileField,
42
+ HydratedGenericField,
43
+ HydratedLinkField,
44
+ HydratedParagraph,
45
+ HydratedResource,
46
+ HydratedTextField,
47
+ HydrateRequest,
48
+ Hydration,
49
+ ParagraphHydration,
50
+ )
51
+ from nucliadb_models.resource import NucliaDBRoles
52
+ from nucliadb_models.search import Image
53
+ from nucliadb_utils.authentication import requires
54
+
55
+
56
@api.post(
    f"/{KB_PREFIX}/{{kbid}}/hydrate",
    status_code=200,
    summary="Hydrate a set of paragraphs",
    description="Internal API endpoint to hydrate a set of paragraphs",
    include_in_schema=False,
    response_model_exclude_unset=True,
    tags=["Hydration"],
)
@requires(NucliaDBRoles.READER)
@version(1)
async def hydrate_endpoint(
    request: Request,
    response: Response,
    kbid: str,
    item: HydrateRequest,
) -> Hydrated:
    """Hydrate the paragraphs listed in ``item.data`` for knowledge box ``kbid``.

    A fresh ``Hydrator`` configured with ``item.hydration`` is created for
    every call and run inside the ``request_caches()`` context
    (presumably scoping the search caches to this request -- confirm in
    ``nucliadb.search.search.cache``).
    """
    with request_caches():
        hydrator = Hydrator(kbid, item.hydration)
        hydrated = await hydrator.hydrate(item.data)
        return hydrated
75
+
76
+
77
# Any of the hydrated field payloads the builder can hold.
_AnyHydratedField = Union[
    HydratedTextField,
    HydratedFileField,
    HydratedLinkField,
    HydratedConversationField,
    HydratedGenericField,
]


class HydratedBuilder:
    """Builder class to construct a Hydrated payload.

    Resources are stored by resource id, fields by ``FieldId.full()`` and
    paragraphs by ``ParagraphId.full()``. ``build()`` wraps the three
    mappings into a ``Hydrated`` model.
    """

    def __init__(self) -> None:
        self._resources: dict[str, HydratedResource] = {}
        self._fields: dict[str, _AnyHydratedField] = {}
        self._paragraphs: dict[str, HydratedParagraph] = {}

    @property
    def resources(self) -> dict[str, HydratedResource]:
        return self._resources

    @property
    def fields(self) -> dict[str, _AnyHydratedField]:
        return self._fields

    @property
    def paragraphs(self) -> dict[str, HydratedParagraph]:
        return self._paragraphs

    def build(self) -> Hydrated:
        """Return the ``Hydrated`` payload made of everything added so far."""
        return Hydrated(
            resources=self._resources,
            fields=self._fields,
            paragraphs=self._paragraphs,
        )

    def add_resource(self, rid: str, resource: HydratedResource):
        self._resources[rid] = resource

    def add_field(self, field_id: FieldId, field: _AnyHydratedField):
        self._fields[field_id.full()] = field

    def has_field(self, field_id: FieldId) -> bool:
        return field_id.full() in self._fields

    def add_paragraph(self, paragraph_id: ParagraphId, paragraph: HydratedParagraph):
        self._paragraphs[paragraph_id.full()] = paragraph

    def _register_preview(self, paragraph_id: ParagraphId, page: int, image: Image) -> Optional[str]:
        """Store ``image`` as a page preview of the paragraph's field.

        Returns the preview id under which the image was stored, or None
        when the field is not a file field (nothing is stored then).
        Raises KeyError if the paragraph's field was never added.
        """
        field = self._fields[paragraph_id.field_id.full()]

        if not isinstance(field, HydratedFileField):
            # Other field types have no page preview concept
            return None

        if field.previews is None:
            field.previews = {}

        preview_id = page_preview_id(page)
        field.previews[preview_id] = image
        return preview_id

    def add_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
        """Attach a page preview to the field and reference it from the paragraph's page."""
        preview_id = self._register_preview(paragraph_id, page, image)
        if preview_id is None:
            return

        paragraph = self._paragraphs[paragraph_id.full()]
        assert paragraph.page is not None, "should already be set"
        paragraph.page.page_preview_ref = preview_id

    def add_table_page_preview(self, paragraph_id: ParagraphId, page: int, image: Image):
        """Attach a page preview to the field and reference it from the paragraph's table."""
        preview_id = self._register_preview(paragraph_id, page, image)
        if preview_id is None:
            return

        paragraph = self._paragraphs[paragraph_id.full()]
        assert paragraph.table is not None, "should already be set"
        paragraph.table.page_preview_ref = preview_id
181
+
182
+
183
class Hydrator:
    """Hydrate a list of paragraph ids of one knowledge box.

    For every valid paragraph id the paragraph itself, its field and
    (when ``config.resource`` is set) its resource are hydrated and
    collected in a ``HydratedBuilder``. An instance keeps per-call state
    (builder, paragraph indexes, preview cache).
    """

    def __init__(self, kbid: str, config: Hydration):
        self.kbid = kbid
        self.config = config
        self.hydrated = HydratedBuilder()

        # cached paragraphs per field
        self.field_paragraphs: dict[FieldId, ParagraphIndex] = {}

        # page previews already downloaded by this instance, keyed by
        # (field, page). Kept on the instance (instead of a class-level
        # unbounded cache keyed on ``self``) so it is released together
        # with the hydrator.
        self._page_previews: dict[tuple[Field, int], Optional[Image]] = {}

        # upper bound of hydration operations running at the same time
        self.max_ops = asyncio.Semaphore(50)

    async def hydrate(self, paragraph_ids: list[str]) -> Hydrated:
        """Hydrate ``paragraph_ids`` and return the resulting payload.

        Ids with an invalid format, and ids whose resource or field is
        not found, are silently skipped. Duplicated ids are hydrated once.
        """
        paragraph_tasks = {}
        field_tasks = {}
        resource_tasks = {}
        # Field object owning each requested paragraph; needed afterwards
        # to download previews from the right field.
        paragraph_fields: dict[ParagraphId, Field] = {}

        for user_paragraph_id in set(paragraph_ids):
            try:
                paragraph_id = ParagraphId.from_string(user_paragraph_id)
            except ValueError:
                # skip paragraphs with invalid format
                continue

            field_id = paragraph_id.field_id
            rid = paragraph_id.rid

            located = await self._load_field(rid, field_id)
            if located is None:
                # skip resources/fields that aren't in the DB
                continue
            resource, field = located
            paragraph_fields[paragraph_id] = field

            paragraph_tasks[paragraph_id] = asyncio.create_task(
                self._limited_concurrency(
                    hydrate_paragraph(
                        resource,
                        field,
                        paragraph_id,
                        self.config.paragraph,
                        self._paragraph_index(field_id),
                    ),
                )
            )

            if field_id not in field_tasks:
                field_tasks[field_id] = asyncio.create_task(
                    self._limited_concurrency(hydrate_field(resource, field_id, self.config.field))
                )

            if rid not in resource_tasks and self.config.resource is not None:
                resource_tasks[rid] = asyncio.create_task(
                    self._limited_concurrency(hydrate_resource(resource, rid, self.config.resource))
                )

        # gather() keeps argument order, so results can be split back by count
        results = await asyncio.gather(
            *paragraph_tasks.values(),
            *field_tasks.values(),
            *resource_tasks.values(),
        )
        fields_start = len(paragraph_tasks)
        resources_start = fields_start + len(field_tasks)
        hydrated_paragraphs = results[:fields_start]
        hydrated_fields = results[fields_start:resources_start]
        hydrated_resources = results[resources_start:]

        for rid, hydrated_resource in zip(resource_tasks, hydrated_resources):
            self.hydrated.add_resource(rid, hydrated_resource)

        for field_id, hydrated_field in zip(field_tasks, hydrated_fields):
            if hydrated_field is not None:
                self.hydrated.add_field(field_id, hydrated_field)

        for paragraph_id, (hydrated_paragraph, extra) in zip(paragraph_tasks, hydrated_paragraphs):
            self.hydrated.add_paragraph(paragraph_id, hydrated_paragraph)
            await self._hydrate_related_paragraphs(extra.related_paragraph_ids, paragraph_tasks)

            # Use the field of *this* paragraph. Reusing whatever
            # ``field_id``/``field`` an earlier loop left behind would check
            # and download previews from an unrelated field.
            field_id = paragraph_id.field_id
            if not self.hydrated.has_field(field_id):
                # we only hydrate page and table previews for fields the user
                # allowed hydration, skipping fields with explicitly disabled
                # hydration
                continue
            field = paragraph_fields[paragraph_id]

            if extra.field_page is not None:
                page_number = extra.field_page
                preview = await self.cached_download_page_preview(field, page_number)
                if preview is not None:
                    self.hydrated.add_page_preview(paragraph_id, page_number, preview)

            if extra.field_table_page is not None:
                page_number = extra.field_table_page
                preview = await self.cached_download_page_preview(field, page_number)
                if preview is not None:
                    self.hydrated.add_table_page_preview(paragraph_id, page_number, preview)

        return self.hydrated.build()

    async def _hydrate_related_paragraphs(self, related_paragraph_ids, requested) -> None:
        """Add a text-only hydration of every related paragraph to the builder.

        Paragraphs present in ``requested`` are skipped: they get a full
        hydration of their own, which the reduced one must not overwrite.
        """
        for related_paragraph_id in related_paragraph_ids:
            if related_paragraph_id in requested:
                continue

            related_field_id = related_paragraph_id.field_id
            located = await self._load_field(related_paragraph_id.rid, related_field_id)
            if located is None:
                # skip resources/fields that aren't in the DB
                continue
            resource, field = located

            (related_paragraph, _) = await hydrate_paragraph(
                resource,
                field,
                related_paragraph_id,
                ParagraphHydration(
                    text=self.config.paragraph.text, image=None, table=None, page=None, related=None
                ),
                self._paragraph_index(related_field_id),
            )
            self.hydrated.add_paragraph(related_paragraph_id, related_paragraph)

    async def _load_field(self, rid: str, field_id: FieldId):
        """Return ``(resource, field)`` for ``field_id``, or None if either is missing."""
        resource = await cache.get_resource(self.kbid, rid)
        if resource is None:
            return None

        field_type_pb = FIELD_TYPE_STR_TO_PB[field_id.type]
        if not (await resource.field_exists(field_type_pb, field_id.key)):
            return None

        field = await resource.get_field(field_id.key, field_id.pb_type)
        return resource, field

    def _paragraph_index(self, field_id: FieldId) -> ParagraphIndex:
        """Return the ``ParagraphIndex`` of ``field_id``, creating it on first use."""
        index = self.field_paragraphs.get(field_id)
        if index is None:
            index = ParagraphIndex(field_id)
            self.field_paragraphs[field_id] = index
        return index

    # TODO: proper typing
    async def _limited_concurrency(self, aw: Awaitable):
        """Await ``aw`` while holding a slot of the ``max_ops`` semaphore."""
        async with self.max_ops:
            return await aw

    async def cached_download_page_preview(self, field: Field, page: int) -> Optional[Image]:
        """Download the preview of ``page`` of ``field`` at most once per hydrator.

        The result (including None) is remembered for the lifetime of this
        instance; a failed download raises and is not remembered.
        """
        key = (field, page)
        if key not in self._page_previews:
            self._page_previews[key] = await download_page_preview(field, page)
        return self._page_previews[key]
@@ -28,7 +28,8 @@ from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
28
28
  from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
29
29
  from nucliadb_models.resource import NucliaDBRoles
30
30
  from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
31
- from nucliadb_utils.authentication import requires
31
+ from nucliadb_models.security import RequestSecurity
32
+ from nucliadb_utils.authentication import NucliaUser, requires
32
33
 
33
34
  from ..ask import create_ask_response
34
35
 
@@ -58,6 +59,15 @@ async def resource_ask_endpoint_by_uuid(
58
59
  "This is slower and requires waiting for entire answer to be ready.",
59
60
  ),
60
61
  ) -> Union[StreamingResponse, HTTPClientError, Response]:
62
+ current_user: NucliaUser = request.user
63
+ # If present, security groups from AuthorizationBackend overrides any
64
+ # security group of the payload
65
+ if current_user.security_groups:
66
+ if item.security is None:
67
+ item.security = RequestSecurity(groups=current_user.security_groups)
68
+ else:
69
+ item.security.groups = current_user.security_groups
70
+
61
71
  return await create_ask_response(
62
72
  kbid=kbid,
63
73
  ask_request=item,
@@ -98,6 +108,16 @@ async def resource_ask_endpoint_by_slug(
98
108
  resource_id = await get_resource_uuid_by_slug(kbid, slug)
99
109
  if resource_id is None:
100
110
  return HTTPClientError(status_code=404, detail="Resource not found")
111
+
112
+ current_user: NucliaUser = request.user
113
+ # If present, security groups from AuthorizationBackend overrides any
114
+ # security group of the payload
115
+ if current_user.security_groups:
116
+ if item.security is None:
117
+ item.security = RequestSecurity(groups=current_user.security_groups)
118
+ else:
119
+ item.security.groups = current_user.security_groups
120
+
101
121
  return await create_ask_response(
102
122
  kbid=kbid,
103
123
  ask_request=item,
@@ -148,7 +148,6 @@ async def search_knowledgebox(
148
148
  extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
149
149
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
150
150
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
151
- autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
152
151
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
153
152
  show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
154
153
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
@@ -187,7 +186,6 @@ async def search_knowledgebox(
187
186
  extracted=extracted,
188
187
  with_duplicates=with_duplicates,
189
188
  with_synonyms=with_synonyms,
190
- autofilter=autofilter,
191
189
  security=security,
192
190
  show_hidden=show_hidden,
193
191
  )
@@ -262,7 +260,7 @@ async def search(
262
260
  start_time = time()
263
261
 
264
262
  parsed = await parse_search(kbid, item)
265
- pb_query, incomplete_results, autofilters, _ = await legacy_convert_retrieval_to_proto(parsed)
263
+ pb_query, incomplete_results, _ = await legacy_convert_retrieval_to_proto(parsed)
266
264
 
267
265
  # We need to query all nodes
268
266
  results, queried_shards = await nidx_query(kbid, Method.SEARCH, pb_query)
@@ -290,5 +288,4 @@ async def search(
290
288
  )
291
289
 
292
290
  search_results.shards = queried_shards
293
- search_results.autofilters = autofilters
294
291
  return search_results, incomplete_results
@@ -447,6 +447,10 @@ class DummyPredictEngine(PredictEngine):
447
447
  self.cluster_url = "http://localhost:8000"
448
448
  self.public_url = "http://localhost:8000"
449
449
  self.calls = []
450
+ self.ndjson_reasoning = [
451
+ b'{"chunk": {"type": "reasoning", "text": "dummy "}}\n',
452
+ b'{"chunk": {"type": "reasoning", "text": "reasoning"}}\n',
453
+ ]
450
454
  self.ndjson_answer = [
451
455
  b'{"chunk": {"type": "text", "text": "valid "}}\n',
452
456
  b'{"chunk": {"type": "text", "text": "answer "}}\n',
@@ -482,8 +486,11 @@ class DummyPredictEngine(PredictEngine):
482
486
  self.calls.append(("chat_query_ndjson", item))
483
487
 
484
488
  async def generate():
485
- for item in self.ndjson_answer:
486
- yield GenerativeChunk.model_validate_json(item)
489
+ if item.reasoning is not False:
490
+ for chunk in self.ndjson_reasoning:
491
+ yield GenerativeChunk.model_validate_json(chunk)
492
+ for chunk in self.ndjson_answer:
493
+ yield GenerativeChunk.model_validate_json(chunk)
487
494
 
488
495
  return (DUMMY_LEARNING_ID, DUMMY_LEARNING_MODEL, generate())
489
496
 
@@ -21,8 +21,6 @@ import contextlib
21
21
  import logging
22
22
  from typing import Optional
23
23
 
24
- import backoff
25
-
26
24
  from nucliadb.common.cache import (
27
25
  extracted_text_cache,
28
26
  get_extracted_text_cache,
@@ -54,7 +52,7 @@ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
54
52
 
55
53
 
56
54
  async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
57
- async with get_driver().transaction(read_only=True) as txn:
55
+ async with get_driver().ro_transaction() as txn:
58
56
  storage = await get_storage(service_name=SERVICE_NAME)
59
57
  kb = KnowledgeBoxORM(txn, storage, kbid)
60
58
  return await kb.get(uuid)
@@ -74,23 +72,6 @@ async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
74
72
  return extracted_text
75
73
 
76
74
 
77
- @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
78
- async def field_get_extracted_text(field: Field) -> Optional[ExtractedText]:
79
- try:
80
- return await field.get_extracted_text()
81
- except Exception:
82
- logger.warning(
83
- "Error getting extracted text for field. Retrying",
84
- exc_info=True,
85
- extra={
86
- "kbid": field.kbid,
87
- "resource_id": field.resource.uuid,
88
- "field": f"{field.type}/{field.id}",
89
- },
90
- )
91
- raise
92
-
93
-
94
75
  async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
95
76
  rid = field.rid
96
77
  orm_resource = await get_resource(kbid, rid)