nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0

migrations/0023_backfill_pg_catalog.py CHANGED
@@ -27,11 +27,11 @@ Backfill the data into the PG catalog
 import logging
 from typing import cast
 
-from nucliadb.common import datamanagers
 from nucliadb.common.catalog import catalog_update, get_catalog
 from nucliadb.common.catalog.pg import PGCatalog
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.ingest.orm.index_message import get_resource_index_message
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.migrator.context import ExecutionContext
 
 logger = logging.getLogger(__name__)
@@ -73,7 +73,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     # Index each resource
     for rid in resources_to_index:
         rid = str(rid).replace("-", "")
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
         if resource is None:
             logger.warning(f"Could not load resource {rid} for kbid {kbid}")
             continue

migrations/0029_backfill_field_status.py CHANGED
@@ -24,7 +24,6 @@ Backfill field status (from error)
 """
 
 import logging
-from typing import Optional
 
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2, writer_pb2
@@ -33,7 +32,7 @@ logger = logging.getLogger(__name__)
 
 
 async def migrate(context: ExecutionContext) -> None:
-    start: Optional[str] = ""
+    start: str | None = ""
     while True:
         if start is None:
             break
@@ -43,7 +42,7 @@ async def migrate(context: ExecutionContext) -> None:
 async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
 
 
-async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
+async def do_batch(context: ExecutionContext, start: str) -> str | None:
     logger.info(f"Running batch from {start}")
     async with context.kv_driver.rw_transaction() as txn:
         async with txn.connection.cursor() as cur:  # type: ignore
@@ -64,7 +63,7 @@ async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
             field_keys = [r[0] for r in records]
 
             # Retrieve resources basic (to check status)
-            resource_keys = set(["/".join(f.split("/")[:5]) for f in field_keys])
+            resource_keys = {"/".join(f.split("/")[:5]) for f in field_keys}
             await cur.execute(
                 """
                 SELECT key, value FROM resources
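
A note on the typing changes that recur throughout this diff: Optional[X] and Union[X, Y] annotations are being replaced with the PEP 604 union syntax available since Python 3.10. The two spellings denote equal types at runtime; a minimal illustration (function names here are hypothetical, for demonstration only):

import typing

def legacy(start: typing.Optional[str]) -> typing.Optional[str]:
    # pre-3.10 spelling, requires the typing import
    return start

def modern(start: str | None) -> str | None:
    # PEP 604 spelling, no import needed
    return start

assert (str | None) == typing.Optional[str]  # both normalize to the same union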

migrations/0032_remove_old_relations.py CHANGED
@@ -26,7 +26,6 @@ is stored in object storage.
 """
 
 import logging
-from typing import Optional
 
 from nucliadb.migrator.context import ExecutionContext
 
@@ -34,7 +33,7 @@ logger = logging.getLogger(__name__)
 
 
 async def migrate(context: ExecutionContext) -> None:
-    start: Optional[str] = ""
+    start: str | None = ""
     while True:
         if start is None:
             break
@@ -45,7 +44,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     pass
 
 
-async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
+async def do_batch(context: ExecutionContext, start: str) -> str | None:
     logger.info(f"Running batch from {start}")
     async with context.kv_driver.rw_transaction() as txn:
         async with txn.connection.cursor() as cur:  # type: ignore

migrations/0038_backfill_catalog_field_labels.py CHANGED
@@ -27,11 +27,11 @@ Backfill the catalog with labels from fields metadata
 import logging
 from typing import cast
 
-from nucliadb.common import datamanagers
 from nucliadb.common.catalog import catalog_update, get_catalog
 from nucliadb.common.catalog.pg import PGCatalog
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.ingest.orm.index_message import get_resource_index_message
+from nucliadb.ingest.orm.resource import Resource
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2
 
@@ -82,7 +82,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     # Index each resource
     for key in to_index:
         rid = key.split("/")[4]
-        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
+        resource = await Resource.get(txn, kbid=kbid, rid=rid)
         if resource is None:
             logger.warning(f"Could not load resource {rid} for kbid {kbid}")
             continue

migrations/0039_backfill_converation_splits_metadata.py CHANGED
@@ -36,7 +36,7 @@ from nucliadb.ingest.fields.conversation import (
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2
-from nucliadb_protos.resources_pb2 import SplitMetadata, SplitsMetadata
+from nucliadb_protos.resources_pb2 import SplitsMetadata
 from nucliadb_utils.storages.storage import Storage
 
 logger = logging.getLogger(__name__)
@@ -102,5 +102,5 @@ async def build_splits_metadata(
         if page is None:
             continue
         for message in page.messages:
-            splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
+            splits_metadata.metadata.get_or_create(message.ident)
     return splits_metadata
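
The setdefault-to-get_or_create change above reflects how protobuf message-map fields behave in Python: assigning a constructed submessage into the map (which the inherited MutableMapping.setdefault does internally) is rejected, while get_or_create() inserts a default-initialized entry and returns it. A hedged sketch, assuming the metadata field is a map whose values are SplitMetadata messages:

from nucliadb_protos.resources_pb2 import SplitsMetadata

splits_metadata = SplitsMetadata()
# Inserts a default entry under the key if absent, then returns it.
entry = splits_metadata.metadata.get_or_create("message-ident")
# The returned submessage is live: mutating it mutates the map entry.
assert "message-ident" in splits_metadata.metadata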

migrations/0041_reindex_conversations.py ADDED
@@ -0,0 +1,137 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import logging
+import uuid
+from collections.abc import AsyncIterator
+from typing import cast
+
+from nucliadb.common import datamanagers
+from nucliadb.common.maindb.pg import PGTransaction
+from nucliadb.ingest.orm.index_message import get_resource_index_message
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_protos.writer_pb2 import ShardObject, Shards
+
+logger = logging.getLogger(__name__)
+
+
+async def migrate(context: ExecutionContext) -> None: ...
+
+
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    """
+    Reindex resources that have conversation fields
+    """
+    kb_shards = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid, for_update=False)
+    if kb_shards is not None:
+        async for rid in iter_affected_resource_ids(context, kbid):
+            await reindex_resource(context, kbid, rid, kb_shards)
+    else:
+        logger.warning(
+            "Migration 41: KB shards not found, skipping reindexing",
+            extra={"kbid": kbid},
+        )
+
+
+async def reindex_resource(
+    context: ExecutionContext,
+    kbid: str,
+    rid: str,
+    kb_shards: Shards,
+) -> None:
+    """
+    Reindex a single resource
+    """
+    async with datamanagers.with_ro_transaction() as rs_txn:
+        # Fetch the resource
+        resource = await Resource.get(rs_txn, kbid=kbid, rid=rid)
+        if resource is None:
+            logger.warning(
+                "Migration 41: Resource not found, skipping reindexing",
+                extra={"kbid": kbid, "rid": rid},
+            )
+            return
+
+        # Get the shard for the resource
+        shard: ShardObject | None = None
+        shard_id = await datamanagers.resources.get_resource_shard_id(
+            rs_txn, kbid=kbid, rid=rid, for_update=False
+        )
+        if shard_id is not None:
+            shard = next((shard for shard in kb_shards.shards if shard.shard == shard_id), None)
+        if shard is None:
+            logger.warning(
+                "Migration 41: Shard not found for resource, skipping reindexing",
+                extra={"kbid": kbid, "rid": rid, "shard_id": shard_id},
+            )
+            return
+
+        # Create the index message and reindex the resource
+        index_message = await get_resource_index_message(resource, reindex=True)
+        await context.shard_manager.add_resource(
+            shard,
+            index_message,
+            0,
+            partition="0",
+            kb=kbid,
+            reindex_id=uuid.uuid4().hex,
+        )
+        logger.info(
+            "Migration 41: Resource reindexed",
+            extra={"kbid": kbid, "rid": rid},
+        )
+
+
+async def iter_affected_resource_ids(context: ExecutionContext, kbid: str) -> AsyncIterator[str]:
+    start = ""
+    while True:
+        keys_batch = await get_batch(context, kbid, start)
+        if keys_batch is None:
+            break
+        start = keys_batch[-1]
+        for key in keys_batch:
+            # The keys have the format /kbs/{kbid}/r/{rid}/f/c/{field_id}
+            rid = key.split("/")[4]
+            yield rid
+
+
+async def get_batch(context: ExecutionContext, kbid: str, start: str) -> list[str] | None:
+    """
+    Get a batch of resource keys that hold conversation fields for the given KB.
+    Starting after the given start key.
+    Returns None if no more keys are found.
+    """
+    batch_size = 100
+    async with context.kv_driver.rw_transaction() as txn:
+        txn = cast(PGTransaction, txn)
+        async with txn.connection.cursor() as cur:
+            await cur.execute(
+                """
+                SELECT key FROM resources
+                WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
+                AND key > %s
+                ORDER BY key
+                LIMIT %s""",
+                (kbid, start, batch_size),
+            )
+            rows = await cur.fetchall()
+            if len(rows) == 0:
+                return None
+            return [row[0] for row in rows]
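
The get_batch query above implements keyset pagination: each batch resumes strictly after the last key returned (key > %s with ORDER BY key), so no OFFSET scans over the keyspace are needed, and the POSIX regex narrows rows to conversation fields only. As an illustration of what that pattern matches (the example keys are made up):

import re

# Python equivalent of the SQL pattern, for a hypothetical kbid "kb1"
pattern = re.compile(r"^/kbs/kb1/r/[^/]*/f/c/[^/]*$")

assert pattern.match("/kbs/kb1/r/0a1b2c/f/c/chat")           # conversation field
assert not pattern.match("/kbs/kb1/r/0a1b2c/f/t/body")       # text field, skipped
assert not pattern.match("/kbs/kb1/r/0a1b2c/f/c/chat/more")  # deeper key, skipped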

migrations/pg/0010_shards_index.py ADDED
@@ -0,0 +1,34 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from nucliadb.common.maindb.pg import PGTransaction
+
+
+async def migrate(txn: PGTransaction) -> None:
+    # Concurrent index must be created outside of a transaction but psycopg automatically
+    # creates transactions. We temporarily disable this for building indexes.
+    await txn.connection.commit()
+    try:
+        await txn.connection.set_autocommit(True)
+        await txn.connection.execute(
+            "CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
+        )
+    finally:
+        await txn.connection.set_autocommit(False)
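
PostgreSQL refuses to run CREATE INDEX CONCURRENTLY inside a transaction block, and psycopg 3 opens one implicitly on the first statement; that is why this migration commits and toggles autocommit around the index build. The same pattern on a bare connection, as a minimal sketch (the DSN and index name are placeholders):

import asyncio
import psycopg

async def build_index(dsn: str) -> None:
    # autocommit=True makes each execute() run outside a transaction block,
    # which CREATE INDEX CONCURRENTLY requires.
    async with await psycopg.AsyncConnection.connect(dsn, autocommit=True) as conn:
        await conn.execute(
            "CREATE INDEX CONCURRENTLY IF NOT EXISTS resources_key_idx ON resources (key)"
        )

asyncio.run(build_index("postgresql://localhost/nucliadb"))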

nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py RENAMED
@@ -18,11 +18,10 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 
-from typing import Optional
+from nucliadb.common.maindb.pg import PGTransaction
 
-from nucliadb.common import datamanagers
 
-
-async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
-    async with datamanagers.with_ro_transaction() as txn:
-        return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)
+async def migrate(txn: PGTransaction) -> None:
+    async with txn.connection.cursor() as cur:
+        await cur.execute("CREATE STATISTICS catalog_kbid_labels ON kbid, labels FROM catalog;")
+        await cur.execute("ANALYZE catalog;")

migrations/pg/0012_catalog_statistics_undo.py ADDED
@@ -0,0 +1,26 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from nucliadb.common.maindb.pg import PGTransaction
+
+
+async def migrate(txn: PGTransaction) -> None:
+    async with txn.connection.cursor() as cur:
+        await cur.execute("DROP STATISTICS catalog_kbid_labels;")

nucliadb/backups/create.py CHANGED
@@ -21,8 +21,8 @@ import asyncio
 import json
 import logging
 import tarfile
+from collections.abc import AsyncIterator
 from datetime import datetime, timezone
-from typing import AsyncIterator, Optional
 
 from nucliadb.backups.const import (
     BackupFinishedStream,
@@ -37,7 +37,6 @@ from nucliadb.export_import.utils import (
     download_binary,
     get_broker_message,
     get_cloud_files,
-    get_entities,
     get_labels,
     get_search_configurations,
     get_synonyms,
@@ -76,7 +75,6 @@ async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
     """
     await backup_resources(context, kbid, backup_id)
     await backup_labels(context, kbid, backup_id)
-    await backup_entities(context, kbid, backup_id)
     await backup_synonyms(context, kbid, backup_id)
     await backup_search_configurations(context, kbid, backup_id)
     await notify_backup_completed(context, kbid, backup_id)
@@ -235,15 +233,6 @@ async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
     )
 
 
-async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str):
-    entities = await get_entities(context, kbid)
-    await context.blob_storage.upload_object(
-        bucket=settings.backups_bucket,
-        key=StorageKeys.ENTITIES.format(backup_id=backup_id),
-        data=entities.SerializeToString(),
-    )
-
-
 async def backup_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
     synonyms = await get_synonyms(context, kbid)
     await context.blob_storage.upload_object(
@@ -266,9 +255,7 @@ async def backup_search_configurations(context: ApplicationContext, kbid: str, b
     )
 
 
-async def get_metadata(
-    context: ApplicationContext, kbid: str, backup_id: str
-) -> Optional[BackupMetadata]:
+async def get_metadata(context: ApplicationContext, kbid: str, backup_id: str) -> BackupMetadata | None:
     async with context.kv_driver.ro_transaction() as txn:
         metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
         if metadata_raw is None:

nucliadb/backups/restore.py CHANGED
@@ -24,7 +24,8 @@ import functools
 import json
 import logging
 import tarfile
-from typing import Any, AsyncIterator, Callable, Optional, Union
+from collections.abc import AsyncIterator, Callable
+from typing import Any
 
 from pydantic import TypeAdapter
 
@@ -35,7 +36,6 @@ from nucliadb.common.context import ApplicationContext
 from nucliadb.export_import.utils import (
     import_binary,
     restore_broker_message,
-    set_entities_groups,
     set_labels,
     set_search_configurations,
     set_synonyms,
@@ -74,7 +74,6 @@ async def restore_kb(context: ApplicationContext, kbid: str, backup_id: str):
     """
     await restore_resources(context, kbid, backup_id)
     await restore_labels(context, kbid, backup_id)
-    await restore_entities(context, kbid, backup_id)
     await restore_synonyms(context, kbid, backup_id)
     await restore_search_configurations(context, kbid, backup_id)
     await delete_last_restored(context, kbid, backup_id)
@@ -101,7 +100,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s
     await set_last_restored(context, kbid, backup_id, key)
 
 
-async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> Optional[str]:
+async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> str | None:
     key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
     async with context.kv_driver.ro_transaction() as txn:
         raw = await txn.get(key)
@@ -191,7 +190,7 @@ class ResourceBackupReader:
         data = await self.read(tarinfo_size + padding_bytes)
         return data[:tarinfo_size]
 
-    async def read_item(self) -> Union[BrokerMessage, CloudFile, CloudFileBinary]:
+    async def read_item(self) -> BrokerMessage | CloudFile | CloudFileBinary:
         tarinfo = await self.read_tarinfo()
         if tarinfo.name.startswith("broker-message"):
             raw_bm = await self.read_data(tarinfo)
@@ -257,16 +256,6 @@ async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str)
     await set_labels(context, kbid, labels)
 
 
-async def restore_entities(context: ApplicationContext, kbid: str, backup_id: str):
-    raw = await context.blob_storage.downloadbytes(
-        bucket=settings.backups_bucket,
-        key=StorageKeys.ENTITIES.format(backup_id=backup_id),
-    )
-    entities = kb_pb2.EntitiesGroups()
-    entities.ParseFromString(raw.getvalue())
-    await set_entities_groups(context, kbid, entities)
-
-
 async def restore_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
     raw = await context.blob_storage.downloadbytes(
         bucket=settings.backups_bucket,
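
The read_data logic in ResourceBackupReader above follows the tar format: member data is zero-padded to the next 512-byte block boundary, so the reader consumes payload plus padding but returns only the payload. The padding arithmetic, as a small self-contained sketch:

TAR_BLOCK = 512

def padded_size(payload_size: int) -> int:
    # tar rounds each member's data up to a whole number of 512-byte blocks
    return payload_size + (-payload_size) % TAR_BLOCK

assert padded_size(0) == 0
assert padded_size(1) == 512
assert padded_size(512) == 512
assert padded_size(513) == 1024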

nucliadb/backups/tasks.py CHANGED
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from typing import Awaitable, Callable
+from collections.abc import Awaitable, Callable
 
 from nucliadb.backups.const import BackupsNatsConfig
 from nucliadb.backups.create import backup_kb_task
@@ -38,6 +38,7 @@ def creator_consumer() -> NatsTaskConsumer[CreateBackupRequest]:
         callback=backup_kb_task,
         msg_type=CreateBackupRequest,
         max_concurrent_messages=10,
+        max_retries=100,
     )
     return consumer
 
@@ -64,6 +65,7 @@ def restorer_consumer() -> NatsTaskConsumer[RestoreBackupRequest]:
         callback=restore_kb_task,
         msg_type=RestoreBackupRequest,
         max_concurrent_messages=10,
+        max_retries=100,
     )
     return consumer
 
@@ -90,6 +92,7 @@ def deleter_consumer() -> NatsTaskConsumer[DeleteBackupRequest]:
         callback=delete_backup_task,
         msg_type=DeleteBackupRequest,
         max_concurrent_messages=2,
+        max_retries=100,
     )
     return consumer
 

nucliadb/common/back_pressure/cache.py CHANGED
@@ -21,7 +21,6 @@ import contextlib
 import logging
 import threading
 from datetime import datetime, timezone
-from typing import Optional
 
 from cachetools import TTLCache
 
@@ -47,7 +46,7 @@ class BackPressureCache:
         self._cache = TTLCache(maxsize=1024, ttl=5 * 60)
         self._lock = threading.Lock()
 
-    def get(self, key: str) -> Optional[BackPressureData]:
+    def get(self, key: str) -> BackPressureData | None:
         with self._lock:
             data = self._cache.get(key, None)
             if data is None:
@@ -72,7 +71,7 @@ def cached_back_pressure(cache_key: str):
     Context manager that handles the caching of the try again in time so that
     we don't recompute try again times if we have already applied back pressure.
     """
-    data: Optional[BackPressureData] = _cache.get(cache_key)
+    data: BackPressureData | None = _cache.get(cache_key)
     if data is not None:
         back_pressure_type = data.type
         RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
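
BackPressureCache pairs cachetools.TTLCache with a lock because TTLCache itself is not thread-safe; expired entries simply vanish on lookup, so a miss means the try-again time must be recomputed. The core pattern, as a minimal standalone sketch:

import threading
from cachetools import TTLCache

class GuardedTTLCache:
    def __init__(self) -> None:
        # Entries expire 5 minutes after insertion, matching the diff above.
        self._cache: TTLCache = TTLCache(maxsize=1024, ttl=5 * 60)
        self._lock = threading.Lock()

    def get(self, key: str):
        with self._lock:
            return self._cache.get(key, None)

    def set(self, key: str, value) -> None:
        with self._lock:
            self._cache[key] = value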

nucliadb/common/back_pressure/materializer.py CHANGED
@@ -20,7 +20,6 @@
 import asyncio
 import logging
 import threading
-from typing import Optional
 
 from cachetools import TTLCache
 from fastapi import HTTPException
@@ -118,12 +117,6 @@ class BackPressureMaterializer:
                 extra={"kbid": kbid},
             )
            return 0
-
-        if pending > 0:
-            logger.info(
-                f"Processing returned {pending} pending messages for KB",
-                extra={"kbid": kbid},
-            )
        self.processing_pending_cache[kbid] = pending
        return pending
 
@@ -184,7 +177,7 @@ class BackPressureMaterializer:
            pending=pending,
            max_wait=settings.max_wait_time,
        )
-        data = BackPressureData(type="indexing", try_after=try_after)
+        data = BackPressureData(type="indexing", try_after=try_after, pending=pending)
        raise BackPressureException(data)
 
    def check_ingest(self):
@@ -199,7 +192,7 @@ class BackPressureMaterializer:
            pending=ingest_pending,
            max_wait=settings.max_wait_time,
        )
-        data = BackPressureData(type="ingest", try_after=try_after)
+        data = BackPressureData(type="ingest", try_after=try_after, pending=ingest_pending)
        raise BackPressureException(data)
 
    async def check_processing(self, kbid: str):
@@ -215,11 +208,11 @@ class BackPressureMaterializer:
            pending=kb_pending,
            max_wait=settings.max_wait_time,
        )
-        data = BackPressureData(type="processing", try_after=try_after)
+        data = BackPressureData(type="processing", try_after=try_after, pending=kb_pending)
        raise BackPressureException(data)
 
 
-MATERIALIZER: Optional[BackPressureMaterializer] = None
+MATERIALIZER: BackPressureMaterializer | None = None
 materializer_lock = threading.Lock()
 
 
@@ -268,7 +261,7 @@ def get_materializer() -> BackPressureMaterializer:
     return MATERIALIZER
 
 
-async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) -> None:
+async def maybe_back_pressure(kbid: str, resource_uuid: str | None = None) -> None:
     """
     This function does system checks to see if we need to put back pressure on writes.
     In that case, a HTTP 429 will be raised with the estimated time to try again.
@@ -278,7 +271,7 @@ async def maybe_back_pressure(kbid: str, resource_uuid: Optional[str] = None) ->
     await back_pressure_checks(kbid, resource_uuid)
 
 
-async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
+async def back_pressure_checks(kbid: str, resource_uuid: str | None = None):
     """
     Will raise a 429 if back pressure is needed:
     - If the processing engine is behind.
@@ -299,6 +292,7 @@ async def back_pressure_checks(kbid: str, resource_uuid: Optional[str] = None):
                 "resource_uuid": resource_uuid,
                 "try_after": exc.data.try_after,
                 "back_pressure_type": exc.data.type,
+                "pending": exc.data.pending,
             },
         )
        raise HTTPException(

nucliadb/common/back_pressure/settings.py CHANGED
@@ -29,30 +29,30 @@ class BackPressureSettings(BaseSettings):
     )
     indexing_rate: float = Field(
         default=10,
-        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time",
     )
     ingest_rate: float = Field(
         default=4,
-        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the ingest processed consumer rate in messages per second. This is used to calculate the try again in time",
     )
     processing_rate: float = Field(
         default=1,
-        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",  # noqa
+        description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time",
     )
     max_indexing_pending: int = Field(
         default=1000,
-        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",  # noqa
+        description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks",
         alias="back_pressure_max_indexing_pending",
     )
     max_ingest_pending: int = Field(
         # Disabled by default
         default=0,
-        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",  # noqa
+        description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks",
         alias="back_pressure_max_ingest_pending",
     )
     max_processing_pending: int = Field(
         default=1000,
-        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",  # noqa
+        description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks",
         alias="back_pressure_max_processing_pending",
     )
     indexing_check_interval: int = Field(
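
These rate and max-wait settings feed the try-after computation visible in the materializer hunks above, where a helper is called with pending= and max_wait= keyword arguments and its result becomes BackPressureData.try_after. That helper's name and body are not shown in this diff, so the following reconstruction is purely an assumption:

from datetime import datetime, timedelta, timezone

def estimate_try_after(rate: float, pending: int, max_wait: float) -> datetime:
    # Assumed behavior: wait pending/rate seconds, capped at max_wait.
    delay = min(pending / rate, max_wait)
    return datetime.now(timezone.utc) + timedelta(seconds=delay)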

nucliadb/common/back_pressure/utils.py CHANGED
@@ -28,6 +28,7 @@ from nucliadb_utils.nats import NatsConnectionManager
 class BackPressureData:
     type: str
     try_after: datetime
+    pending: int = 0
 
 
 class BackPressureException(Exception):