nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -21,8 +21,9 @@ from __future__ import annotations
21
21
 
22
22
  import asyncio
23
23
  import logging
24
+ from collections.abc import AsyncGenerator
24
25
  from contextlib import asynccontextmanager
25
- from typing import Any, AsyncGenerator, Optional
26
+ from typing import Any
26
27
 
27
28
  import backoff
28
29
  import psycopg
@@ -72,7 +73,7 @@ class DataLayer:
72
73
  self.connection = connection
73
74
  self.log_on_select_for_update = settings.driver_pg_log_on_select_for_update
74
75
 
75
- async def get(self, key: str, select_for_update: bool = False) -> Optional[bytes]:
76
+ async def get(self, key: str, select_for_update: bool = False) -> bytes | None:
76
77
  with pg_observer({"type": "get"}):
77
78
  statement = "SELECT value FROM resources WHERE key = %s"
78
79
  if select_for_update:
@@ -116,7 +117,7 @@ class DataLayer:
116
117
  async with self.connection.cursor() as cur:
117
118
  await cur.execute("DELETE FROM resources WHERE key LIKE %s", (prefix + "%",))
118
119
 
119
- async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[Optional[bytes]]:
120
+ async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[bytes | None]:
120
121
  with pg_observer({"type": "batch_get"}):
121
122
  async with self.connection.cursor() as cur:
122
123
  statement = "SELECT key, value FROM resources WHERE key = ANY(%s)"
@@ -134,7 +135,7 @@ class DataLayer:
134
135
  prefix: str,
135
136
  limit: int = DEFAULT_SCAN_LIMIT,
136
137
  include_start: bool = True,
137
- ) -> AsyncGenerator[str, None]:
138
+ ) -> AsyncGenerator[str]:
138
139
  query = "SELECT key FROM resources WHERE key LIKE %s ORDER BY key"
139
140
 
140
141
  args: list[Any] = [prefix + "%"]
@@ -190,7 +191,7 @@ class PGTransaction(Transaction):
190
191
  async def batch_get(self, keys: list[str], for_update: bool = True):
191
192
  return await self.data_layer.batch_get(keys, select_for_update=for_update)
192
193
 
193
- async def get(self, key: str, for_update: bool = True) -> Optional[bytes]:
194
+ async def get(self, key: str, for_update: bool = True) -> bytes | None:
194
195
  return await self.data_layer.get(key, select_for_update=for_update)
195
196
 
196
197
  async def set(self, key: str, value: bytes):
@@ -243,7 +244,7 @@ class ReadOnlyPGTransaction(Transaction):
243
244
  return await DataLayer(conn).batch_get(keys, select_for_update=False)
244
245
 
245
246
  @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
246
- async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
247
+ async def get(self, key: str, for_update: bool = False) -> bytes | None:
247
248
  async with self.driver._get_connection() as conn:
248
249
  return await DataLayer(conn).get(key, select_for_update=False)
249
250
 
@@ -330,7 +331,7 @@ class PGDriver(Driver):
330
331
  metric.set(value)
331
332
 
332
333
  @asynccontextmanager
333
- async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction, None]:
334
+ async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
334
335
  if read_only:
335
336
  yield ReadOnlyPGTransaction(self)
336
337
  else:
@@ -343,7 +344,7 @@ class PGDriver(Driver):
343
344
  await txn.abort()
344
345
 
345
346
  @asynccontextmanager
346
- async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection, None]:
347
+ async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection]:
347
348
  timeout = self.acquire_timeout_ms / 1000
348
349
  # Manual retry loop since backoff.on_exception does not play well with async context managers
349
350
  retries = 0
nucliadb/common/nidx.py CHANGED
@@ -19,7 +19,6 @@
19
19
  #
20
20
 
21
21
  import os
22
- from typing import Optional, Union
23
22
 
24
23
  from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
25
24
  from nidx_protos.nodewriter_pb2 import (
@@ -54,7 +53,7 @@ class NidxUtility:
54
53
  pass
55
54
 
56
55
 
57
- def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
56
+ def _storage_config(prefix: str, bucket: str | None) -> dict[str, str]:
58
57
  config = {}
59
58
  if storage_settings.file_backend == FileBackendConfig.LOCAL:
60
59
  local_bucket = bucket or storage_settings.local_indexing_bucket
@@ -161,7 +160,7 @@ class NidxNatsIndexer:
161
160
  async def index(self, writer: IndexMessage) -> int:
162
161
  res = await self.nats_connection_manager.js.publish(self.subject, writer.SerializeToString())
163
162
  logger.info(
164
- f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}" # noqa
163
+ f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
165
164
  )
166
165
  return res.seq
167
166
 
@@ -185,7 +184,7 @@ class NidxGrpcIndexer:
185
184
  class NidxServiceUtility(NidxUtility):
186
185
  """Implements Nidx utility connecting to the network service"""
187
186
 
188
- indexer: Union[NidxNatsIndexer, NidxGrpcIndexer]
187
+ indexer: NidxNatsIndexer | NidxGrpcIndexer
189
188
 
190
189
  def __init__(self, service_name: str):
191
190
  self.service_name = service_name
@@ -18,8 +18,9 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import json
21
+ from collections.abc import AsyncGenerator
21
22
  from datetime import datetime, timezone
22
- from typing import AsyncGenerator, Type, Union, cast
23
+ from typing import Type, cast
23
24
 
24
25
  from nucliadb.common.maindb.driver import Driver
25
26
  from nucliadb.export_import import logger
@@ -34,7 +35,7 @@ MAINDB_IMPORT_KEY = "/kbs/{kbid}/imports/{id}"
34
35
  STORAGE_EXPORT_KEY = "exports/{export_id}"
35
36
  STORAGE_IMPORT_KEY = "imports/{import_id}"
36
37
 
37
- Metadata = Union[ExportMetadata, ImportMetadata]
38
+ Metadata = ExportMetadata | ImportMetadata
38
39
 
39
40
 
40
41
  class ExportImportDataManager:
@@ -59,7 +60,7 @@ class ExportImportDataManager:
59
60
  if data is None or data == b"":
60
61
  raise MetadataNotFound()
61
62
  decoded = data.decode("utf-8")
62
- model_type: Union[Type[ExportMetadata], Type[ImportMetadata]]
63
+ model_type: Type[ExportMetadata] | Type[ImportMetadata]
63
64
  if type == "export":
64
65
  model_type = ExportMetadata
65
66
  elif type == "import":
@@ -18,11 +18,12 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  from nucliadb.common.context import ApplicationContext
24
24
  from nucliadb.export_import import logger
25
25
  from nucliadb.export_import.datamanager import ExportImportDataManager
26
+ from nucliadb.export_import.exceptions import MetadataNotFound
26
27
  from nucliadb.export_import.models import (
27
28
  ExportedItemType,
28
29
  ExportMetadata,
@@ -33,7 +34,6 @@ from nucliadb.export_import.utils import (
33
34
  download_binary,
34
35
  get_broker_message,
35
36
  get_cloud_files,
36
- get_entities,
37
37
  get_labels,
38
38
  get_learning_config,
39
39
  iter_kb_resource_uuids,
@@ -43,7 +43,7 @@ from nucliadb_telemetry import errors
43
43
 
44
44
 
45
45
  async def export_kb(
46
- context: ApplicationContext, kbid: str, metadata: Optional[ExportMetadata] = None
46
+ context: ApplicationContext, kbid: str, metadata: ExportMetadata | None = None
47
47
  ) -> AsyncGenerator[bytes, None]:
48
48
  """Export the data of a knowledgebox to a stream of bytes.
49
49
 
@@ -63,9 +63,6 @@ async def export_kb(
63
63
  async for chunk in resources_iterator:
64
64
  yield chunk
65
65
 
66
- async for chunk in export_entities(context, kbid):
67
- yield chunk
68
-
69
66
  async for chunk in export_labels(context, kbid):
70
67
  yield chunk
71
68
 
@@ -76,7 +73,14 @@ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMe
76
73
  """
77
74
  kbid, export_id = msg.kbid, msg.id
78
75
  dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
79
- metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
76
+ try:
77
+ metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
78
+ except MetadataNotFound: # pragma: no cover
79
+ logger.error(
80
+ "Export metadata not found. Skipping export.", extra={"kbid": kbid, "export_id": export_id}
81
+ )
82
+ return
83
+
80
84
  iterator = export_kb(context, kbid, metadata)
81
85
 
82
86
  retry_handler = TaskRetryHandler("export", dm, metadata)
@@ -167,18 +171,6 @@ async def export_resource_with_binaries(
167
171
  yield bm_bytes
168
172
 
169
173
 
170
- async def export_entities(
171
- context: ApplicationContext,
172
- kbid: str,
173
- ) -> AsyncGenerator[bytes, None]:
174
- entities = await get_entities(context, kbid)
175
- if len(entities.entities_groups) > 0:
176
- data = entities.SerializeToString()
177
- yield ExportedItemType.ENTITIES.encode("utf-8")
178
- yield len(data).to_bytes(4, byteorder="big")
179
- yield data
180
-
181
-
182
174
  async def export_labels(
183
175
  context: ApplicationContext,
184
176
  kbid: str,
@@ -17,11 +17,13 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import AsyncGenerator, Callable, Optional, cast
20
+ from collections.abc import AsyncGenerator, Callable
21
+ from typing import cast
21
22
 
22
23
  from nucliadb.common.context import ApplicationContext
23
24
  from nucliadb.export_import import logger
24
25
  from nucliadb.export_import.datamanager import ExportImportDataManager
26
+ from nucliadb.export_import.exceptions import MetadataNotFound
25
27
  from nucliadb.export_import.models import (
26
28
  ExportedItemType,
27
29
  ImportMetadata,
@@ -32,7 +34,6 @@ from nucliadb.export_import.utils import (
32
34
  TaskRetryHandler,
33
35
  import_binary,
34
36
  restore_broker_message,
35
- set_entities_groups,
36
37
  set_labels,
37
38
  )
38
39
  from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -46,7 +47,7 @@ async def import_kb(
46
47
  context: ApplicationContext,
47
48
  kbid: str,
48
49
  stream: AsyncGenerator[bytes, None],
49
- metadata: Optional[ImportMetadata] = None,
50
+ metadata: ImportMetadata | None = None,
50
51
  ) -> None:
51
52
  """
52
53
  Imports exported data from a stream into a knowledgebox.
@@ -72,8 +73,8 @@ async def import_kb(
72
73
  await import_binary(context, kbid, cf, binary_generator)
73
74
 
74
75
  elif item_type == ExportedItemType.ENTITIES:
75
- entities = cast(kb_pb2.EntitiesGroups, data)
76
- await set_entities_groups(context, kbid, entities)
76
+ # This is not supported anymore, we ignore it if we find it in an old backup
77
+ pass
77
78
 
78
79
  elif item_type == ExportedItemType.LABELS:
79
80
  labels = cast(kb_pb2.Labels, data)
@@ -99,7 +100,13 @@ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTask
99
100
  """
100
101
  kbid, import_id = msg.kbid, msg.id
101
102
  dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
102
- metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
103
+ try:
104
+ metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
105
+ except MetadataNotFound: # pragma: no cover
106
+ logger.error(
107
+ "Import metadata not found. Skipping import.", extra={"kbid": kbid, "import_id": import_id}
108
+ )
109
+ return
103
110
 
104
111
  retry_handler = TaskRetryHandler("import", dm, metadata)
105
112
 
@@ -56,6 +56,7 @@ def get_exports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
56
56
  callback=export_kb_to_blob_storage,
57
57
  msg_type=NatsTaskMessage,
58
58
  max_concurrent_messages=10,
59
+ max_retries=100,
59
60
  )
60
61
 
61
62
 
@@ -77,6 +78,7 @@ def get_imports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
77
78
  callback=import_kb_from_blob_storage,
78
79
  msg_type=NatsTaskMessage,
79
80
  max_concurrent_messages=10,
81
+ max_retries=100,
80
82
  )
81
83
 
82
84
 
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import functools
21
- from typing import AsyncGenerator, AsyncIterator, Callable, Optional
21
+ from collections.abc import AsyncGenerator, AsyncIterator, Callable
22
22
 
23
23
  import backoff
24
24
  from google.protobuf.message import DecodeError as ProtobufDecodeError
@@ -35,6 +35,7 @@ from nucliadb.export_import.exceptions import (
35
35
  )
36
36
  from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
37
37
  from nucliadb.ingest.orm.broker_message import generate_broker_message
38
+ from nucliadb.ingest.orm.resource import Resource
38
39
  from nucliadb_models.configuration import SearchConfiguration
39
40
  from nucliadb_models.export_import import Status
40
41
  from nucliadb_protos import knowledgebox_pb2 as kb_pb2
@@ -171,14 +172,6 @@ async def import_binary(
171
172
  )
172
173
 
173
174
 
174
- async def set_entities_groups(
175
- context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
176
- ) -> None:
177
- async with datamanagers.with_transaction() as txn:
178
- await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
179
- await txn.commit()
180
-
181
-
182
175
  async def set_synonyms(context: ApplicationContext, kbid: str, synonyms: kb_pb2.Synonyms) -> None:
183
176
  async with datamanagers.with_transaction() as txn:
184
177
  await datamanagers.synonyms.set(txn, kbid=kbid, synonyms=synonyms)
@@ -207,9 +200,9 @@ async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> Asyn
207
200
 
208
201
  async def get_broker_message(
209
202
  context: ApplicationContext, kbid: str, rid: str
210
- ) -> Optional[writer_pb2.BrokerMessage]:
203
+ ) -> writer_pb2.BrokerMessage | None:
211
204
  async with datamanagers.with_ro_transaction() as txn:
212
- resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
205
+ resource = await Resource.get(txn, kbid=kbid, rid=rid)
213
206
  if resource is None:
214
207
  return None
215
208
  resource.disable_vectors = False
@@ -284,11 +277,6 @@ async def download_binary(
284
277
  assert downloaded_bytes == cf.size, "Downloaded bytes do not match the expected size"
285
278
 
286
279
 
287
- async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
288
- async with datamanagers.with_ro_transaction() as txn:
289
- return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
290
-
291
-
292
280
  async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
293
281
  async with datamanagers.with_ro_transaction() as txn:
294
282
  return await datamanagers.labels.get_labels(txn, kbid=kbid)
@@ -434,7 +422,7 @@ class ExportStreamReader:
434
422
 
435
423
  async def maybe_read_learning_config(
436
424
  self,
437
- ) -> tuple[Optional[learning_proxy.LearningConfiguration], bytes]:
425
+ ) -> tuple[learning_proxy.LearningConfiguration | None, bytes]:
438
426
  """
439
427
  Tries to read a learning config from the beginning of the stream.
440
428
  Returns the learning config if found. It also returns any leftover bytes that
@@ -533,7 +521,7 @@ class TaskRetryHandler:
533
521
 
534
522
  async def get_learning_config(
535
523
  kbid: str,
536
- ) -> Optional[learning_proxy.LearningConfiguration]:
524
+ ) -> learning_proxy.LearningConfiguration | None:
537
525
  return await learning_proxy.get_configuration(kbid)
538
526
 
539
527
 
nucliadb/health.py CHANGED
@@ -19,7 +19,7 @@
19
19
  #
20
20
  import asyncio
21
21
  import logging
22
- from typing import Awaitable, Callable, Optional
22
+ from collections.abc import Awaitable, Callable
23
23
 
24
24
  from grpc import aio
25
25
  from grpc_health.v1 import health, health_pb2, health_pb2_grpc
@@ -41,7 +41,7 @@ def nats_manager_healthy() -> bool:
41
41
 
42
42
 
43
43
  def pubsub_check() -> bool:
44
- driver: Optional[PubSubDriver] = get_utility(Utility.PUBSUB)
44
+ driver: PubSubDriver | None = get_utility(Utility.PUBSUB)
45
45
  if driver is None:
46
46
  return True
47
47
  if isinstance(driver, NatsPubsub):
nucliadb/ingest/app.py CHANGED
@@ -19,7 +19,7 @@
19
19
  #
20
20
  import asyncio
21
21
  import importlib.metadata
22
- from typing import Awaitable, Callable
22
+ from collections.abc import Awaitable, Callable
23
23
 
24
24
  from nucliadb import health
25
25
  from nucliadb.backups.tasks import initialize_consumers as initialize_backup_consumers
@@ -96,7 +96,7 @@ async def initialize_grpc(): # pragma: no cover
96
96
  finalizers = await initialize()
97
97
  grpc_finalizer = await start_grpc(SERVICE_NAME)
98
98
 
99
- return [grpc_finalizer] + finalizers
99
+ return [grpc_finalizer, *finalizers]
100
100
 
101
101
 
102
102
  async def initialize_pull_workers() -> list[Callable[[], Awaitable[None]]]:
@@ -114,14 +114,14 @@ async def main_consumer(): # pragma: no cover
114
114
 
115
115
  ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
116
116
 
117
- await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown] + finalizers)
117
+ await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown, *finalizers])
118
118
 
119
119
 
120
120
  async def main_orm_grpc(): # pragma: no cover
121
121
  finalizers = await initialize()
122
122
  grpc_finalizer = await start_grpc(SERVICE_NAME)
123
123
  metrics_server = await serve_metrics()
124
- await run_until_exit([grpc_finalizer, metrics_server.shutdown] + finalizers)
124
+ await run_until_exit([grpc_finalizer, metrics_server.shutdown, *finalizers])
125
125
 
126
126
 
127
127
  async def main_ingest_processed_consumer(): # pragma: no cover
@@ -134,7 +134,7 @@ async def main_ingest_processed_consumer(): # pragma: no cover
134
134
  consumer = await consumer_service.start_ingest_processed_consumer_v2(SERVICE_NAME)
135
135
 
136
136
  await run_until_exit(
137
- [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
137
+ [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine, *finalizers]
138
138
  )
139
139
 
140
140
 
@@ -158,8 +158,8 @@ async def main_subscriber_workers(): # pragma: no cover
158
158
  backup_consumers_finalizers = await initialize_backup_consumers(context)
159
159
 
160
160
  await run_until_exit(
161
- backup_consumers_finalizers
162
- + [
161
+ [
162
+ *backup_consumers_finalizers,
163
163
  imports_consumer.finalize,
164
164
  exports_consumer.finalize,
165
165
  stop_ingest_utility,
@@ -169,8 +169,8 @@ async def main_subscriber_workers(): # pragma: no cover
169
169
  grpc_health_finalizer,
170
170
  metrics_server.shutdown,
171
171
  context.finalize,
172
+ *finalizers,
172
173
  ]
173
- + finalizers
174
174
  )
175
175
 
176
176
 
@@ -20,12 +20,10 @@
20
20
  import asyncio
21
21
  import logging
22
22
  import time
23
- from typing import Optional, Union
24
23
 
25
24
  import backoff
26
25
  import nats
27
26
  import nats.js.api
28
- import nats.js.errors
29
27
  from nats.aio.client import Msg
30
28
  from nats.js import JetStreamContext
31
29
 
@@ -74,8 +72,8 @@ class IngestConsumer:
74
72
  partition: str,
75
73
  storage: Storage,
76
74
  nats_connection_manager: NatsConnectionManager,
77
- pubsub: Optional[PubSubDriver] = None,
78
- lock: Optional[Union[asyncio.Lock, asyncio.Semaphore]] = None,
75
+ pubsub: PubSubDriver | None = None,
76
+ lock: asyncio.Lock | asyncio.Semaphore | None = None,
79
77
  ):
80
78
  self.driver = driver
81
79
  self.partition = partition
@@ -85,9 +83,9 @@ class IngestConsumer:
85
83
 
86
84
  self.lock = lock or asyncio.Lock()
87
85
  self.processor = Processor(driver, storage, pubsub, partition)
88
- self.subscription: Optional[JetStreamContext.PullSubscription] = None
86
+ self.subscription: JetStreamContext.PullSubscription | None = None
89
87
 
90
- async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
88
+ async def ack_message(self, msg: Msg, kbid: str | None = None):
91
89
  await msg.ack()
92
90
 
93
91
  async def initialize(self):
@@ -162,7 +160,7 @@ class IngestConsumer:
162
160
  async def subscription_worker(self, msg: Msg):
163
161
  context.clear_context()
164
162
 
165
- kbid: Optional[str] = None
163
+ kbid: str | None = None
166
164
  subject = msg.subject
167
165
  reply = msg.reply
168
166
  seqid = int(reply.split(".")[5])
@@ -238,7 +236,7 @@ class IngestConsumer:
238
236
  logger.info(
239
237
  f"An error happend while processing a message from {message_source}. "
240
238
  f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
241
- f"Check sentry for more details: {str(e)}"
239
+ f"Check sentry for more details: {e!s}"
242
240
  )
243
241
  await self.ack_message(msg, kbid)
244
242
  logger.info("Message acked because of deadletter", extra={"seqid": seqid})
@@ -250,7 +248,7 @@ class IngestConsumer:
250
248
  logger.info(
251
249
  f"An error happend while processing a message from {message_source}. "
252
250
  f"This message has been dropped and won't be retried again"
253
- f"Check sentry for more details: {str(e)}"
251
+ f"Check sentry for more details: {e!s}"
254
252
  )
255
253
  await self.ack_message(msg, kbid)
256
254
  logger.info("Message acked because of drop", extra={"seqid": seqid})
@@ -260,7 +258,7 @@ class IngestConsumer:
260
258
  logger.exception(
261
259
  f"An error happend while processing a message from {message_source}. "
262
260
  "Message has not been ACKd and will be retried. "
263
- f"Check sentry for more details: {str(e)}"
261
+ f"Check sentry for more details: {e!s}"
264
262
  )
265
263
  await msg.nak()
266
264
  logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
@@ -21,7 +21,6 @@ import asyncio
21
21
  import base64
22
22
  import time
23
23
  from contextlib import contextmanager
24
- from typing import Optional
25
24
 
26
25
  from aiohttp.client_exceptions import ClientConnectorError
27
26
  from opentelemetry import trace
@@ -35,7 +34,6 @@ from nucliadb.common.http_clients.exceptions import ServiceUnavailableException
35
34
  from nucliadb.common.http_clients.processing import (
36
35
  ProcessingHTTPClient,
37
36
  ProcessingPullMessageProgressUpdater,
38
- get_nua_api_id,
39
37
  )
40
38
  from nucliadb.common.maindb.driver import Driver
41
39
  from nucliadb.ingest import SERVICE_NAME, logger, logger_activity
@@ -96,7 +94,7 @@ class PullV2Worker:
96
94
  driver: Driver,
97
95
  storage: Storage,
98
96
  pull_time_error_backoff: int,
99
- pubsub: Optional[PubSubDriver] = None,
97
+ pubsub: PubSubDriver | None = None,
100
98
  pull_time_empty_backoff: float = 5.0,
101
99
  pull_api_timeout: int = 60,
102
100
  ):
@@ -142,12 +140,9 @@ class PullV2Worker:
142
140
  data = None
143
141
  if nuclia_settings.nuclia_service_account is not None:
144
142
  headers["X-STF-NUAKEY"] = f"Bearer {nuclia_settings.nuclia_service_account}"
145
- # parse jwt sub to get pull type id
146
- try:
147
- get_nua_api_id()
148
- except Exception as exc:
143
+ if nuclia_settings.nuclia_service_account is None:
149
144
  logger.exception("Could not read NUA API Key. Can not start pull worker")
150
- raise ReallyStopPulling() from exc
145
+ raise ReallyStopPulling()
151
146
 
152
147
  ack_tokens = []
153
148
  async with ProcessingHTTPClient() as processing_http_client:
@@ -19,8 +19,8 @@
19
19
  #
20
20
  import asyncio
21
21
  import sys
22
+ from collections.abc import Awaitable, Callable
22
23
  from functools import partial
23
- from typing import Awaitable, Callable, Optional
24
24
 
25
25
  from nucliadb.common.maindb.utils import setup_driver
26
26
  from nucliadb.ingest import SERVICE_NAME, logger
@@ -55,7 +55,7 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
55
55
 
56
56
 
57
57
  async def start_ingest_consumers(
58
- service_name: Optional[str] = None,
58
+ service_name: str | None = None,
59
59
  ) -> Callable[[], Awaitable[None]]:
60
60
  if transaction_settings.transaction_local:
61
61
  raise ConfigurationError("Can not start ingest consumers in local mode")
@@ -91,7 +91,7 @@ async def start_ingest_consumers(
91
91
 
92
92
 
93
93
  async def start_ingest_processed_consumer_v2(
94
- service_name: Optional[str] = None,
94
+ service_name: str | None = None,
95
95
  ) -> Callable[[], Awaitable[None]]:
96
96
  """
97
97
  This is not meant to be deployed with a stateful set like the other consumers.
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
- from typing import Callable, Coroutine
21
+ from collections.abc import Callable, Coroutine
22
22
 
23
23
 
24
24
  class DelayedTaskHandler: