nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -21,7 +21,6 @@ import argparse
21
21
  import asyncio
22
22
  import logging
23
23
  from datetime import datetime
24
- from typing import Optional
25
24
 
26
25
  from nidx_protos.nodewriter_pb2 import (
27
26
  NewShardRequest,
@@ -39,7 +38,7 @@ from nucliadb.common.nidx import get_nidx_api_client
39
38
  from nucliadb.common.vector_index_config import nucliadb_index_config_to_nidx
40
39
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
41
40
  from nucliadb.migrator.settings import settings
42
- from nucliadb_protos import utils_pb2, writer_pb2
41
+ from nucliadb_protos import writer_pb2
43
42
  from nucliadb_telemetry import errors
44
43
 
45
44
  from .utils import (
@@ -47,6 +46,7 @@ from .utils import (
47
46
  get_resource,
48
47
  get_rollover_resource_index_message,
49
48
  index_resource_to_shard,
49
+ wait_for_nidx,
50
50
  )
51
51
 
52
52
  logger = logging.getLogger(__name__)
@@ -61,7 +61,7 @@ class UnexpectedRolloverError(Exception):
61
61
  async def create_rollover_index(
62
62
  app_context: ApplicationContext,
63
63
  kbid: str,
64
- external: Optional[ExternalIndexManager] = None,
64
+ external: ExternalIndexManager | None = None,
65
65
  ) -> None:
66
66
  """
67
67
  Creates a new index for a knowledgebox in the index node cluster (and to the external index provider if configured).
@@ -150,7 +150,6 @@ async def create_rollover_shards(
150
150
 
151
151
  req = NewShardRequest(
152
152
  kbid=kbid,
153
- release_channel=utils_pb2.ReleaseChannel.STABLE,
154
153
  vectorsets_configs=vectorsets,
155
154
  )
156
155
 
@@ -174,7 +173,7 @@ async def create_rollover_shards(
174
173
  return kb_shards
175
174
 
176
175
 
177
- def _get_shard(shards: writer_pb2.Shards, shard_id: str) -> Optional[writer_pb2.ShardObject]:
176
+ def _get_shard(shards: writer_pb2.Shards, shard_id: str) -> writer_pb2.ShardObject | None:
178
177
  for shard in shards.shards:
179
178
  if shard_id == shard.shard:
180
179
  return shard
@@ -222,7 +221,7 @@ def _to_ts(dt: datetime) -> int:
222
221
 
223
222
 
224
223
  async def index_to_rollover_index(
225
- app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
224
+ app_context: ApplicationContext, kbid: str, external: ExternalIndexManager | None = None
226
225
  ) -> None:
227
226
  """
228
227
  Indexes all data in a kb in rollover indexes. This happens before the cutover.
@@ -256,6 +255,7 @@ async def index_to_rollover_index(
256
255
  for rid in resource_ids
257
256
  ]
258
257
  await asyncio.gather(*batch)
258
+ await wait_for_indexing_to_catch_up(app_context)
259
259
 
260
260
  async with datamanagers.with_transaction() as txn:
261
261
  state.resources_indexed = True
@@ -264,12 +264,28 @@ async def index_to_rollover_index(
264
264
  await txn.commit()
265
265
 
266
266
 
267
+ async def wait_for_indexing_to_catch_up(app_context: ApplicationContext):
268
+ try:
269
+ app_context.nats_manager
270
+ except AssertionError:
271
+ logger.warning("Nats manager not initialized. Cannot wait for indexing to catch up")
272
+ return
273
+ max_pending = 1000
274
+ while True:
275
+ try:
276
+ await wait_for_nidx(app_context.nats_manager, max_wait_seconds=60, max_pending=max_pending)
277
+ return
278
+ except asyncio.TimeoutError:
279
+ logger.warning(f"Nidx is behind more than {max_pending} messages. Throttling rollover.")
280
+ await asyncio.sleep(30)
281
+
282
+
267
283
  async def _index_resource_to_rollover_index(
268
284
  app_context: ApplicationContext,
269
285
  rollover_shards: writer_pb2.Shards,
270
286
  kbid: str,
271
287
  resource_id: str,
272
- external: Optional[ExternalIndexManager] = None,
288
+ external: ExternalIndexManager | None = None,
273
289
  ) -> None:
274
290
  async with resource_index_semaphore:
275
291
  async with datamanagers.with_transaction() as txn:
@@ -323,7 +339,7 @@ async def _index_resource_to_rollover_index(
323
339
 
324
340
 
325
341
  async def cutover_index(
326
- app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
342
+ app_context: ApplicationContext, kbid: str, external: ExternalIndexManager | None = None
327
343
  ) -> None:
328
344
  """
329
345
  Swaps our the current active index for a knowledgebox.
@@ -428,7 +444,7 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:
428
444
 
429
445
 
430
446
  async def validate_indexed_data(
431
- app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
447
+ app_context: ApplicationContext, kbid: str, external: ExternalIndexManager | None = None
432
448
  ) -> list[str]:
433
449
  """
434
450
  Goes through all the resources in a knowledgebox and validates it
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import enum
21
- from typing import Optional
22
21
 
23
22
  from pydantic import Field
24
23
  from pydantic_settings import BaseSettings
@@ -52,13 +51,9 @@ class Settings(BaseSettings):
52
51
  description="Maximum number of entity labels (/e/) per field that are indexed (excess is not indexed)",
53
52
  )
54
53
 
55
- nidx_api_address: Optional[str] = Field(default=None, description="NIDX gRPC API address")
56
- nidx_searcher_address: Optional[str] = Field(
57
- default=None, description="NIDX gRPC searcher API address"
58
- )
59
- nidx_indexer_address: Optional[str] = Field(
60
- default=None, description="NIDX gRPC indexer API address"
61
- )
54
+ nidx_api_address: str | None = Field(default=None, description="NIDX gRPC API address")
55
+ nidx_searcher_address: str | None = Field(default=None, description="NIDX gRPC searcher API address")
56
+ nidx_indexer_address: str | None = Field(default=None, description="NIDX gRPC indexer API address")
62
57
 
63
58
 
64
59
  settings = Settings()
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  import asyncio
20
20
  import logging
21
- from typing import TYPE_CHECKING, Optional, Union
21
+ from typing import TYPE_CHECKING
22
22
 
23
23
  import backoff
24
24
  from nidx_protos import nodereader_pb2
@@ -32,6 +32,7 @@ from nucliadb.common.cluster.settings import settings
32
32
  from nucliadb.ingest.orm import index_message
33
33
  from nucliadb.ingest.orm.resource import Resource
34
34
  from nucliadb_protos import writer_pb2
35
+ from nucliadb_utils.nats import NatsConnectionManager
35
36
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
36
37
 
37
38
  if TYPE_CHECKING: # pragma: no cover
@@ -45,13 +46,13 @@ logger = logging.getLogger(__name__)
45
46
  _lock = asyncio.Lock()
46
47
 
47
48
 
48
- async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
49
+ async def setup_cluster() -> KBShardManager | StandaloneKBShardManager:
49
50
  async with _lock:
50
51
  if get_utility(Utility.SHARD_MANAGER) is not None:
51
52
  # already setup
52
53
  return get_utility(Utility.SHARD_MANAGER)
53
54
 
54
- mng: Union[KBShardManager, StandaloneKBShardManager]
55
+ mng: KBShardManager | StandaloneKBShardManager
55
56
  if settings.standalone_mode:
56
57
  mng = StandaloneKBShardManager()
57
58
  else:
@@ -69,17 +70,17 @@ def get_shard_manager() -> KBShardManager:
69
70
  return get_utility(Utility.SHARD_MANAGER) # type: ignore
70
71
 
71
72
 
72
- async def get_resource(kbid: str, resource_id: str) -> Optional[Resource]:
73
+ async def get_resource(kbid: str, resource_id: str) -> Resource | None:
73
74
  async with datamanagers.with_ro_transaction() as txn:
74
- return await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
75
+ return await Resource.get(txn, kbid=kbid, rid=resource_id)
75
76
 
76
77
 
77
78
  @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
78
79
  async def get_rollover_resource_index_message(
79
80
  kbid: str, resource_id: str
80
- ) -> Optional[nodereader_pb2.Resource]:
81
+ ) -> nodereader_pb2.Resource | None:
81
82
  async with datamanagers.with_ro_transaction() as txn:
82
- resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
83
+ resource = await Resource.get(txn, kbid=kbid, rid=resource_id)
83
84
  if resource is None:
84
85
  logger.warning(
85
86
  "Resource not found while indexing, skipping",
@@ -97,7 +98,7 @@ async def index_resource_to_shard(
97
98
  kbid: str,
98
99
  resource_id: str,
99
100
  shard: writer_pb2.ShardObject,
100
- resource_index_message: Optional[nodereader_pb2.Resource] = None,
101
+ resource_index_message: nodereader_pb2.Resource | None = None,
101
102
  ) -> None:
102
103
  logger.info("Indexing resource", extra={"kbid": kbid, "resource_id": resource_id})
103
104
  sm = app_context.shard_manager
@@ -125,3 +126,28 @@ async def delete_resource_from_shard(
125
126
  partition = partitioning.generate_partition(kbid, resource_id)
126
127
 
127
128
  await sm.delete_resource(shard, resource_id, 0, str(partition), kbid)
129
+
130
+
131
+ async def get_nats_consumer_pending_messages(
132
+ nats_manager: NatsConnectionManager, *, stream: str, consumer: str
133
+ ) -> int:
134
+ # get raw js client
135
+ js = nats_manager.js
136
+ consumer_info = await js.consumer_info(stream, consumer)
137
+ return consumer_info.num_pending
138
+
139
+
140
+ async def wait_for_nidx(
141
+ nats_manager: NatsConnectionManager,
142
+ max_pending: int,
143
+ poll_interval_seconds: int = 5,
144
+ max_wait_seconds: int = 60,
145
+ ):
146
+ async with asyncio.timeout(max_wait_seconds): # type: ignore
147
+ while True:
148
+ pending = await get_nats_consumer_pending_messages(
149
+ nats_manager, stream="nidx", consumer="nidx"
150
+ )
151
+ if pending < max_pending:
152
+ return
153
+ await asyncio.sleep(poll_interval_seconds)
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
- from typing import Optional
22
21
 
23
22
  from nucliadb.common.cluster.manager import KBShardManager
24
23
  from nucliadb.common.cluster.settings import in_standalone_mode
@@ -58,13 +57,13 @@ class ApplicationContext:
58
57
  self.service_name = service_name
59
58
  self._initialized: bool = False
60
59
  self._lock = asyncio.Lock()
61
- self._kv_driver: Optional[Driver] = None
62
- self._blob_storage: Optional[Storage] = None
63
- self._shard_manager: Optional[KBShardManager] = None
64
- self._partitioning: Optional[PartitionUtility] = None
65
- self._nats_manager: Optional[NatsConnectionManager] = None
66
- self._transaction: Optional[TransactionUtility] = None
67
- self._nidx: Optional[NidxUtility] = None
60
+ self._kv_driver: Driver | None = None
61
+ self._blob_storage: Storage | None = None
62
+ self._shard_manager: KBShardManager | None = None
63
+ self._partitioning: PartitionUtility | None = None
64
+ self._nats_manager: NatsConnectionManager | None = None
65
+ self._transaction: TransactionUtility | None = None
66
+ self._nidx: NidxUtility | None = None
68
67
  self.enabled_kv_driver = kv_driver
69
68
  self.enabled_blob_storage = blob_storage
70
69
  self.enabled_shard_manager = shard_manager
@@ -19,7 +19,6 @@
19
19
  #
20
20
 
21
21
  from contextlib import asynccontextmanager
22
- from typing import Optional
23
22
 
24
23
  from fastapi import FastAPI
25
24
  from starlette.routing import Mount
@@ -28,7 +27,7 @@ from nucliadb.common.context import ApplicationContext
28
27
 
29
28
 
30
29
  @asynccontextmanager
31
- async def inject_app_context(app: FastAPI, context: Optional[ApplicationContext] = None):
30
+ async def inject_app_context(app: FastAPI, context: ApplicationContext | None = None):
32
31
  if context is None:
33
32
  context = ApplicationContext()
34
33
 
@@ -31,7 +31,6 @@
31
31
  from . import (
32
32
  atomic,
33
33
  cluster,
34
- entities,
35
34
  exceptions,
36
35
  fields,
37
36
  kb,
@@ -47,7 +46,6 @@ from .utils import with_ro_transaction, with_rw_transaction, with_transaction
47
46
  __all__ = (
48
47
  "atomic",
49
48
  "cluster",
50
- "entities",
51
49
  "exceptions",
52
50
  "fields",
53
51
  "kb",
@@ -57,7 +55,7 @@ __all__ = (
57
55
  "search_configurations",
58
56
  "synonyms",
59
57
  "vectorsets",
60
- "with_transaction",
61
- "with_rw_transaction",
62
58
  "with_ro_transaction",
59
+ "with_rw_transaction",
60
+ "with_transaction",
63
61
  )
@@ -35,10 +35,11 @@ it's transaction
35
35
 
36
36
  """
37
37
 
38
+ from collections.abc import Awaitable, Callable
38
39
  from functools import wraps
39
- from typing import Awaitable, Callable, TypeVar
40
+ from typing import Concatenate, TypeVar
40
41
 
41
- from typing_extensions import Concatenate, ParamSpec
42
+ from typing_extensions import ParamSpec
42
43
 
43
44
  from nucliadb.common.maindb.driver import Transaction
44
45
 
@@ -88,6 +89,7 @@ class resources:
88
89
  get_resource_uuid_from_slug = ro_txn_wrap(resources_dm.get_resource_uuid_from_slug)
89
90
  resource_exists = ro_txn_wrap(resources_dm.resource_exists)
90
91
  slug_exists = ro_txn_wrap(resources_dm.slug_exists)
92
+ get_all_field_ids = ro_txn_wrap(resources_dm.get_all_field_ids)
91
93
 
92
94
 
93
95
  class labelset:
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import logging
21
- from typing import Optional
22
21
 
23
22
  from nucliadb.common.maindb.driver import Transaction
24
23
  from nucliadb_protos import writer_pb2
@@ -33,7 +32,7 @@ KB_SHARDS = "/kbs/{kbid}/shards"
33
32
 
34
33
  async def get_kb_shards(
35
34
  txn: Transaction, *, kbid: str, for_update: bool = False
36
- ) -> Optional[writer_pb2.Shards]:
35
+ ) -> writer_pb2.Shards | None:
37
36
  key = KB_SHARDS.format(kbid=kbid)
38
37
  return await get_kv_pb(txn, key, writer_pb2.Shards, for_update=for_update)
39
38
 
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional
22
21
 
23
22
  from google.protobuf.message import Message
24
23
 
@@ -34,7 +33,7 @@ KB_RESOURCE_FIELD_STATUS = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/status"
34
33
 
35
34
  async def get_raw(
36
35
  txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str
37
- ) -> Optional[bytes]:
36
+ ) -> bytes | None:
38
37
  key = KB_RESOURCE_FIELD.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
39
38
  return await txn.get(key)
40
39
 
@@ -62,7 +61,7 @@ async def delete(txn: Transaction, *, kbid: str, rid: str, field_type: str, fiel
62
61
 
63
62
  async def get_error(
64
63
  txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str
65
- ) -> Optional[writer_pb2.Error]:
64
+ ) -> writer_pb2.Error | None:
66
65
  key = KB_RESOURCE_FIELD_ERROR.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
67
66
  return await get_kv_pb(txn, key, writer_pb2.Error)
68
67
 
@@ -85,7 +84,7 @@ async def set_error(
85
84
 
86
85
  async def get_status(
87
86
  txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str
88
- ) -> Optional[writer_pb2.FieldStatus]:
87
+ ) -> writer_pb2.FieldStatus | None:
89
88
  key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
90
89
  return await get_kv_pb(txn, key, writer_pb2.FieldStatus)
91
90
 
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import logging
21
- from typing import AsyncIterator, Optional
21
+ from collections.abc import AsyncIterator
22
22
 
23
23
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
24
24
  from nucliadb.common.maindb.driver import Transaction
@@ -47,7 +47,7 @@ async def exists_kb(txn: Transaction, *, kbid: str) -> bool:
47
47
  return await get_config(txn, kbid=kbid, for_update=False) is not None
48
48
 
49
49
 
50
- async def get_kb_uuid(txn: Transaction, *, slug: str) -> Optional[str]:
50
+ async def get_kb_uuid(txn: Transaction, *, slug: str) -> str | None:
51
51
  uuid = await txn.get(KB_SLUGS.format(slug=slug), for_update=False)
52
52
  if uuid is not None:
53
53
  return uuid.decode()
@@ -67,7 +67,7 @@ async def delete_kb_slug(txn: Transaction, *, slug: str):
67
67
 
68
68
  async def get_config(
69
69
  txn: Transaction, *, kbid: str, for_update: bool = False
70
- ) -> Optional[knowledgebox_pb2.KnowledgeBoxConfig]:
70
+ ) -> knowledgebox_pb2.KnowledgeBoxConfig | None:
71
71
  key = KB_UUID.format(kbid=kbid)
72
72
  payload = await txn.get(key, for_update=for_update)
73
73
  if payload is None:
@@ -105,8 +105,8 @@ async def get_matryoshka_vector_dimension(
105
105
  txn: Transaction,
106
106
  *,
107
107
  kbid: str,
108
- vectorset_id: Optional[str] = None,
109
- ) -> Optional[int]:
108
+ vectorset_id: str | None = None,
109
+ ) -> int | None:
110
110
  """Return vector dimension for matryoshka models"""
111
111
  from . import vectorsets
112
112
 
@@ -145,7 +145,7 @@ async def get_matryoshka_vector_dimension(
145
145
 
146
146
  async def get_external_index_provider_metadata(
147
147
  txn: Transaction, *, kbid: str
148
- ) -> Optional[knowledgebox_pb2.StoredExternalIndexProviderMetadata]:
148
+ ) -> knowledgebox_pb2.StoredExternalIndexProviderMetadata | None:
149
149
  kb_config = await get_config(txn, kbid=kbid)
150
150
  if kb_config is None:
151
151
  return None
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import logging
21
- from typing import Optional
22
21
 
23
22
  import orjson
24
23
 
@@ -50,7 +49,7 @@ async def get_labels(txn: Transaction, *, kbid: str) -> kb_pb2.Labels:
50
49
  return labels
51
50
 
52
51
 
53
- async def _get_labelset_ids(txn: Transaction, *, kbid: str) -> Optional[list[str]]:
52
+ async def _get_labelset_ids(txn: Transaction, *, kbid: str) -> list[str] | None:
54
53
  key = KB_LABELSET_IDS.format(kbid=kbid)
55
54
  data = await txn.get(key, for_update=True)
56
55
  if not data:
@@ -84,7 +83,7 @@ async def _set_labelset_ids(txn: Transaction, *, kbid: str, labelsets: list[str]
84
83
  await txn.set(key, data)
85
84
 
86
85
 
87
- async def get_labelset(txn: Transaction, *, kbid: str, labelset_id: str) -> Optional[kb_pb2.LabelSet]:
86
+ async def get_labelset(txn: Transaction, *, kbid: str, labelset_id: str) -> kb_pb2.LabelSet | None:
88
87
  labelset_key = KB_LABELSET.format(kbid=kbid, id=labelset_id)
89
88
  payload = await txn.get(labelset_key)
90
89
  if payload:
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import TYPE_CHECKING, AsyncGenerator, Optional
20
+ from collections.abc import AsyncGenerator
21
21
 
22
22
  import backoff
23
23
 
@@ -28,14 +28,9 @@ from nucliadb.common.maindb.exceptions import ConflictError, NotFoundError
28
28
  # These should be refactored
29
29
  from nucliadb.ingest.settings import settings as ingest_settings
30
30
  from nucliadb_protos import resources_pb2
31
- from nucliadb_utils.utilities import get_storage
32
31
 
33
32
  from .utils import with_ro_transaction
34
33
 
35
- if TYPE_CHECKING:
36
- from nucliadb.ingest.orm.resource import Resource as ResourceORM
37
-
38
-
39
34
  KB_RESOURCE_BASIC = "/kbs/{kbid}/r/{uuid}"
40
35
  KB_RESOURCE_BASIC_FS = "/kbs/{kbid}/r/{uuid}/basic" # Only used on FS driver
41
36
  KB_RESOURCE_ORIGIN = "/kbs/{kbid}/r/{uuid}/origin"
@@ -61,7 +56,7 @@ async def resource_exists(txn: Transaction, *, kbid: str, rid: str) -> bool:
61
56
  # id and slug
62
57
 
63
58
 
64
- async def get_resource_uuid_from_slug(txn: Transaction, *, kbid: str, slug: str) -> Optional[str]:
59
+ async def get_resource_uuid_from_slug(txn: Transaction, *, kbid: str, slug: str) -> str | None:
65
60
  encoded_uuid = await txn.get(KB_RESOURCE_SLUG.format(kbid=kbid, slug=slug, for_update=False))
66
61
  if not encoded_uuid:
67
62
  return None
@@ -70,7 +65,7 @@ async def get_resource_uuid_from_slug(txn: Transaction, *, kbid: str, slug: str)
70
65
 
71
66
  async def slug_exists(txn: Transaction, *, kbid: str, slug: str) -> bool:
72
67
  key = KB_RESOURCE_SLUG.format(kbid=kbid, slug=slug)
73
- encoded_slug: Optional[bytes] = await txn.get(key)
68
+ encoded_slug: bytes | None = await txn.get(key)
74
69
  return encoded_slug not in (None, b"")
75
70
 
76
71
 
@@ -102,7 +97,7 @@ async def modify_slug(txn: Transaction, *, kbid: str, rid: str, new_slug: str) -
102
97
  @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
103
98
  async def get_resource_shard_id(
104
99
  txn: Transaction, *, kbid: str, rid: str, for_update: bool = False
105
- ) -> Optional[str]:
100
+ ) -> str | None:
106
101
  key = KB_RESOURCE_SHARD.format(kbid=kbid, uuid=rid)
107
102
  shard = await txn.get(key, for_update=for_update)
108
103
  if shard is not None:
@@ -118,7 +113,7 @@ async def set_resource_shard_id(txn: Transaction, *, kbid: str, rid: str, shard:
118
113
  # Basic
119
114
 
120
115
 
121
- async def get_basic(txn: Transaction, *, kbid: str, rid: str) -> Optional[resources_pb2.Basic]:
116
+ async def get_basic(txn: Transaction, *, kbid: str, rid: str) -> resources_pb2.Basic | None:
122
117
  raw = await get_basic_raw(txn, kbid=kbid, rid=rid)
123
118
  if raw is None:
124
119
  return None
@@ -127,7 +122,7 @@ async def get_basic(txn: Transaction, *, kbid: str, rid: str) -> Optional[resour
127
122
  return basic
128
123
 
129
124
 
130
- async def get_basic_raw(txn: Transaction, *, kbid: str, rid: str) -> Optional[bytes]:
125
+ async def get_basic_raw(txn: Transaction, *, kbid: str, rid: str) -> bytes | None:
131
126
  if ingest_settings.driver == "local":
132
127
  raw_basic = await txn.get(KB_RESOURCE_BASIC_FS.format(kbid=kbid, uuid=rid))
133
128
  else:
@@ -151,7 +146,7 @@ async def set_basic(txn: Transaction, *, kbid: str, rid: str, basic: resources_p
151
146
  # Origin
152
147
 
153
148
 
154
- async def get_origin(txn: Transaction, *, kbid: str, rid: str) -> Optional[resources_pb2.Origin]:
149
+ async def get_origin(txn: Transaction, *, kbid: str, rid: str) -> resources_pb2.Origin | None:
155
150
  key = KB_RESOURCE_ORIGIN.format(kbid=kbid, uuid=rid)
156
151
  return await get_kv_pb(txn, key, resources_pb2.Origin)
157
152
 
@@ -164,7 +159,7 @@ async def set_origin(txn: Transaction, *, kbid: str, rid: str, origin: resources
164
159
  # Extra
165
160
 
166
161
 
167
- async def get_extra(txn: Transaction, *, kbid: str, rid: str) -> Optional[resources_pb2.Extra]:
162
+ async def get_extra(txn: Transaction, *, kbid: str, rid: str) -> resources_pb2.Extra | None:
168
163
  key = KB_RESOURCE_EXTRA.format(kbid=kbid, uuid=rid)
169
164
  return await get_kv_pb(txn, key, resources_pb2.Extra)
170
165
 
@@ -177,7 +172,7 @@ async def set_extra(txn: Transaction, *, kbid: str, rid: str, extra: resources_p
177
172
  # Security
178
173
 
179
174
 
180
- async def get_security(txn: Transaction, *, kbid: str, rid: str) -> Optional[resources_pb2.Security]:
175
+ async def get_security(txn: Transaction, *, kbid: str, rid: str) -> resources_pb2.Security | None:
181
176
  key = KB_RESOURCE_SECURITY.format(kbid=kbid, uuid=rid)
182
177
  return await get_kv_pb(txn, key, resources_pb2.Security)
183
178
 
@@ -265,7 +260,7 @@ async def set_number_of_resources(txn: Transaction, kbid: str, value: int) -> No
265
260
 
266
261
  async def get_all_field_ids(
267
262
  txn: Transaction, *, kbid: str, rid: str, for_update: bool = False
268
- ) -> Optional[resources_pb2.AllFieldIDs]:
263
+ ) -> resources_pb2.AllFieldIDs | None:
269
264
  key = KB_RESOURCE_ALL_FIELDS.format(kbid=kbid, uuid=rid)
270
265
  return await get_kv_pb(txn, key, resources_pb2.AllFieldIDs, for_update=for_update)
271
266
 
@@ -285,21 +280,3 @@ async def has_field(txn: Transaction, *, kbid: str, rid: str, field_id: resource
285
280
  if field_id == resource_field_id:
286
281
  return True
287
282
  return False
288
-
289
-
290
- # ORM mix (this functions shouldn't belong here)
291
-
292
-
293
- @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=3)
294
- async def get_resource(txn: Transaction, *, kbid: str, rid: str) -> Optional["ResourceORM"]:
295
- """
296
- Not ideal to return Resource type here but refactoring would
297
- require a lot of changes.
298
-
299
- At least this isolated that dependency here.
300
- """
301
- # prevent circulat imports -- this is not ideal that we have the ORM mix here.
302
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
303
-
304
- kb_orm = KnowledgeBoxORM(txn, await get_storage(), kbid)
305
- return await kb_orm.get(rid)
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import logging
21
- from typing import AsyncGenerator, Optional
21
+ from collections.abc import AsyncGenerator
22
22
 
23
23
  import orjson
24
24
  from pydantic import BaseModel
@@ -56,7 +56,7 @@ class RolloverStateNotFoundError(Exception):
56
56
  ...
57
57
 
58
58
 
59
- async def get_kb_rollover_shards(txn: Transaction, *, kbid: str) -> Optional[writer_pb2.Shards]:
59
+ async def get_kb_rollover_shards(txn: Transaction, *, kbid: str) -> writer_pb2.Shards | None:
60
60
  key = KB_ROLLOVER_SHARDS.format(kbid=kbid)
61
61
  return await get_kv_pb(txn, key, writer_pb2.Shards)
62
62
 
@@ -90,7 +90,7 @@ async def add_batch_to_index(txn: Transaction, *, kbid: str, batch: list[str]) -
90
90
  await txn.set(key, b"")
91
91
 
92
92
 
93
- async def get_to_index(txn: Transaction, *, kbid: str, count: int) -> Optional[list[str]]:
93
+ async def get_to_index(txn: Transaction, *, kbid: str, count: int) -> list[str] | None:
94
94
  key = KB_ROLLOVER_RESOURCES_TO_INDEX.format(kbid=kbid, resource="")
95
95
  found = [key async for key in txn.keys(key, count=count)]
96
96
  if found:
@@ -118,9 +118,7 @@ async def add_indexed(
118
118
  await txn.set(indexed, orjson.dumps(data))
119
119
 
120
120
 
121
- async def get_indexed_data(
122
- txn: Transaction, *, kbid: str, resource_id: str
123
- ) -> Optional[tuple[str, int]]:
121
+ async def get_indexed_data(txn: Transaction, *, kbid: str, resource_id: str) -> tuple[str, int] | None:
124
122
  key = KB_ROLLOVER_RESOURCES_INDEXED.format(kbid=kbid, resource=resource_id)
125
123
  val = await txn.get(key)
126
124
  if val is not None:
@@ -213,7 +211,7 @@ async def update_kb_rollover_external_index_metadata(
213
211
 
214
212
  async def get_kb_rollover_external_index_metadata(
215
213
  txn: Transaction, *, kbid: str
216
- ) -> Optional[kb_pb2.StoredExternalIndexProviderMetadata]:
214
+ ) -> kb_pb2.StoredExternalIndexProviderMetadata | None:
217
215
  key = KB_ROLLOVER_EXTERNAL_INDEX_METADATA.format(kbid=kbid)
218
216
  val = await txn.get(key)
219
217
  if not val:
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import logging
21
- from typing import Optional
22
21
 
23
22
  from pydantic import TypeAdapter
24
23
 
@@ -31,7 +30,7 @@ KB_SEARCH_CONFIGURATION_PREFIX = "/kbs/{kbid}/search_configuration"
31
30
  KB_SEARCH_CONFIGURATION = "/kbs/{kbid}/search_configuration/{name}"
32
31
 
33
32
 
34
- async def get(txn: Transaction, *, kbid: str, name: str) -> Optional[SearchConfiguration]:
33
+ async def get(txn: Transaction, *, kbid: str, name: str) -> SearchConfiguration | None:
35
34
  key = KB_SEARCH_CONFIGURATION.format(kbid=kbid, name=name)
36
35
  data = await txn.get(key, for_update=True)
37
36
  if not data:
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional
22
21
 
23
22
  from nucliadb.common.datamanagers.utils import get_kv_pb
24
23
  from nucliadb.common.maindb.driver import Transaction
@@ -27,7 +26,7 @@ from nucliadb_protos import knowledgebox_pb2
27
26
  KB_SYNONYMS = "/kbs/{kbid}/synonyms"
28
27
 
29
28
 
30
- async def get(txn: Transaction, *, kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
29
+ async def get(txn: Transaction, *, kbid: str) -> knowledgebox_pb2.Synonyms | None:
31
30
  key = KB_SYNONYMS.format(kbid=kbid)
32
31
  return await get_kv_pb(txn, key, knowledgebox_pb2.Synonyms, for_update=False)
33
32