nucliadb 6.2.1.post2971__py3-none-any.whl → 6.2.1.post2977__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37):
  1. nucliadb/common/cluster/manager.py +33 -331
  2. nucliadb/common/cluster/rebalance.py +2 -2
  3. nucliadb/common/cluster/rollover.py +12 -71
  4. nucliadb/common/cluster/settings.py +3 -0
  5. nucliadb/common/cluster/standalone/utils.py +0 -43
  6. nucliadb/common/cluster/utils.py +0 -16
  7. nucliadb/common/nidx.py +76 -37
  8. nucliadb/health.py +0 -7
  9. nucliadb/ingest/app.py +0 -8
  10. nucliadb/ingest/consumer/auditing.py +1 -1
  11. nucliadb/ingest/consumer/shard_creator.py +1 -1
  12. nucliadb/ingest/orm/entities.py +3 -6
  13. nucliadb/purge/orphan_shards.py +6 -4
  14. nucliadb/search/api/v1/knowledgebox.py +1 -5
  15. nucliadb/search/requesters/utils.py +1 -2
  16. nucliadb/search/search/shards.py +19 -0
  17. nucliadb/standalone/introspect.py +0 -25
  18. nucliadb/train/lifecycle.py +3 -6
  19. nucliadb/train/nodes.py +1 -5
  20. nucliadb/writer/back_pressure.py +17 -46
  21. nucliadb/writer/settings.py +2 -2
  22. {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2977.dist-info}/METADATA +5 -7
  23. {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2977.dist-info}/RECORD +27 -37
  24. nucliadb/common/cluster/discovery/__init__.py +0 -19
  25. nucliadb/common/cluster/discovery/base.py +0 -178
  26. nucliadb/common/cluster/discovery/k8s.py +0 -301
  27. nucliadb/common/cluster/discovery/manual.py +0 -57
  28. nucliadb/common/cluster/discovery/single.py +0 -51
  29. nucliadb/common/cluster/discovery/types.py +0 -32
  30. nucliadb/common/cluster/discovery/utils.py +0 -67
  31. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  32. nucliadb/common/cluster/standalone/index_node.py +0 -123
  33. nucliadb/common/cluster/standalone/service.py +0 -84
  34. {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2977.dist-info}/WHEEL +0 -0
  35. {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2977.dist-info}/entry_points.txt +0 -0
  36. {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2977.dist-info}/top_level.txt +0 -0
  37. {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2977.dist-info}/zip-safe +0 -0
@@ -19,13 +19,10 @@
19
19
 
20
20
  import logging
21
21
  import os
22
- import shutil
23
22
  import uuid
24
- from socket import gethostname
25
23
 
26
24
  from nucliadb.common.cluster.settings import StandaloneNodeRole
27
25
  from nucliadb.common.cluster.settings import settings as cluster_settings
28
- from nucliadb.common.cluster.standalone.index_node import StandaloneIndexNode
29
26
 
30
27
  logger = logging.getLogger(__name__)
31
28
 
@@ -46,46 +43,6 @@ def get_standalone_node_id() -> str:
46
43
  return str(uuid.UUID(bytes=f.read()))
47
44
 
48
45
 
49
- _SELF_INDEX_NODE = None
50
-
51
-
52
- def get_self() -> StandaloneIndexNode:
53
- """
54
- This returns an instance of the standalone index node
55
- so when API requests come into this mode, we don't
56
- make another grpc request since this node can service it directly.
57
- """
58
- if not is_index_node():
59
- raise Exception("This node is not an Index Node. You should not reach this code path.")
60
- global _SELF_INDEX_NODE
61
- node_id = get_standalone_node_id()
62
- if _SELF_INDEX_NODE is None or node_id != _SELF_INDEX_NODE.id:
63
- if "NUCLIADB_SERVICE_HOST" in os.environ:
64
- hn = os.environ["HOSTNAME"]
65
- ns = os.environ.get("NAMESPACE", "nucliadb")
66
- host = f"{hn}.{ns}"
67
- else:
68
- host = gethostname()
69
- _SELF_INDEX_NODE = StandaloneIndexNode(id=node_id, address=host, shard_count=0, available_disk=0)
70
- try:
71
- _, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
72
- _SELF_INDEX_NODE.available_disk = available_disk
73
- except FileNotFoundError: # pragma: no cover
74
- ...
75
- try:
76
- _shards_dir = os.path.join(cluster_settings.data_path, "shards")
77
- _SELF_INDEX_NODE.shard_count = len(
78
- [
79
- shard_dir
80
- for shard_dir in os.listdir(_shards_dir)
81
- if os.path.isdir(os.path.join(_shards_dir, shard_dir))
82
- ]
83
- )
84
- except FileNotFoundError: # pragma: no cover
85
- ...
86
- return _SELF_INDEX_NODE
87
-
88
-
89
46
  def is_index_node() -> bool:
90
47
  return cluster_settings.standalone_node_role in (
91
48
  StandaloneNodeRole.ALL,
@@ -23,20 +23,11 @@ from typing import TYPE_CHECKING, Optional, Union
23
23
  import backoff
24
24
 
25
25
  from nucliadb.common import datamanagers
26
- from nucliadb.common.cluster.discovery.utils import (
27
- setup_cluster_discovery,
28
- teardown_cluster_discovery,
29
- )
30
26
  from nucliadb.common.cluster.manager import (
31
27
  KBShardManager,
32
28
  StandaloneKBShardManager,
33
- clear_index_nodes,
34
29
  )
35
30
  from nucliadb.common.cluster.settings import settings
36
- from nucliadb.common.cluster.standalone.service import (
37
- start_grpc as start_standalone_grpc,
38
- )
39
- from nucliadb.common.cluster.standalone.utils import is_index_node
40
31
  from nucliadb.ingest.orm.resource import Resource
41
32
  from nucliadb_protos import nodereader_pb2, writer_pb2
42
33
  from nucliadb_utils import const
@@ -62,12 +53,8 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
62
53
  # already setup
63
54
  return get_utility(Utility.SHARD_MANAGER)
64
55
 
65
- await setup_cluster_discovery()
66
56
  mng: Union[KBShardManager, StandaloneKBShardManager]
67
57
  if settings.standalone_mode:
68
- if is_index_node():
69
- server = await start_standalone_grpc()
70
- set_utility(_STANDALONE_SERVER, server)
71
58
  mng = StandaloneKBShardManager()
72
59
  else:
73
60
  mng = KBShardManager()
@@ -76,7 +63,6 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
76
63
 
77
64
 
78
65
  async def teardown_cluster():
79
- await teardown_cluster_discovery()
80
66
  if get_utility(Utility.SHARD_MANAGER):
81
67
  clean_utility(Utility.SHARD_MANAGER)
82
68
 
@@ -85,8 +71,6 @@ async def teardown_cluster():
85
71
  await std_server.stop(None)
86
72
  clean_utility(_STANDALONE_SERVER)
87
73
 
88
- clear_index_nodes()
89
-
90
74
 
91
75
  def get_shard_manager() -> KBShardManager:
92
76
  return get_utility(Utility.SHARD_MANAGER) # type: ignore
nucliadb/common/nidx.py CHANGED
@@ -21,7 +21,7 @@
21
21
  import os
22
22
  from typing import Optional
23
23
 
24
- from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxSearcherStub
24
+ from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
25
25
 
26
26
  from nucliadb.common.cluster.base import AbstractIndexNode
27
27
  from nucliadb.common.cluster.settings import settings
@@ -37,12 +37,10 @@ from nucliadb_utils.settings import FileBackendConfig, indexing_settings, storag
37
37
  from nucliadb_utils.storages.settings import settings as extended_storage_settings
38
38
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
39
39
 
40
- NIDX_ENABLED = bool(os.environ.get("NIDX_ENABLED"))
41
-
42
40
 
43
41
  class NidxUtility:
44
- api_client = None
45
- searcher_client = None
42
+ api_client: NidxApiStub
43
+ searcher_client: NidxSearcherStub
46
44
 
47
45
  async def initialize(self):
48
46
  raise NotImplementedError()
@@ -98,6 +96,9 @@ class NidxBindingUtility(NidxUtility):
98
96
 
99
97
  self.config = {
100
98
  "METADATA__DATABASE_URL": ingest_settings.driver_pg_url,
99
+ "SEARCHER__METADATA_REFRESH_INTERVAL": str(
100
+ indexing_settings.index_searcher_refresh_interval
101
+ ),
101
102
  **_storage_config("INDEXER", None),
102
103
  **_storage_config("STORAGE", "nidx"),
103
104
  }
@@ -123,16 +124,8 @@ class NidxBindingUtility(NidxUtility):
123
124
  self.binding.wait_for_sync()
124
125
 
125
126
 
126
- class NidxServiceUtility(NidxUtility):
127
- """Implements Nidx utility connecting to the network service"""
128
-
127
+ class NidxNatsIndexer:
129
128
  def __init__(self):
130
- if indexing_settings.index_nidx_subject is None:
131
- raise ValueError("INDEX_NIDX_SUBJECT needed for nidx utility")
132
-
133
- if not settings.nidx_api_address or not settings.nidx_searcher_address:
134
- raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")
135
-
136
129
  self.nats_connection_manager = NatsConnectionManager(
137
130
  service_name="NidxIndexer",
138
131
  nats_servers=indexing_settings.index_jetstream_servers,
@@ -142,10 +135,6 @@ class NidxServiceUtility(NidxUtility):
142
135
 
143
136
  async def initialize(self):
144
137
  await self.nats_connection_manager.initialize()
145
- self.api_client = NidxApiStub(get_traced_grpc_channel(settings.nidx_api_address, "nidx_api"))
146
- self.searcher_client = NidxSearcherStub(
147
- get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
148
- )
149
138
 
150
139
  async def finalize(self):
151
140
  await self.nats_connection_manager.finalize()
@@ -158,18 +147,68 @@ class NidxServiceUtility(NidxUtility):
158
147
  return res.seq
159
148
 
160
149
 
161
- async def start_nidx_utility() -> Optional[NidxUtility]:
162
- if not NIDX_ENABLED:
163
- return None
150
+ class NidxGrpcIndexer:
151
+ def __init__(self, address):
152
+ self.address = address
164
153
 
165
- nidx = get_nidx()
154
+ async def initialize(self):
155
+ self.client = NidxIndexerStub(get_traced_grpc_channel(self.address, "nidx_indexer"))
156
+
157
+ async def finalize(self):
158
+ pass
159
+
160
+ async def index(self, writer: IndexMessage) -> int:
161
+ await self.client.Index(writer)
162
+ return 0
163
+
164
+
165
+ class NidxServiceUtility(NidxUtility):
166
+ """Implements Nidx utility connecting to the network service"""
167
+
168
+ def __init__(self):
169
+ if not settings.nidx_api_address or not settings.nidx_searcher_address:
170
+ raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")
171
+
172
+ if indexing_settings.index_nidx_subject:
173
+ self.indexer = NidxNatsIndexer()
174
+ elif settings.nidx_indexer_address is not None:
175
+ self.indexer = NidxGrpcIndexer(settings.nidx_indexer_address)
176
+ else:
177
+ raise ValueError("NIDX_INDEXER_ADDRESS or INDEX_NIDX_SUBJECT are required")
178
+
179
+ async def initialize(self):
180
+ await self.indexer.initialize()
181
+ self.api_client = NidxApiStub(get_traced_grpc_channel(settings.nidx_api_address, "nidx_api"))
182
+ self.searcher_client = NidxSearcherStub(
183
+ get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
184
+ )
185
+
186
+ async def finalize(self):
187
+ await self.indexer.finalize()
188
+
189
+ async def index(self, writer: IndexMessage) -> int:
190
+ return await self.indexer.index(writer)
191
+
192
+
193
+ async def start_nidx_utility() -> Optional[NidxUtility]:
194
+ nidx = get_utility(Utility.NIDX)
166
195
  if nidx:
167
196
  return nidx
168
197
 
169
198
  nidx_utility: NidxUtility
170
199
  if settings.standalone_mode:
171
- nidx_utility = NidxBindingUtility()
200
+ if (
201
+ settings.nidx_api_address is not None
202
+ and settings.nidx_searcher_address is not None
203
+ and settings.nidx_indexer_address is not None
204
+ ):
205
+ # Standalone with nidx service (via grpc). This is used in clustered standalone mode
206
+ nidx_utility = NidxServiceUtility()
207
+ else:
208
+ # Normal standalone mode with binding
209
+ nidx_utility = NidxBindingUtility()
172
210
  else:
211
+ # Component deploy with nidx service via grpc & nats (cloud)
173
212
  nidx_utility = NidxServiceUtility()
174
213
 
175
214
  await nidx_utility.initialize()
@@ -178,30 +217,33 @@ async def start_nidx_utility() -> Optional[NidxUtility]:
178
217
 
179
218
 
180
219
  async def stop_nidx_utility():
181
- nidx_utility = get_nidx()
220
+ nidx_utility = get_utility(Utility.NIDX)
182
221
  if nidx_utility:
183
222
  clean_utility(Utility.NIDX)
184
223
  await nidx_utility.finalize()
185
224
 
186
225
 
187
- def get_nidx() -> Optional[NidxUtility]:
188
- return get_utility(Utility.NIDX)
226
+ def get_nidx() -> NidxUtility:
227
+ nidx = get_utility(Utility.NIDX)
228
+ if nidx is None:
229
+ raise Exception("nidx not initialized")
230
+ return nidx
189
231
 
190
232
 
191
- def get_nidx_api_client() -> Optional["NidxApiStub"]:
233
+ def get_nidx_api_client() -> "NidxApiStub":
192
234
  nidx = get_nidx()
193
- if nidx:
235
+ if nidx.api_client:
194
236
  return nidx.api_client
195
237
  else:
196
- return None
238
+ raise Exception("nidx not initialized")
197
239
 
198
240
 
199
- def get_nidx_searcher_client() -> Optional["NidxSearcherStub"]:
241
+ def get_nidx_searcher_client() -> "NidxSearcherStub":
200
242
  nidx = get_nidx()
201
- if nidx:
243
+ if nidx.searcher_client:
202
244
  return nidx.searcher_client
203
245
  else:
204
- return None
246
+ raise Exception("nidx not initialized")
205
247
 
206
248
 
207
249
  # TODO: Remove the index node abstraction
@@ -252,9 +294,6 @@ class FakeNode(AbstractIndexNode):
252
294
  return "nidx"
253
295
 
254
296
 
255
- def get_nidx_fake_node() -> Optional[FakeNode]:
297
+ def get_nidx_fake_node() -> FakeNode:
256
298
  nidx = get_nidx()
257
- if nidx:
258
- return FakeNode(nidx.api_client, nidx.searcher_client)
259
- else:
260
- return None
299
+ return FakeNode(nidx.api_client, nidx.searcher_client)
nucliadb/health.py CHANGED
@@ -40,13 +40,6 @@ def nats_manager_healthy() -> bool:
40
40
  return nats_manager.healthy()
41
41
 
42
42
 
43
- def nodes_health_check() -> bool:
44
- from nucliadb.common.cluster import manager
45
- from nucliadb.ingest.settings import DriverConfig, settings
46
-
47
- return len(manager.INDEX_NODES) > 0 or settings.driver == DriverConfig.LOCAL
48
-
49
-
50
43
  def pubsub_check() -> bool:
51
44
  driver: Optional[PubSubDriver] = get_utility(Utility.PUBSUB)
52
45
  if driver is None:
nucliadb/ingest/app.py CHANGED
@@ -22,10 +22,6 @@ import importlib.metadata
22
22
  from typing import Awaitable, Callable
23
23
 
24
24
  from nucliadb import health
25
- from nucliadb.common.cluster.discovery.utils import (
26
- setup_cluster_discovery,
27
- teardown_cluster_discovery,
28
- )
29
25
  from nucliadb.common.cluster.settings import settings as cluster_settings
30
26
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
31
27
  from nucliadb.common.context import ApplicationContext
@@ -89,13 +85,9 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
89
85
  )
90
86
  finalizers.append(stop_nats_manager)
91
87
 
92
- await setup_cluster_discovery()
93
- finalizers.append(teardown_cluster_discovery)
94
-
95
88
  health.register_health_checks(
96
89
  [
97
90
  health.nats_manager_healthy,
98
- health.nodes_health_check,
99
91
  health.pubsub_check,
100
92
  ]
101
93
  )
@@ -113,7 +113,7 @@ class IndexAuditHandler:
113
113
 
114
114
  for shard_obj in shard_groups:
115
115
  # TODO: Uses node for auditing, don't want to suddenly change metrics
116
- node, shard_id = choose_node(shard_obj, use_nidx=False)
116
+ node, shard_id = choose_node(shard_obj)
117
117
  shard: nodereader_pb2.Shard = await node.reader.GetShard(
118
118
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
119
119
  )
@@ -103,7 +103,7 @@ class ShardCreatorHandler:
103
103
  async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
104
104
  # remember, a lock will do at least 1+ reads and 1 write.
105
105
  # with heavy writes, this adds some simple k/v pressure
106
- node, shard_id = choose_node(current_shard, use_nidx=True)
106
+ node, shard_id = choose_node(current_shard)
107
107
  shard: nodereader_pb2.Shard = await node.reader.GetShard(
108
108
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
109
109
  )
@@ -37,6 +37,7 @@ from nucliadb.common.datamanagers.entities import (
37
37
  from nucliadb.common.maindb.driver import Transaction
38
38
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
39
39
  from nucliadb.ingest.settings import settings
40
+ from nucliadb.search.search.shards import query_shard
40
41
  from nucliadb_protos.knowledgebox_pb2 import (
41
42
  DeletedEntitiesGroups,
42
43
  EntitiesGroup,
@@ -54,8 +55,6 @@ from nucliadb_protos.nodereader_pb2 import (
54
55
  from nucliadb_protos.utils_pb2 import RelationNode
55
56
  from nucliadb_protos.writer_pb2 import GetEntitiesResponse
56
57
  from nucliadb_telemetry import errors
57
- from nucliadb_utils import const
58
- from nucliadb_utils.utilities import has_feature
59
58
 
60
59
  from .exceptions import EntityManagementException
61
60
 
@@ -218,14 +217,13 @@ class EntitiesManager:
218
217
  ],
219
218
  ),
220
219
  )
221
- response = await node.reader.Search(request) # type: ignore
220
+ response = await query_shard(node, shard_id, request)
222
221
  return response.relation
223
222
 
224
223
  results = await shard_manager.apply_for_all_shards(
225
224
  self.kbid,
226
225
  do_entities_search,
227
226
  settings.relation_search_timeout,
228
- use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": self.kbid}),
229
227
  use_read_replica_nodes=self.use_read_replica_nodes,
230
228
  )
231
229
  for result in results:
@@ -315,7 +313,7 @@ class EntitiesManager:
315
313
  paragraph=False,
316
314
  faceted=Faceted(labels=["/e"]),
317
315
  )
318
- response: SearchResponse = await node.reader.Search(request) # type: ignore
316
+ response: SearchResponse = await query_shard(node, shard_id, request)
319
317
  try:
320
318
  facetresults = response.document.facets["/e"].facetresults
321
319
  return {facet.tag.split("/")[-1] for facet in facetresults}
@@ -327,7 +325,6 @@ class EntitiesManager:
327
325
  self.kbid,
328
326
  query_indexed_entities_group_names,
329
327
  settings.relation_types_timeout,
330
- use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": self.kbid}),
331
328
  use_read_replica_nodes=self.use_read_replica_nodes,
332
329
  )
333
330
  for result in results:
@@ -33,6 +33,7 @@ from nucliadb.common.cluster.manager import KBShardManager
33
33
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
34
34
  from nucliadb.common.maindb.driver import Driver
35
35
  from nucliadb.common.maindb.utils import setup_driver, teardown_driver
36
+ from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
36
37
  from nucliadb.ingest import logger
37
38
  from nucliadb_telemetry import errors
38
39
  from nucliadb_telemetry.logs import setup_logging
@@ -135,10 +136,9 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardLocation]:
135
136
  continue
136
137
  else:
137
138
  for shard_object_pb in kb_shards:
138
- for shard_replica_pb in shard_object_pb.replicas:
139
- shard_replica_id = shard_replica_pb.shard.id
140
- node_id = shard_replica_pb.node
141
- stored_shards[shard_replica_id] = ShardLocation(kbid=kbid, node_id=node_id)
139
+ stored_shards[shard_object_pb.nidx_shard_id] = ShardLocation(
140
+ kbid=kbid, node_id="nidx"
141
+ )
142
142
  return stored_shards
143
143
 
144
144
 
@@ -241,6 +241,7 @@ async def main():
241
241
  """
242
242
  args = parse_arguments()
243
243
 
244
+ await start_nidx_utility()
244
245
  await setup_cluster()
245
246
  driver = await setup_driver()
246
247
 
@@ -253,6 +254,7 @@ async def main():
253
254
  finally:
254
255
  await teardown_driver()
255
256
  await teardown_cluster()
257
+ await stop_nidx_utility()
256
258
 
257
259
 
258
260
  def run() -> int: # pragma: no cover
@@ -48,9 +48,7 @@ from nucliadb_protos.noderesources_pb2 import Shard
48
48
  from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
49
49
  from nucliadb_protos.writer_pb2 import Shards
50
50
  from nucliadb_telemetry import errors
51
- from nucliadb_utils import const
52
51
  from nucliadb_utils.authentication import requires, requires_one
53
- from nucliadb_utils.utilities import has_feature
54
52
 
55
53
  MAX_PARAGRAPHS_FOR_SMALL_KB = 250_000
56
54
 
@@ -166,9 +164,7 @@ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
166
164
  queried_shards = []
167
165
  for shard_object in shard_groups:
168
166
  try:
169
- node, shard_id = choose_node(
170
- shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
171
- )
167
+ node, shard_id = choose_node(shard_object)
172
168
  except KeyError:
173
169
  raise HTTPException(
174
170
  status_code=500,
@@ -123,7 +123,6 @@ async def node_query(
123
123
  try:
124
124
  node, shard_id = cluster_manager.choose_node(
125
125
  shard_obj,
126
- use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid}),
127
126
  use_read_replica_nodes=use_read_replica_nodes,
128
127
  target_shard_replicas=target_shard_replicas,
129
128
  )
@@ -224,7 +223,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
224
223
  )
225
224
  else:
226
225
  errors.capture_exception(result)
227
- logger.exception("Error while querying shard data", exc_info=result)
226
+ logger.exception(f"Error while querying shard data {result}", exc_info=result)
228
227
 
229
228
  return HTTPException(status_code=status_code, detail=reason)
230
229
 
@@ -19,6 +19,10 @@
19
19
  #
20
20
  import asyncio
21
21
 
22
+ import backoff
23
+ from grpc import StatusCode
24
+ from grpc.aio import AioRpcError
25
+
22
26
  from nucliadb.common.cluster.base import AbstractIndexNode
23
27
  from nucliadb_protos.nodereader_pb2 import (
24
28
  GetShardRequest,
@@ -39,6 +43,15 @@ node_observer = metrics.Observer(
39
43
  )
40
44
 
41
45
 
46
+ def should_giveup(e: Exception):
47
+ if isinstance(e, AioRpcError) and e.code() != StatusCode.NOT_FOUND:
48
+ return True
49
+ return False
50
+
51
+
52
+ @backoff.on_exception(
53
+ backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
54
+ )
42
55
  async def query_shard(node: AbstractIndexNode, shard: str, query: SearchRequest) -> SearchResponse:
43
56
  req = SearchRequest()
44
57
  req.CopyFrom(query)
@@ -47,6 +60,9 @@ async def query_shard(node: AbstractIndexNode, shard: str, query: SearchRequest)
47
60
  return await node.reader.Search(req) # type: ignore
48
61
 
49
62
 
63
+ @backoff.on_exception(
64
+ backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
65
+ )
50
66
  async def get_shard(node: AbstractIndexNode, shard_id: str) -> Shard:
51
67
  req = GetShardRequest()
52
68
  req.shard_id.id = shard_id
@@ -54,6 +70,9 @@ async def get_shard(node: AbstractIndexNode, shard_id: str) -> Shard:
54
70
  return await node.reader.GetShard(req) # type: ignore
55
71
 
56
72
 
73
+ @backoff.on_exception(
74
+ backoff.expo, Exception, jitter=None, factor=0.1, max_tries=3, giveup=should_giveup
75
+ )
57
76
  async def suggest_shard(node: AbstractIndexNode, shard: str, query: SuggestRequest) -> SuggestResponse:
58
77
  req = SuggestRequest()
59
78
  req.CopyFrom(query)
@@ -32,7 +32,6 @@ import psutil
32
32
  from fastapi import FastAPI
33
33
  from pydantic import BaseModel
34
34
 
35
- from nucliadb.common.cluster import manager as cluster_manager
36
35
  from nucliadb.standalone.settings import Settings
37
36
  from nucliadb_telemetry.settings import LogOutputType, LogSettings
38
37
 
@@ -83,7 +82,6 @@ async def stream_tar(app: FastAPI) -> AsyncGenerator[bytes, None]:
83
82
  with tarfile.open(tar_file, mode="w:gz") as tar:
84
83
  await add_system_info(temp_dir, tar)
85
84
  await add_dependencies(temp_dir, tar)
86
- await add_cluster_info(temp_dir, tar)
87
85
  settings: Settings = app.settings.copy() # type: ignore
88
86
  await add_settings(temp_dir, tar, settings)
89
87
  if settings.log_output_type == LogOutputType.FILE:
@@ -145,29 +143,6 @@ def _add_dependencies_to_tar(temp_dir: str, tar: tarfile.TarFile):
145
143
  tar.add(dependendies_file, arcname="dependencies.txt")
146
144
 
147
145
 
148
- async def add_cluster_info(temp_dir: str, tar: tarfile.TarFile):
149
- loop = asyncio.get_event_loop()
150
- await loop.run_in_executor(None, _add_cluster_info_to_tar, temp_dir, tar)
151
-
152
-
153
- def _add_cluster_info_to_tar(temp_dir: str, tar: tarfile.TarFile):
154
- cluster_info = ClusterInfo(
155
- nodes=[
156
- NodeInfo(
157
- id=node.id,
158
- address=node.address,
159
- shard_count=node.shard_count,
160
- primary_id=node.primary_id,
161
- )
162
- for node in cluster_manager.get_index_nodes()
163
- ]
164
- )
165
- cluster_info_file = os.path.join(temp_dir, "cluster_info.txt")
166
- with open(cluster_info_file, "w") as f:
167
- f.write(cluster_info.model_dump_json(indent=4))
168
- tar.add(cluster_info_file, arcname="cluster_info.txt")
169
-
170
-
171
146
  async def add_settings(temp_dir: str, tar: tarfile.TarFile, settings: Settings):
172
147
  loop = asyncio.get_event_loop()
173
148
  await loop.run_in_executor(None, _add_settings_to_tar, temp_dir, tar, settings)
@@ -22,10 +22,7 @@ from contextlib import asynccontextmanager
22
22
 
23
23
  from fastapi import FastAPI
24
24
 
25
- from nucliadb.common.cluster.discovery.utils import (
26
- setup_cluster_discovery,
27
- teardown_cluster_discovery,
28
- )
25
+ from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
29
26
  from nucliadb.train import SERVICE_NAME
30
27
  from nucliadb.train.utils import (
31
28
  start_shard_manager,
@@ -40,7 +37,7 @@ from nucliadb_utils.utilities import start_audit_utility, stop_audit_utility
40
37
  @asynccontextmanager
41
38
  async def lifespan(app: FastAPI):
42
39
  await setup_telemetry(SERVICE_NAME)
43
- await setup_cluster_discovery()
40
+ await start_nidx_utility()
44
41
  await start_shard_manager()
45
42
  await start_train_grpc(SERVICE_NAME)
46
43
  await start_audit_utility(SERVICE_NAME)
@@ -50,5 +47,5 @@ async def lifespan(app: FastAPI):
50
47
  await stop_audit_utility()
51
48
  await stop_train_grpc()
52
49
  await stop_shard_manager()
53
- await teardown_cluster_discovery()
50
+ await stop_nidx_utility()
54
51
  await clean_telemetry(SERVICE_NAME)
nucliadb/train/nodes.py CHANGED
@@ -45,9 +45,7 @@ from nucliadb_protos.train_pb2 import (
45
45
  TrainSentence,
46
46
  )
47
47
  from nucliadb_protos.writer_pb2 import ShardObject
48
- from nucliadb_utils import const
49
48
  from nucliadb_utils.storages.storage import Storage
50
- from nucliadb_utils.utilities import has_feature
51
49
 
52
50
 
53
51
  class TrainShardManager(manager.KBShardManager):
@@ -63,9 +61,7 @@ class TrainShardManager(manager.KBShardManager):
63
61
  except StopIteration:
64
62
  raise KeyError("Shard not found")
65
63
 
66
- node_obj, shard_id = manager.choose_node(
67
- shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
68
- )
64
+ node_obj, shard_id = manager.choose_node(shard_object)
69
65
  return node_obj, shard_id
70
66
 
71
67
  async def get_kb_obj(self, txn: Transaction, kbid: str) -> Optional[KnowledgeBox]: