nucliadb 6.2.1.post2971__py3-none-any.whl → 6.2.1.post2972__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/cluster/manager.py +33 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/nidx.py +21 -23
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/orm/entities.py +3 -6
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/search/api/v1/knowledgebox.py +1 -5
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/introspect.py +0 -25
- nucliadb/train/lifecycle.py +0 -6
- nucliadb/train/nodes.py +1 -5
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/settings.py +2 -2
- {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/METADATA +5 -7
- {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/RECORD +26 -36
- nucliadb/common/cluster/discovery/__init__.py +0 -19
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.1.post2971.dist-info → nucliadb-6.2.1.post2972.dist-info}/zip-safe +0 -0
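Taken together, the hunks below remove the index-node discovery and replica machinery (the deleted discovery/ and standalone/ modules above) and leave nidx as the only index backend. A minimal sketch of what that means at a call site, assuming the module's own choose_node and a shard object with nidx_shard_id populated (the wrapper function here is hypothetical):

# Sketch: the use_nidx flag disappears in 6.2.1.post2972.
from nucliadb.common.cluster.manager import choose_node
from nucliadb_protos import writer_pb2


def query_target(shard_obj: writer_pb2.ShardObject):
    # post2971: choose_node(shard_obj, use_nidx=True, use_read_replica_nodes=False)
    # post2972: nidx is the only backend, so the keyword is gone.
    return choose_node(shard_obj, use_read_replica_nodes=False)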
nucliadb/common/cluster/manager.py

@@ -27,31 +27,25 @@ import backoff
 from nucliadb.common import datamanagers
 from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.exceptions import (
-    ExhaustedNodesError,
     NodeClusterSmall,
     NodeError,
     NodesUnsync,
-    NoHealthyNodeAvailable,
     ShardNotFound,
     ShardsNotFound,
 )
 from nucliadb.common.maindb.driver import Transaction
-from nucliadb.common.nidx import
+from nucliadb.common.nidx import get_nidx, get_nidx_api_client, get_nidx_fake_node
 from nucliadb_protos import (
     knowledgebox_pb2,
-    nodereader_pb2,
     noderesources_pb2,
     nodewriter_pb2,
     writer_pb2,
 )
 from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, NewShardRequest, TypeMessage
 from nucliadb_telemetry import errors
-from nucliadb_utils.utilities import
+from nucliadb_utils.utilities import get_storage

-from .index_node import IndexNode
 from .settings import settings
-from .standalone.index_node import ProxyStandaloneIndexNode
-from .standalone.utils import get_self, get_standalone_node_id, is_index_node

 logger = logging.getLogger(__name__)

@@ -60,67 +54,11 @@ READ_REPLICA_INDEX_NODES: dict[str, set[str]] = {}


 def get_index_nodes(include_secondary: bool = False) -> list[AbstractIndexNode]:
-
-    if not include_secondary:
-        return [inode for inode in all_nodes if inode.primary_id is None]
-    return all_nodes
+    return [get_nidx_fake_node()]


 def get_index_node(node_id: str) -> Optional[AbstractIndexNode]:
-    return
-
-
-def clear_index_nodes():
-    INDEX_NODES.clear()
-    READ_REPLICA_INDEX_NODES.clear()
-
-
-def get_read_replica_node_ids(node_id: str) -> list[str]:
-    return list(READ_REPLICA_INDEX_NODES.get(node_id, set()))
-
-
-def add_index_node(
-    *,
-    id: str,
-    address: str,
-    shard_count: int,
-    available_disk: int,
-    dummy: bool = False,
-    primary_id: Optional[str] = None,
-) -> AbstractIndexNode:
-    if settings.standalone_mode:
-        if is_index_node() and id == get_standalone_node_id():
-            node = get_self()
-        else:
-            node = ProxyStandaloneIndexNode(
-                id=id,
-                address=address,
-                shard_count=shard_count,
-                available_disk=available_disk,
-                dummy=dummy,
-            )
-    else:
-        node = IndexNode(  # type: ignore
-            id=id,
-            address=address,
-            shard_count=shard_count,
-            available_disk=available_disk,
-            dummy=dummy,
-            primary_id=primary_id,
-        )
-    INDEX_NODES[id] = node
-    if primary_id is not None:
-        if primary_id not in READ_REPLICA_INDEX_NODES:
-            READ_REPLICA_INDEX_NODES[primary_id] = set()
-        READ_REPLICA_INDEX_NODES[primary_id].add(id)
-    return node
-
-
-def remove_index_node(node_id: str, primary_id: Optional[str] = None) -> None:
-    INDEX_NODES.pop(node_id, None)
-    if primary_id is not None and primary_id in READ_REPLICA_INDEX_NODES:
-        if node_id in READ_REPLICA_INDEX_NODES[primary_id]:
-            READ_REPLICA_INDEX_NODES[primary_id].remove(node_id)
+    return get_nidx_fake_node()


 class KBShardManager:
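Both lookup helpers above now resolve to the same nidx-backed object regardless of input. A toy illustration of the resulting contract, with a stand-in type (hypothetical, for illustration only):

# Stand-in illustrating the collapsed lookups; _NidxFakeNode plays the role
# of the object returned by get_nidx_fake_node().
class _NidxFakeNode:
    pass


_NIDX = _NidxFakeNode()


def get_index_nodes(include_secondary: bool = False) -> list[_NidxFakeNode]:
    return [_NIDX]  # the registry of physical index nodes no longer exists


def get_index_node(node_id: str) -> _NidxFakeNode:
    return _NIDX  # node_id is accepted for compatibility but ignored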
@@ -145,16 +83,13 @@ class KBShardManager:
         aw: Callable[[AbstractIndexNode, str], Awaitable[Any]],
         timeout: float,
         *,
-        use_nidx: bool,
         use_read_replica_nodes: bool = False,
     ) -> list[Any]:
         shards = await self.get_shards_by_kbid(kbid)
         ops = []

         for shard_obj in shards:
-            node, shard_id = choose_node(
-                shard_obj, use_nidx=use_nidx, use_read_replica_nodes=use_read_replica_nodes
-            )
+            node, shard_id = choose_node(shard_obj, use_read_replica_nodes=use_read_replica_nodes)
             if shard_id is None:
                 raise ShardNotFound("Found a node but not a shard")

@@ -190,27 +125,12 @@ class KBShardManager:
         txn: Transaction,
         kbid: str,
     ) -> writer_pb2.ShardObject:
-        try:
-            check_enough_nodes()
-        except NodeClusterSmall as err:
-            errors.capture_exception(err)
-            logger.error(
-                f"Shard creation for kbid={kbid} failed: Replication requirements could not be met."
-            )
-            raise
-
         kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
         if kb_shards is None:
             msg = ("Attempting to create a shard for a KB when it has no stored shards in maindb",)
             logger.error(msg, extra={"kbid": kbid})
             raise ShardsNotFound(msg)

-        existing_kb_nodes = [replica.node for shard in kb_shards.shards for replica in shard.replicas]
-        nodes = sorted_primary_nodes(
-            avoid_nodes=existing_kb_nodes,
-            ignore_nodes=settings.drain_nodes,
-        )
-
         vectorsets = {
             vectorset_id: vectorset_config.vectorset_index_config
             async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
@@ -220,64 +140,14 @@ class KBShardManager:

         shard = writer_pb2.ShardObject(shard=shard_uuid, read_only=False)
         try:
-
-
-
-
-
-
-
-
-                    raise ExhaustedNodesError()
-
-                node = get_index_node(node_id)
-                if node is None:
-                    logger.error(f"Node {node_id} is not found or not available")
-                    continue
-
-                try:
-                    if not vectorsets:
-                        # bw/c KBs without vectorsets
-                        is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
-                        vector_index_config = nodewriter_pb2.VectorIndexConfig(
-                            similarity=kb_shards.similarity,
-                            vector_type=nodewriter_pb2.VectorType.DENSE_F32,
-                            vector_dimension=kb_shards.model.vector_dimension,
-                            normalize_vectors=is_matryoshka,
-                        )
-
-                        shard_created = await node.new_shard(
-                            kbid,
-                            vector_index_config=vector_index_config,
-                        )
-
-                    else:
-                        shard_created = await node.new_shard_with_vectorsets(
-                            kbid,
-                            vectorsets_configs=vectorsets,
-                        )
-
-                except Exception as exc:
-                    errors.capture_exception(exc)
-                    logger.exception(
-                        f"Error creating new shard for KB", extra={"kbid": kbid, "node_id": node}
-                    )
-                    continue
-
-                replica = writer_pb2.ShardReplica(node=str(node_id))
-                replica.shard.CopyFrom(shard_created)
-                shard.replicas.append(replica)
-                replicas_created += 1
-
-            nidx_api = get_nidx_api_client()
-            if nidx_api:
-                req = NewShardRequest(
-                    kbid=kbid,
-                    vectorsets_configs=vectorsets,
-                )
-
-                resp = await nidx_api.NewShard(req)  # type: ignore
-                shard.nidx_shard_id = resp.id
+            nidx_api = get_nidx_api_client()
+            req = NewShardRequest(
+                kbid=kbid,
+                vectorsets_configs=vectorsets,
+            )
+
+            resp = await nidx_api.NewShard(req)  # type: ignore
+            shard.nidx_shard_id = resp.id

         except Exception as exc:
             errors.capture_exception(exc)
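The replica loop that used to create one shard per index node is reduced to a single gRPC call. A condensed sketch of the new creation step, using only names that appear in the hunk (the standalone function form is ours):

from nucliadb_protos.nodewriter_pb2 import NewShardRequest


async def create_nidx_shard(nidx_api, kbid: str, vectorsets: dict) -> str:
    # One NewShard RPC replaces the old per-replica node loop.
    req = NewShardRequest(kbid=kbid, vectorsets_configs=vectorsets)
    resp = await nidx_api.NewShard(req)
    return resp.id  # recorded on the shard as nidx_shard_id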
@@ -300,43 +170,15 @@ class KBShardManager:
         return shard

     async def rollback_shard(self, shard: writer_pb2.ShardObject):
-        for shard_replica in shard.replicas:
-            node_id = shard_replica.node
-            replica_id = shard_replica.shard.id
-            node = get_index_node(node_id)
-            if node is not None:
-                try:
-                    logger.info(
-                        "Deleting shard replica",
-                        extra={"shard": replica_id, "node": node_id},
-                    )
-                    await node.delete_shard(replica_id)
-                except Exception as rollback_error:
-                    errors.capture_exception(rollback_error)
-                    logger.error(
-                        f"New shard rollback error. Node: {node_id} Shard: {replica_id}",
-                        exc_info=True,
-                    )
-
         nidx_api = get_nidx_api_client()
-
-
-
-
-
-
-
-
-            )
-
-    def indexing_replicas(self, shard: writer_pb2.ShardObject) -> list[tuple[str, str]]:
-        """
-        Returns the replica ids and nodes for the shard replicas
-        """
-        result = []
-        for replica in shard.replicas:
-            result.append((replica.shard.id, replica.node))
-        return result
+        try:
+            await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
+        except Exception as rollback_error:
+            errors.capture_exception(rollback_error)
+            logger.error(
+                f"New shard rollback error. Nidx Shard: {shard.nidx_shard_id}",
+                exc_info=True,
+            )

     async def delete_resource(
         self,
@@ -346,29 +188,16 @@ class KBShardManager:
         partition: str,
         kb: str,
     ) -> None:
-        indexing = get_indexing()
         storage = await get_storage()
         nidx = get_nidx()

         await storage.delete_indexing(resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard)

-
-
-
-
-
-            indexpb.resource = uuid
-            indexpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
-            indexpb.partition = partition
-            indexpb.kbid = kb
-            await indexing.index(indexpb, node_id)
-
-        if nidx is not None and shard.nidx_shard_id:
-            nidxpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
-            nidxpb.shard = shard.nidx_shard_id
-            nidxpb.resource = uuid
-            nidxpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
-            await nidx.index(nidxpb)
+        nidxpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
+        nidxpb.shard = shard.nidx_shard_id
+        nidxpb.resource = uuid
+        nidxpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
+        await nidx.index(nidxpb)

     async def add_resource(
         self,
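Resource deletion follows the same pattern: instead of fanning out one message per replica, the manager now emits a single DELETION message addressed to the shard's nidx id. A self-contained sketch of that message, built from the protobuf fields visible in the hunk (the helper name is ours):

from nucliadb_protos import nodewriter_pb2


def deletion_message(nidx_shard_id: str, resource_uuid: str) -> nodewriter_pb2.IndexMessage:
    # Shard id, resource id and the DELETION type are all nidx needs.
    msg = nodewriter_pb2.IndexMessage()
    msg.shard = nidx_shard_id
    msg.resource = resource_uuid
    msg.typemessage = nodewriter_pb2.TypeMessage.DELETION
    return msg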
@@ -389,7 +218,6 @@ class KBShardManager:
         reindex_id = uuid.uuid4().hex

         storage = await get_storage()
-        indexing = get_indexing()
         nidx = get_nidx()
         indexpb = IndexMessage()

@@ -412,14 +240,8 @@
         indexpb.source = source
         indexpb.resource = resource.resource.uuid

-
-
-            indexpb.shard = replica_id
-            await indexing.index(indexpb, node_id)
-
-        if nidx is not None and shard.nidx_shard_id:
-            indexpb.shard = shard.nidx_shard_id
-            await nidx.index(indexpb)
+        indexpb.shard = shard.nidx_shard_id
+        await nidx.index(indexpb)

     def should_create_new_shard(self, num_paragraphs: int) -> bool:
         return num_paragraphs > settings.max_shard_paragraphs
@@ -451,12 +273,8 @@ class KBShardManager:
                 )

         await self.apply_for_all_shards(
-            kbid, _create_vectorset, timeout=10,
+            kbid, _create_vectorset, timeout=10, use_read_replica_nodes=False
         )
-        if NIDX_ENABLED:
-            await self.apply_for_all_shards(
-                kbid, _create_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
-            )

     async def delete_vectorset(self, kbid: str, vectorset_id: str):
         """Delete a vectorset from all KB shards"""
@@ -469,12 +287,8 @@ class KBShardManager:
                 )

         await self.apply_for_all_shards(
-            kbid, _delete_vectorset, timeout=10,
+            kbid, _delete_vectorset, timeout=10, use_read_replica_nodes=False
         )
-        if NIDX_ENABLED:
-            await self.apply_for_all_shards(
-                kbid, _delete_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
-            )


 class StandaloneKBShardManager(KBShardManager):
@@ -485,27 +299,6 @@ class StandaloneKBShardManager(KBShardManager):
         self._lock = asyncio.Lock()
         self._change_count: dict[tuple[str, str], int] = {}

-    async def _resource_change_event(self, kbid: str, node_id: str, shard_id: str) -> None:
-        if (node_id, shard_id) not in self._change_count:
-            self._change_count[(node_id, shard_id)] = 0
-        self._change_count[(node_id, shard_id)] += 1
-        if self._change_count[(node_id, shard_id)] < self.max_ops_before_checks:
-            return
-
-        self._change_count[(node_id, shard_id)] = 0
-        async with self._lock:
-            index_node: Optional[ProxyStandaloneIndexNode] = get_index_node(node_id)  # type: ignore
-            if index_node is None:
-                return
-            shard_info: noderesources_pb2.Shard = await index_node.reader.GetShard(
-                nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
-            )
-            await self.maybe_create_new_shard(
-                kbid,
-                shard_info.paragraphs,
-            )
-            await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
-
     @backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
     async def delete_resource(
         self,
@@ -518,16 +311,6 @@ class StandaloneKBShardManager(KBShardManager):
         req = noderesources_pb2.ResourceID()
         req.uuid = uuid

-        for shardreplica in shard.replicas:
-            req.shard_id = shardreplica.shard.id
-            index_node = get_index_node(shardreplica.node)
-            if index_node is None:  # pragma: no cover
-                raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
-            await index_node.writer.RemoveResource(req)  # type: ignore
-            asyncio.create_task(
-                self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
-            )
-
         nidx = get_nidx()
         if nidx is not None and shard.nidx_shard_id:
             indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
@@ -551,16 +334,6 @@ class StandaloneKBShardManager(KBShardManager):
         Calls the node writer's SetResource method directly to store the resource in the node.
         There is no queuing for standalone nodes at the moment -- indexing is done synchronously.
         """
-        index_node = None
-        for shardreplica in shard.replicas:
-            resource.shard_id = resource.resource.shard_id = shardreplica.shard.id
-            index_node = get_index_node(shardreplica.node)
-            if index_node is None:  # pragma: no cover
-                raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
-            await index_node.writer.SetResource(resource)  # type: ignore
-            asyncio.create_task(
-                self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
-            )

         nidx = get_nidx()
         if nidx is not None and shard.nidx_shard_id:
@@ -587,89 +360,18 @@ class StandaloneKBShardManager(KBShardManager):
         pass


-def get_all_shard_nodes(
-    shard: writer_pb2.ShardObject,
-    *,
-    use_read_replicas: bool,
-) -> list[tuple[AbstractIndexNode, str]]:
-    """Return a list of all nodes containing `shard` with the shard replica id.
-    If `use_read_replicas`, read replica nodes will be returned too.
-
-    """
-    nodes = []
-    for shard_replica_pb in shard.replicas:
-        node_id = shard_replica_pb.node
-        shard_replica_id = shard_replica_pb.shard.id
-
-        node = get_index_node(node_id)
-        if node is not None:
-            nodes.append((node, shard_replica_id))
-
-        if use_read_replicas:
-            for read_replica_node_id in get_read_replica_node_ids(node_id):
-                read_replica_node = get_index_node(read_replica_node_id)
-                if read_replica_node is not None:
-                    nodes.append((read_replica_node, shard_replica_id))
-
-    return nodes
-
-
 def choose_node(
     shard: writer_pb2.ShardObject,
     *,
-    use_nidx: bool,
     target_shard_replicas: Optional[list[str]] = None,
     use_read_replica_nodes: bool = False,
 ) -> tuple[AbstractIndexNode, str]:
-
-
-    - when enabled, read replica nodes are preferred over primaries
-    - if there's more than one option with the same score, a random choice will
-      be made between them.
-
-    According to these rules and considering we use read replica nodes, a read
-    replica node containing a shard replica from `target_shard_replicas` is the
-    most preferent, while a primary node with a shard not in
-    `target_shard_replicas` is the least preferent.
-
-    """
-
-    # Use nidx if requested and enabled, fallback to node
-    if shard.nidx_shard_id and use_nidx:
-        fake_node = get_nidx_fake_node()
-        if fake_node:
-            return fake_node, shard.nidx_shard_id
-
-    target_shard_replicas = target_shard_replicas or []
-
-    shard_nodes = get_all_shard_nodes(shard, use_read_replicas=use_read_replica_nodes)
-
-    if len(shard_nodes) == 0:
-        raise NoHealthyNodeAvailable("Could not find a node to query")
-
-    # Ranking values
-    IN_TARGET_SHARD_REPLICAS = 0b10
-    IS_READ_REPLICA_NODE = 0b01
-
-    ranked_nodes: dict[int, list[tuple[AbstractIndexNode, str]]] = {}
-    for node, shard_replica_id in shard_nodes:
-        score = 0
-        if shard_replica_id in target_shard_replicas:
-            score |= IN_TARGET_SHARD_REPLICAS
-        if node.is_read_replica():
-            score |= IS_READ_REPLICA_NODE
-
-        ranked_nodes.setdefault(score, []).append((node, shard_replica_id))
-
-    top = ranked_nodes[max(ranked_nodes)]
-    # As shard replica ids are random numbers, we sort by shard replica id and choose its
-    # node to make sure we choose in deterministically but we don't favour any node in particular
-    top.sort(key=lambda x: x[1])
-    selected_node, shard_replica_id = top[0]
-    return selected_node, shard_replica_id
+    fake_node = get_nidx_fake_node()
+    return fake_node, shard.nidx_shard_id


 def check_enough_nodes():
+    return True
     """
     It raises an exception if it can't find enough nodes for the configured replicas.
     """
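For reference, the deleted choose_node body implemented a small ranking scheme: each candidate scored two bits (its replica is listed in target_shard_replicas, its node is a read replica), the best-scoring bucket won, and ties broke deterministically by sorting on the replica id. A self-contained distillation of that logic, with a stand-in Node type (hypothetical; kept only as documentation of the removed behaviour):

from dataclasses import dataclass


@dataclass
class Node:
    id: str
    read_replica: bool = False

    def is_read_replica(self) -> bool:
        return self.read_replica


def rank(
    shard_nodes: list[tuple[Node, str]],
    target_shard_replicas: list[str],
) -> tuple[Node, str]:
    IN_TARGET_SHARD_REPLICAS = 0b10
    IS_READ_REPLICA_NODE = 0b01

    ranked: dict[int, list[tuple[Node, str]]] = {}
    for node, replica_id in shard_nodes:
        score = 0
        if replica_id in target_shard_replicas:
            score |= IN_TARGET_SHARD_REPLICAS
        if node.is_read_replica():
            score |= IS_READ_REPLICA_NODE
        ranked.setdefault(score, []).append((node, replica_id))

    top = ranked[max(ranked)]
    top.sort(key=lambda item: item[1])  # deterministic, node-neutral tie-break
    return top[0]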
nucliadb/common/cluster/rebalance.py

@@ -52,7 +52,7 @@ async def get_shards_paragraphs(kbid: str) -> list[tuple[str, int]]:
     results = {}
     for shard_meta in kb_shards.shards:
         # Rebalance using node as source of truth. But it will rebalance nidx
-        node, shard_id = choose_node(shard_meta
+        node, shard_id = choose_node(shard_meta)
         shard_data: nodereader_pb2.Shard = await node.reader.GetShard(
             nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
         )
@@ -102,7 +102,7 @@ async def move_set_of_kb_resources(
     from_shard = [s for s in kb_shards.shards if s.shard == from_shard_id][0]
     to_shard = [s for s in kb_shards.shards if s.shard == to_shard_id][0]

-    from_node, from_shard_replica_id = choose_node(from_shard
+    from_node, from_shard_replica_id = choose_node(from_shard)
     search_response: nodereader_pb2.SearchResponse = await from_node.reader.Search(  # type: ignore
         nodereader_pb2.SearchRequest(
             shard=from_shard_replica_id,
nucliadb/common/cluster/rollover.py

@@ -24,7 +24,6 @@ from datetime import datetime
 from typing import Optional

 from nucliadb.common import datamanagers, locking
-from nucliadb.common.cluster import manager as cluster_manager
 from nucliadb.common.context import ApplicationContext
 from nucliadb.common.datamanagers.rollover import RolloverState, RolloverStateNotFoundError
 from nucliadb.common.external_index_providers.base import ExternalIndexManager
@@ -32,11 +31,9 @@ from nucliadb.common.external_index_providers.manager import (
     get_external_index_manager,
 )
 from nucliadb.common.nidx import get_nidx_fake_node
-from nucliadb_protos import
+from nucliadb_protos import writer_pb2
 from nucliadb_telemetry import errors

-from .manager import get_index_node
-from .settings import settings
 from .utils import (
     delete_resource_from_shard,
     get_resource,
@@ -138,63 +135,19 @@ async def create_rollover_shards(
     # create new shards
     created_shards = []
     try:
-        nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
         for shard in kb_shards.shards:
             shard.ClearField("replicas")
-
-
-
-
-
-
-
-
-
-
-
-                continue
-
-            vectorsets = {
-                vectorset_id: vectorset_config.vectorset_index_config
-                async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
-                    txn, kbid=kbid
-                )
-            }
-            try:
-                if not vectorsets:
-                    is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
-                    vector_index_config = nodewriter_pb2.VectorIndexConfig(
-                        similarity=kb_shards.similarity,
-                        vector_type=nodewriter_pb2.VectorType.DENSE_F32,
-                        vector_dimension=kb_shards.model.vector_dimension,
-                        normalize_vectors=is_matryoshka,
-                    )
-                    shard_created = await node.new_shard(
-                        kbid,
-                        vector_index_config=vector_index_config,
-                    )
-                else:
-                    shard_created = await node.new_shard_with_vectorsets(
-                        kbid,
-                        vectorsets_configs=vectorsets,
-                    )
-            except Exception as e:
-                errors.capture_exception(e)
-                logger.exception(f"Error creating new shard at {node}")
-                continue
-
-            replica = writer_pb2.ShardReplica(node=str(node_id))
-            replica.shard.CopyFrom(shard_created)
-            shard.replicas.append(replica)
-            created_shards.append(shard)
-            replicas_created += 1
-
-            if nidx_node:
-                nidx_shard = await nidx_node.new_shard_with_vectorsets(
-                    kbid,
-                    vectorsets_configs=vectorsets,
-                )
-                shard.nidx_shard_id = nidx_shard.id
+            vectorsets = {
+                vectorset_id: vectorset_config.vectorset_index_config
+                async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
+            }
+
+            nidx_shard = await nidx_node.new_shard_with_vectorsets(
+                kbid,
+                vectorsets_configs=vectorsets,
+            )
+            shard.nidx_shard_id = nidx_shard.id
+            created_shards.append(shard)

     except Exception as e:
         errors.capture_exception(e)
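Rollover shard creation mirrors the manager change: each logical shard gets exactly one nidx shard via new_shard_with_vectorsets. A condensed sketch of the loop above in standalone form (hypothetical signature; nidx_node stands for the get_nidx_fake_node() handle):

async def recreate_shards(nidx_node, kbid: str, kb_shards, vectorsets) -> list:
    created = []
    for shard in kb_shards.shards:
        shard.ClearField("replicas")  # old per-node replica records are dropped
        nidx_shard = await nidx_node.new_shard_with_vectorsets(
            kbid, vectorsets_configs=vectorsets
        )
        shard.nidx_shard_id = nidx_shard.id
        created.append(shard)
    return created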
@@ -621,16 +574,6 @@ async def clean_rollover_status(app_context: ApplicationContext, kbid: str) -> N
         await txn.commit()


-async def wait_for_cluster_ready() -> None:
-    node_ready_checks = 0
-    while len(cluster_manager.INDEX_NODES) == 0:
-        if node_ready_checks > 10:
-            raise Exception("No index nodes available")
-        logger.info("Waiting for index nodes to be available")
-        await asyncio.sleep(1)
-        node_ready_checks += 1
-
-
 async def rollover_kb_index(
     app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
 ) -> None:
@@ -654,8 +597,6 @@ async def rollover_kb_index(
     - Validate that all resources are in the new kb index
     - Clean up indexed data
     """
-    await wait_for_cluster_ready()
-
     extra = {"kbid": kbid, "external_index_provider": None}
     external = await get_external_index_manager(kbid, for_rollover=True)
     if external is not None: