nucliadb 6.9.1.post5187__py3-none-any.whl → 6.9.1.post5207__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

@@ -43,8 +43,6 @@ from nucliadb_protos import knowledgebox_pb2, writer_pb2
43
43
  from nucliadb_telemetry import errors
44
44
  from nucliadb_utils.utilities import get_storage
45
45
 
46
- from .settings import settings
47
-
48
46
  logger = logging.getLogger(__name__)
49
47
 
50
48
 
@@ -113,6 +111,8 @@ class KBShardManager:
113
111
  self,
114
112
  txn: Transaction,
115
113
  kbid: str,
114
+ *,
115
+ prewarm_enabled: bool,
116
116
  ) -> writer_pb2.ShardObject:
117
117
  kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
118
118
  if kb_shards is None:
@@ -133,6 +133,7 @@ class KBShardManager:
133
133
  req = NewShardRequest(
134
134
  kbid=kbid,
135
135
  vectorsets_configs=vectorsets,
136
+ prewarm_enabled=prewarm_enabled,
136
137
  )
137
138
 
138
139
  resp = await nidx_api.NewShard(req) # type: ignore
@@ -232,23 +233,6 @@ class KBShardManager:
232
233
  indexpb.shard = shard.nidx_shard_id
233
234
  await nidx.index(indexpb)
234
235
 
235
- def should_create_new_shard(self, num_paragraphs: int) -> bool:
236
- return num_paragraphs > settings.max_shard_paragraphs
237
-
238
- async def maybe_create_new_shard(
239
- self,
240
- kbid: str,
241
- num_paragraphs: int,
242
- ):
243
- if not self.should_create_new_shard(num_paragraphs):
244
- return
245
-
246
- logger.info({"message": "Adding shard", "kbid": kbid})
247
-
248
- async with datamanagers.with_transaction() as txn:
249
- await self.create_shard_by_kbid(txn, kbid)
250
- await txn.commit()
251
-
252
236
  async def create_vectorset(self, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
253
237
  """Create a new vectorset in all KB shards."""
254
238
 
@@ -18,162 +18,494 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
+ import dataclasses
21
22
  import logging
23
+ import math
24
+ import random
25
+ from typing import Optional
22
26
 
27
+ import aioitertools
28
+ from grpc import StatusCode
29
+ from grpc.aio import AioRpcError
23
30
  from nidx_protos import nodereader_pb2, noderesources_pb2
24
31
 
25
32
  from nucliadb.common import datamanagers, locking
26
33
  from nucliadb.common.cluster.utils import get_shard_manager
27
34
  from nucliadb.common.context import ApplicationContext
35
+ from nucliadb.common.datamanagers.resources import KB_RESOURCE_SHARD
28
36
  from nucliadb.common.nidx import get_nidx_api_client, get_nidx_searcher_client
37
+ from nucliadb_protos import writer_pb2
29
38
  from nucliadb_telemetry import errors
30
39
  from nucliadb_telemetry.logs import setup_logging
31
40
  from nucliadb_telemetry.utils import setup_telemetry
41
+ from nucliadb_utils import const
32
42
  from nucliadb_utils.fastapi.run import serve_metrics
43
+ from nucliadb_utils.utilities import has_feature
33
44
 
34
45
  from .settings import settings
35
- from .utils import delete_resource_from_shard, index_resource_to_shard
46
+ from .utils import delete_resource_from_shard, index_resource_to_shard, wait_for_nidx
36
47
 
37
48
  logger = logging.getLogger(__name__)
38
49
 
39
50
  REBALANCE_LOCK = "rebalance"
40
51
 
41
-
42
- async def get_shards_paragraphs(kbid: str) -> list[tuple[str, int]]:
43
- """
44
- Ordered shard -> num paragraph by number of paragraphs
45
- """
46
- async with datamanagers.with_ro_transaction() as txn:
47
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
48
- if kb_shards is None:
49
- return []
50
-
51
- results = {}
52
- for shard_meta in kb_shards.shards:
53
- # Rebalance using node as source of truth. But it will rebalance nidx
54
- shard_data: nodereader_pb2.Shard = await get_nidx_api_client().GetShard(
55
- nodereader_pb2.GetShardRequest(
56
- shard_id=noderesources_pb2.ShardId(id=shard_meta.nidx_shard_id)
57
- ) # type: ignore
52
+ MAX_MOVES_PER_SHARD = 100
53
+
54
+
55
+ @dataclasses.dataclass
56
+ class RebalanceShard:
57
+ id: str
58
+ nidx_id: str
59
+ paragraphs: int
60
+ active: bool
61
+
62
+ def to_dict(self):
63
+ return self.__dict__
64
+
65
+
66
+ class Rebalancer:
67
+ def __init__(self, context: ApplicationContext, kbid: str):
68
+ self.context = context
69
+ self.kbid = kbid
70
+ self.kb_shards: Optional[writer_pb2.Shards] = None
71
+ self.index: dict[str, set[str]] = {}
72
+
73
+ async def get_rebalance_shards(self) -> list[RebalanceShard]:
74
+ """
75
+ Return the sorted list of shards by increasing paragraph count.
76
+ """
77
+ self.kb_shards = await datamanagers.atomic.cluster.get_kb_shards(kbid=self.kbid)
78
+ if self.kb_shards is None: # pragma: no cover
79
+ return []
80
+ return list(
81
+ sorted(
82
+ [
83
+ RebalanceShard(
84
+ id=shard.shard,
85
+ nidx_id=shard.nidx_shard_id,
86
+ paragraphs=await get_shard_paragraph_count(shard.nidx_shard_id),
87
+ active=(idx == self.kb_shards.actual),
88
+ )
89
+ for idx, shard in enumerate(self.kb_shards.shards)
90
+ ],
91
+ key=lambda x: x.paragraphs,
92
+ )
58
93
  )
59
- results[shard_meta.shard] = shard_data.paragraphs
60
94
 
61
- return [(shard, paragraphs) for shard, paragraphs in sorted(results.items(), key=lambda x: x[1])]
95
+ async def build_shard_resources_index(self):
96
+ async with datamanagers.with_ro_transaction() as txn:
97
+ iterable = datamanagers.resources.iterate_resource_ids(kbid=self.kbid)
98
+ async for resources_batch in aioitertools.batched(iterable, n=200):
99
+ shards = await txn.batch_get(
100
+ keys=[KB_RESOURCE_SHARD.format(kbid=self.kbid, uuid=rid) for rid in resources_batch],
101
+ for_update=False,
102
+ )
103
+ for rid, shard_bytes in zip(resources_batch, shards):
104
+ if shard_bytes is not None:
105
+ self.index.setdefault(shard_bytes.decode(), set()).add(rid)
106
+
107
+ async def move_paragraphs(
108
+ self, from_shard: RebalanceShard, to_shard: RebalanceShard, max_paragraphs: int
109
+ ) -> int:
110
+ """
111
+ Takes random resources from the source shard and tries to move at most max_paragraphs.
112
+ It stops moving paragraphs when there are no more resources to move.
113
+ """
114
+ moved_paragraphs = 0
115
+
116
+ while moved_paragraphs < max_paragraphs:
117
+ # Take a random resource to move
118
+ try:
119
+ resource_id = random.choice(tuple(self.index[from_shard.id]))
120
+ except (KeyError, IndexError):
121
+ # No more resources in shard or shard not found
122
+ break
123
+
124
+ assert self.kb_shards is not None
125
+ from_shard_obj = next(s for s in self.kb_shards.shards if s.shard == from_shard.id)
126
+ to_shard_obj = next(s for s in self.kb_shards.shards if s.shard == to_shard.id)
127
+ paragraphs_count = await get_resource_paragraphs_count(resource_id, from_shard.nidx_id)
128
+ moved = await move_resource_to_shard(
129
+ self.context, self.kbid, resource_id, from_shard_obj, to_shard_obj
130
+ )
131
+ if moved:
132
+ self.index[from_shard.id].remove(resource_id)
133
+ self.index.setdefault(to_shard.id, set()).add(resource_id)
134
+ moved_paragraphs += paragraphs_count
62
135
 
136
+ return moved_paragraphs
63
137
 
64
- async def maybe_add_shard(kbid: str) -> None:
65
- async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
66
- async with datamanagers.with_ro_transaction() as txn:
67
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
68
- if kb_shards is None:
138
+ async def wait_for_indexing(self):
139
+ try:
140
+ self.context.nats_manager
141
+ except AssertionError: # pragma: no cover
142
+ logger.warning(f"Nats manager not initialized. Cannot wait for indexing")
69
143
  return
144
+ while True:
145
+ try:
146
+ await wait_for_nidx(self.context.nats_manager, max_wait_seconds=60, max_pending=1000)
147
+ return
148
+ except asyncio.TimeoutError:
149
+ logger.warning("Nidx is behind. Backing off rebalancing.", extra={"kbid": self.kbid})
150
+ await asyncio.sleep(30)
151
+
152
+ async def rebalance_shards(self):
153
+ """
154
+ Iterate over shards until none of them need more rebalancing.
155
+
156
+ Will move excess of paragraphs to other shards (potentially creating new ones), and
157
+ merge small shards together when possible (potentially deleting empty ones).
158
+
159
+
160
+ Merge chooses a <90% filled shard and fills it to almost 100%
161
+ Split chooses a >110% filled shard and reduces it to 100%
162
+ If the shard is between 90% and 110% full, nobody touches it
163
+ """
164
+ await self.build_shard_resources_index()
165
+ while True:
166
+ await self.wait_for_indexing()
167
+
168
+ shards = await self.get_rebalance_shards()
169
+
170
+ # Any shards to split?
171
+ shard_to_split = next((s for s in shards[::-1] if needs_split(s)), None)
172
+ if shard_to_split is not None:
173
+ await self.split_shard(shard_to_split, shards)
174
+ continue
175
+
176
+ # Any shards to merge?
177
+ shard_to_merge = next((s for s in shards if needs_merge(s, shards)), None)
178
+ if shard_to_merge is not None:
179
+ await self.merge_shard(shard_to_merge, shards)
180
+ else:
181
+ break
182
+
183
+ async def split_shard(self, shard_to_split: RebalanceShard, shards: list[RebalanceShard]):
184
+ logger.info(
185
+ "Splitting excess of paragraphs to other shards",
186
+ extra={
187
+ "kbid": self.kbid,
188
+ "shard": shard_to_split.to_dict(),
189
+ },
190
+ )
70
191
 
71
- shard_paragraphs = await get_shards_paragraphs(kbid)
72
- total_paragraphs = sum([c for _, c in shard_paragraphs])
73
-
74
- if (total_paragraphs / len(kb_shards.shards)) > (
75
- settings.max_shard_paragraphs * 0.9 # 90% of the max
76
- ):
77
- # create new shard
78
- async with datamanagers.with_transaction() as txn:
192
+ # First off, calculate if the excess fits in the other shards or we need to add a new shard.
193
+ # Note that we don't filter out the active shard on purpose.
194
+ excess = shard_to_split.paragraphs - settings.max_shard_paragraphs
195
+ other_shards = [s for s in shards if s.id != shard_to_split.id]
196
+ other_shards_capacity = sum(
197
+ [max(0, (settings.max_shard_paragraphs - s.paragraphs)) for s in other_shards]
198
+ )
199
+ if excess > other_shards_capacity:
200
+ shards_to_add = math.ceil((excess - other_shards_capacity) / settings.max_shard_paragraphs)
201
+ logger.info(
202
+ "More shards needed",
203
+ extra={
204
+ "kbid": self.kbid,
205
+ "shards_to_add": shards_to_add,
206
+ "all_shards": [s.to_dict() for s in shards],
207
+ },
208
+ )
209
+ # Add new shards where to rebalance the excess of paragraphs
210
+ async with (
211
+ locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=self.kbid)),
212
+ datamanagers.with_rw_transaction() as txn,
213
+ ):
214
+ kb_config = await datamanagers.kb.get_config(txn, kbid=self.kbid)
215
+ prewarm = kb_config is not None and kb_config.prewarm_enabled
79
216
  sm = get_shard_manager()
80
- await sm.create_shard_by_kbid(txn, kbid)
217
+ for _ in range(shards_to_add):
218
+ await sm.create_shard_by_kbid(txn, self.kbid, prewarm_enabled=prewarm)
81
219
  await txn.commit()
82
220
 
221
+ # Recalculate after having created shards, the active shard is a different one
222
+ shards = await self.get_rebalance_shards()
223
+
224
+ # Now, move resources to other shards as long as we are still over the max
225
+ for _ in range(MAX_MOVES_PER_SHARD):
226
+ shard_paragraphs = next(s.paragraphs for s in shards if s.id == shard_to_split.id)
227
+ excess = shard_paragraphs - settings.max_shard_paragraphs
228
+ if excess <= 0:
229
+ logger.info(
230
+ "Shard rebalanced successfuly",
231
+ extra={"kbid": self.kbid, "shard": shard_to_split.to_dict()},
232
+ )
233
+ break
83
234
 
84
- async def move_set_of_kb_resources(
85
- context: ApplicationContext,
86
- kbid: str,
87
- from_shard_id: str,
88
- to_shard_id: str,
89
- count: int = 20,
90
- ) -> None:
91
- async with datamanagers.with_ro_transaction() as txn:
92
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
93
- if kb_shards is None: # pragma: no cover
94
- logger.warning("No shards found for kb. This should not happen.", extra={"kbid": kbid})
95
- return
235
+ target_shard, target_capacity = get_target_shard(shards, shard_to_split, skip_active=False)
236
+ if target_shard is None:
237
+ logger.warning("No target shard found for splitting", extra={"kbid": self.kbid})
238
+ break
96
239
 
97
- logger.info(
98
- "Rebalancing kb shards",
99
- extra={"kbid": kbid, "from": from_shard_id, "to": to_shard_id, "count": count},
100
- )
240
+ moved_paragraphs = await self.move_paragraphs(
241
+ from_shard=shard_to_split,
242
+ to_shard=target_shard,
243
+ max_paragraphs=min(excess, target_capacity),
244
+ )
101
245
 
102
- from_shard = [s for s in kb_shards.shards if s.shard == from_shard_id][0]
103
- to_shard = [s for s in kb_shards.shards if s.shard == to_shard_id][0]
246
+ # Update shard paragraph counts
247
+ shard_to_split.paragraphs -= moved_paragraphs
248
+ target_shard.paragraphs += moved_paragraphs
249
+ shards.sort(key=lambda x: x.paragraphs)
104
250
 
105
- request = nodereader_pb2.SearchRequest(
106
- shard=from_shard.nidx_shard_id,
107
- paragraph=False,
108
- document=True,
109
- result_per_page=count,
110
- )
111
- request.field_filter.field.field_type = "a"
112
- request.field_filter.field.field_id = "title"
113
- search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)
251
+ await self.wait_for_indexing()
114
252
 
115
- for result in search_response.document.results:
116
- resource_id = result.uuid
117
- try:
118
- async with (
119
- datamanagers.with_transaction() as txn,
120
- locking.distributed_lock(
121
- locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=resource_id)
122
- ),
123
- ):
124
- found_shard_id = await datamanagers.resources.get_resource_shard_id(
125
- txn, kbid=kbid, rid=resource_id, for_update=True
253
+ async def merge_shard(self, shard_to_merge: RebalanceShard, shards: list[RebalanceShard]):
254
+ logger.info(
255
+ "Merging shard",
256
+ extra={
257
+ "kbid": self.kbid,
258
+ "shard": shard_to_merge.to_dict(),
259
+ },
260
+ )
261
+ empty_shard = False
262
+
263
+ for _ in range(MAX_MOVES_PER_SHARD):
264
+ resources_count = len(self.index.get(shard_to_merge.id, []))
265
+ if resources_count == 0:
266
+ logger.info(
267
+ "Shard is now empty",
268
+ extra={
269
+ "kbid": self.kbid,
270
+ "shard": shard_to_merge.to_dict(),
271
+ },
126
272
  )
127
- if found_shard_id is None:
128
- # resource deleted
129
- continue
130
- if found_shard_id != from_shard_id:
131
- # resource could have already been moved
132
- continue
273
+ empty_shard = True
274
+ break
275
+
276
+ logger.info(
277
+ "Shard not yet empty",
278
+ extra={
279
+ "kbid": self.kbid,
280
+ "shard": shard_to_merge.to_dict(),
281
+ "remaining": resources_count,
282
+ },
283
+ )
133
284
 
134
- await datamanagers.resources.set_resource_shard_id(
135
- txn, kbid=kbid, rid=resource_id, shard=to_shard_id
285
+ target_shard, target_capacity = get_target_shard(shards, shard_to_merge, skip_active=True)
286
+ if target_shard is None:
287
+ logger.warning(
288
+ "No target shard could be found for merging. Moving on",
289
+ extra={"kbid": self.kbid, "shard": shard_to_merge.to_dict()},
136
290
  )
137
- await index_resource_to_shard(context, kbid, resource_id, to_shard)
138
- await delete_resource_from_shard(context, kbid, resource_id, from_shard)
139
- await txn.commit()
291
+ break
292
+
293
+ moved_paragraphs = await self.move_paragraphs(
294
+ from_shard=shard_to_merge,
295
+ to_shard=target_shard,
296
+ max_paragraphs=target_capacity,
297
+ )
298
+
299
+ # Update shard paragraph counts
300
+ shard_to_merge.paragraphs -= moved_paragraphs
301
+ target_shard.paragraphs += moved_paragraphs
302
+ shards.sort(key=lambda x: x.paragraphs)
303
+
304
+ await self.wait_for_indexing()
305
+
306
+ if empty_shard:
307
+ # Build the index again, and make sure there is no resource assigned to this shard
308
+ await self.build_shard_resources_index()
309
+ shard_resources = self.index.get(shard_to_merge.id, set())
310
+ if len(shard_resources) > 0:
311
+ logger.error(
312
+ f"Shard expected to be empty, but it isn't. Won't be deleted.",
313
+ extra={
314
+ "kbid": self.kbid,
315
+ "shard": shard_to_merge.id,
316
+ "resources": list(shard_resources)[:30],
317
+ },
318
+ )
319
+ return
320
+
321
+ # If shard was emptied, delete it
322
+ async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=self.kbid)):
323
+ async with datamanagers.with_rw_transaction() as txn:
324
+ kb_shards = await datamanagers.cluster.get_kb_shards(
325
+ txn, kbid=self.kbid, for_update=True
326
+ )
327
+ if kb_shards is not None:
328
+ logger.info(
329
+ "Deleting empty shard",
330
+ extra={
331
+ "kbid": self.kbid,
332
+ "shard_id": shard_to_merge.id,
333
+ "nidx_shard_id": shard_to_merge.nidx_id,
334
+ },
335
+ )
336
+
337
+ # Delete shards from kb shards in maindb
338
+ to_delete, to_delete_idx = next(
339
+ (s, idx)
340
+ for idx, s in enumerate(kb_shards.shards)
341
+ if s.shard == shard_to_merge.id
342
+ )
343
+ kb_shards.shards.remove(to_delete)
344
+ if to_delete_idx <= kb_shards.actual:
345
+ # Only decrement the actual pointer if we remove before the pointer.
346
+ kb_shards.actual -= 1
347
+ assert kb_shards.actual >= 0
348
+ await datamanagers.cluster.update_kb_shards(
349
+ txn, kbid=self.kbid, shards=kb_shards
350
+ )
351
+ await txn.commit()
352
+
353
+ # Delete shard from nidx
354
+ await get_nidx_api_client().DeleteShard(
355
+ noderesources_pb2.ShardId(id=to_delete.nidx_shard_id)
356
+ )
357
+
358
+
359
+ async def get_resource_paragraphs_count(resource_id: str, nidx_shard_id: str) -> int:
360
+ # Do a search on the fields (paragraph) index and return the number of paragraphs this resource has
361
+ try:
362
+ request = nodereader_pb2.SearchRequest(
363
+ shard=nidx_shard_id,
364
+ paragraph=True,
365
+ document=False,
366
+ result_per_page=0,
367
+ field_filter=nodereader_pb2.FilterExpression(
368
+ resource=nodereader_pb2.FilterExpression.ResourceFilter(resource_id=resource_id)
369
+ ),
370
+ )
371
+ search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)
372
+ return search_response.paragraph.total
373
+ except AioRpcError as exc: # pragma: no cover
374
+ if exc.code() == StatusCode.NOT_FOUND:
375
+ logger.warning(f"Shard not found in nidx", extra={"nidx_shard_id": nidx_shard_id})
376
+ return 0
377
+ raise
378
+
379
+
380
+ def get_target_shard(
381
+ shards: list[RebalanceShard], rebalanced_shard: RebalanceShard, skip_active: bool = True
382
+ ) -> tuple[Optional[RebalanceShard], int]:
383
+ """
384
+ Return the biggest shard with capacity (< 90% of the max paragraphs per shard).
385
+ """
386
+ target_shard = next(
387
+ reversed(
388
+ [
389
+ s
390
+ for s in shards
391
+ if s.id != rebalanced_shard.id
392
+ and s.paragraphs < settings.max_shard_paragraphs * 0.9
393
+ and (not skip_active or (skip_active and not s.active))
394
+ ]
395
+ ),
396
+ None,
397
+ )
398
+ if target_shard is None: # pragma: no cover
399
+ return None, 0
400
+
401
+ # Aim to fill target shards up to 100% of max
402
+ capacity = int(max(0, settings.max_shard_paragraphs - target_shard.paragraphs))
403
+ return target_shard, capacity
404
+
405
+
406
+ async def get_shard_paragraph_count(nidx_shard_id: str) -> int:
407
+ # Do a search on the fields (paragraph) index
408
+ try:
409
+ request = nodereader_pb2.SearchRequest(
410
+ shard=nidx_shard_id,
411
+ paragraph=True,
412
+ document=False,
413
+ result_per_page=0,
414
+ )
415
+ search_response: nodereader_pb2.SearchResponse = await get_nidx_searcher_client().Search(request)
416
+ return search_response.paragraph.total
417
+ except AioRpcError as exc: # pragma: no cover
418
+ if exc.code() == StatusCode.NOT_FOUND:
419
+ logger.warning(f"Shard not found in nidx", extra={"nidx_shard_id": nidx_shard_id})
420
+ return 0
421
+ raise
422
+
423
+
424
+ async def move_resource_to_shard(
425
+ context: ApplicationContext,
426
+ kbid: str,
427
+ resource_id: str,
428
+ from_shard: writer_pb2.ShardObject,
429
+ to_shard: writer_pb2.ShardObject,
430
+ ) -> bool:
431
+ indexed_to_new = False
432
+ deleted_from_old = False
433
+ try:
434
+ async with (
435
+ datamanagers.with_transaction() as txn,
436
+ locking.distributed_lock(
437
+ locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=resource_id)
438
+ ),
439
+ ):
440
+ found_shard_id = await datamanagers.resources.get_resource_shard_id(
441
+ txn, kbid=kbid, rid=resource_id, for_update=True
442
+ )
443
+ if found_shard_id is None: # pragma: no cover
444
+ # resource deleted
445
+ return False
446
+ if found_shard_id != from_shard.shard: # pragma: no cover
447
+ # resource could have already been moved
448
+ return False
449
+
450
+ await datamanagers.resources.set_resource_shard_id(
451
+ txn, kbid=kbid, rid=resource_id, shard=to_shard.shard
452
+ )
453
+ await index_resource_to_shard(context, kbid, resource_id, to_shard)
454
+ indexed_to_new = True
455
+ await delete_resource_from_shard(context, kbid, resource_id, from_shard)
456
+ deleted_from_old = True
457
+ await txn.commit()
458
+ return True
459
+ except Exception:
460
+ logger.exception(
461
+ "Failed to move resource",
462
+ extra={"kbid": kbid, "resource_id": resource_id},
463
+ )
464
+ # XXX Not ideal failure situation here. Try reverting the whole move even though it could be redundant
465
+ try:
466
+ if indexed_to_new:
467
+ await delete_resource_from_shard(context, kbid, resource_id, to_shard)
468
+ if deleted_from_old:
469
+ await index_resource_to_shard(context, kbid, resource_id, from_shard)
140
470
  except Exception:
141
471
  logger.exception(
142
- "Failed to move resource",
472
+ "Failed to revert move resource. Hopefully you never see this message.",
143
473
  extra={"kbid": kbid, "resource_id": resource_id},
144
474
  )
145
- # XXX Not ideal failure situation here. Try reverting the whole move even though it could be redundant
146
- try:
147
- await index_resource_to_shard(context, kbid, resource_id, from_shard)
148
- await delete_resource_from_shard(context, kbid, resource_id, to_shard)
149
- except Exception:
150
- logger.exception(
151
- "Failed to revert move resource. Hopefully you never see this message.",
152
- extra={"kbid": kbid, "resource_id": resource_id},
153
- )
475
+ return False
154
476
 
155
477
 
156
- async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
157
- await maybe_add_shard(kbid)
478
+ def needs_split(shard: RebalanceShard) -> bool:
479
+ """
480
+ Return true if the shard is more than 110% of the max.
158
481
 
159
- shard_paragraphs = await get_shards_paragraphs(kbid)
160
- rebalanced_shards = set()
161
- while any(paragraphs > settings.max_shard_paragraphs for _, paragraphs in shard_paragraphs):
162
- # find the shard with the least/most paragraphs
163
- smallest_shard = shard_paragraphs[0][0]
164
- largest_shard = shard_paragraphs[-1][0]
165
- assert smallest_shard != largest_shard
482
+ Active shards are not considered for splitting: the shard creator subscriber will
483
+ eventually create a new shard, make it the active one and the previous one, if
484
+ too full, will be split.
485
+ """
486
+ return not shard.active and (shard.paragraphs > (settings.max_shard_paragraphs * 1.1))
166
487
 
167
- if smallest_shard in rebalanced_shards:
168
- # XXX This is to prevent flapping data between shards on a single pass
169
- # if we already rebalanced this shard, then we can't do anything else
170
- break
171
488
 
172
- await move_set_of_kb_resources(context, kbid, largest_shard, smallest_shard)
489
+ def needs_merge(shard: RebalanceShard, all_shards: list[RebalanceShard]) -> bool:
490
+ """
491
+ Returns true if a shard is less than 75% full and there is enough capacity on the other shards to fit it.
492
+
493
+ Active shards are not considered for merging. Shards that are more than 75% full are also skipped.
494
+ """
495
+ if shard.active:
496
+ return False
497
+ if shard.paragraphs > (settings.max_shard_paragraphs * 0.75):
498
+ return False
499
+ other_shards = [s for s in all_shards if s.id != shard.id and not s.active]
500
+ other_shards_capacity = sum(
501
+ [max(0, (settings.max_shard_paragraphs - s.paragraphs)) for s in other_shards]
502
+ )
503
+ return shard.paragraphs < other_shards_capacity
173
504
 
174
- rebalanced_shards.add(largest_shard)
175
505
 
176
- shard_paragraphs = await get_shards_paragraphs(kbid)
506
+ async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
507
+ rebalancer = Rebalancer(context, kbid)
508
+ await rebalancer.rebalance_shards()
177
509
 
178
510
 
179
511
  async def run(context: ApplicationContext) -> None:
@@ -182,8 +514,12 @@ async def run(context: ApplicationContext) -> None:
182
514
  # get all kb ids
183
515
  async with datamanagers.with_ro_transaction() as txn:
184
516
  kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
185
- # go through each kb and see if shards need to be reduced in size
517
+ # go through each kb and see if shards need to be rebalanced
186
518
  for kbid in kbids:
519
+ if not has_feature(
520
+ const.Features.REBALANCE_ENABLED, default=False, context={"kbid": kbid}
521
+ ):
522
+ continue
187
523
  async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
188
524
  await rebalance_kb(context, kbid)
189
525
  except locking.ResourceLocked as exc: