nucliadb 6.9.2.post5276__py3-none-any.whl → 6.9.3.post5290__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

@@ -0,0 +1,34 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
+ async def migrate(txn: PGTransaction) -> None:
25
+ # Concurrent index must be created outside of a transaction but psycopg automatically
26
+ # creates transactions. We temporarily disable this for building indexes.
27
+ await txn.connection.commit()
28
+ try:
29
+ await txn.connection.set_autocommit(True)
30
+ await txn.connection.execute(
31
+ "CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
32
+ )
33
+ finally:
34
+ await txn.connection.set_autocommit(False)
@@ -22,9 +22,8 @@ import dataclasses
22
22
  import logging
23
23
  import math
24
24
  import random
25
- from typing import Optional
25
+ from typing import Optional, cast
26
26
 
27
- import aioitertools
28
27
  from grpc import StatusCode
29
28
  from grpc.aio import AioRpcError
30
29
  from nidx_protos import nodereader_pb2, noderesources_pb2
@@ -32,7 +31,8 @@ from nidx_protos import nodereader_pb2, noderesources_pb2
32
31
  from nucliadb.common import datamanagers, locking
33
32
  from nucliadb.common.cluster.utils import get_shard_manager
34
33
  from nucliadb.common.context import ApplicationContext
35
- from nucliadb.common.datamanagers.resources import KB_RESOURCE_SHARD
34
+ from nucliadb.common.maindb.driver import Driver
35
+ from nucliadb.common.maindb.pg import PGDriver
36
36
  from nucliadb.common.nidx import get_nidx_api_client, get_nidx_searcher_client
37
37
  from nucliadb_protos import writer_pb2
38
38
  from nucliadb_telemetry import errors
@@ -68,7 +68,7 @@ class Rebalancer:
68
68
  self.context = context
69
69
  self.kbid = kbid
70
70
  self.kb_shards: Optional[writer_pb2.Shards] = None
71
- self.index: dict[str, set[str]] = {}
71
+ self.index: dict[str, int] = {}
72
72
 
73
73
  async def get_rebalance_shards(self) -> list[RebalanceShard]:
74
74
  """
@@ -93,16 +93,7 @@ class Rebalancer:
93
93
  )
94
94
 
95
95
  async def build_shard_resources_index(self):
96
- async with datamanagers.with_ro_transaction() as txn:
97
- iterable = datamanagers.resources.iterate_resource_ids(kbid=self.kbid)
98
- async for resources_batch in aioitertools.batched(iterable, n=200):
99
- shards = await txn.batch_get(
100
- keys=[KB_RESOURCE_SHARD.format(kbid=self.kbid, uuid=rid) for rid in resources_batch],
101
- for_update=False,
102
- )
103
- for rid, shard_bytes in zip(resources_batch, shards):
104
- if shard_bytes is not None:
105
- self.index.setdefault(shard_bytes.decode(), set()).add(rid)
96
+ self.index = await build_shard_resources_index(self.context.kv_driver, self.kbid)
106
97
 
107
98
  async def move_paragraphs(
108
99
  self, from_shard: RebalanceShard, to_shard: RebalanceShard, max_paragraphs: int
@@ -113,13 +104,19 @@ class Rebalancer:
113
104
  """
114
105
  moved_paragraphs = 0
115
106
 
107
+ resources_batch: list[str] = []
108
+
116
109
  while moved_paragraphs < max_paragraphs:
110
+ if len(resources_batch) == 0:
111
+ resources_batch = await get_resources_from_shard(
112
+ self.context.kv_driver, self.kbid, from_shard.id, n=50
113
+ )
114
+ if len(resources_batch) == 0:
115
+ # No more resources to move or shard not found
116
+ break
117
+
117
118
  # Take a random resource to move
118
- try:
119
- resource_id = random.choice(tuple(self.index[from_shard.id]))
120
- except (KeyError, IndexError):
121
- # No more resources in shard or shard not found
122
- break
119
+ resource_id = random.choice(resources_batch)
123
120
 
124
121
  assert self.kb_shards is not None
125
122
  from_shard_obj = next(s for s in self.kb_shards.shards if s.shard == from_shard.id)
@@ -129,8 +126,9 @@ class Rebalancer:
129
126
  self.context, self.kbid, resource_id, from_shard_obj, to_shard_obj
130
127
  )
131
128
  if moved:
132
- self.index[from_shard.id].remove(resource_id)
133
- self.index.setdefault(to_shard.id, set()).add(resource_id)
129
+ resources_batch.remove(resource_id)
130
+ self.index[from_shard.id] = self.index.get(from_shard.id, 1) - 1
131
+ self.index[to_shard.id] = self.index.get(to_shard.id, 0) + 1
134
132
  moved_paragraphs += paragraphs_count
135
133
 
136
134
  return moved_paragraphs
@@ -261,7 +259,7 @@ class Rebalancer:
261
259
  empty_shard = False
262
260
 
263
261
  for _ in range(MAX_MOVES_PER_SHARD):
264
- resources_count = len(self.index.get(shard_to_merge.id, []))
262
+ resources_count = self.index.get(shard_to_merge.id, 0)
265
263
  if resources_count == 0:
266
264
  logger.info(
267
265
  "Shard is now empty",
@@ -306,14 +304,14 @@ class Rebalancer:
306
304
  if empty_shard:
307
305
  # Build the index again, and make sure there is no resource assigned to this shard
308
306
  await self.build_shard_resources_index()
309
- shard_resources = self.index.get(shard_to_merge.id, set())
310
- if len(shard_resources) > 0:
307
+ shard_resources = self.index.get(shard_to_merge.id, 0)
308
+ if shard_resources > 0:
311
309
  logger.error(
312
310
  f"Shard expected to be empty, but it isn't. Won't be deleted.",
313
311
  extra={
314
312
  "kbid": self.kbid,
315
313
  "shard": shard_to_merge.id,
316
- "resources": list(shard_resources)[:30],
314
+ "resources": shard_resources,
317
315
  },
318
316
  )
319
317
  return
@@ -356,6 +354,40 @@ class Rebalancer:
356
354
  )
357
355
 
358
356
 
357
+ async def build_shard_resources_index(driver: Driver, kbid: str) -> dict[str, int]:
358
+ index: dict[str, int] = {}
359
+ driver = cast(PGDriver, driver)
360
+ async with driver._get_connection() as conn:
361
+ cur = conn.cursor("")
362
+ await cur.execute(
363
+ """
364
+ SELECT encode(value, 'escape'), COUNT(*) FROM resources WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$' AND key ~ %s GROUP BY value;
365
+ """,
366
+ (f"/kbs/{kbid}/r/[^/]*/shard$",),
367
+ )
368
+ records = await cur.fetchall()
369
+ shard: str
370
+ resources_count: int
371
+ for shard, resources_count in records:
372
+ index[shard] = resources_count
373
+ return index
374
+
375
+
376
+ async def get_resources_from_shard(driver: Driver, kbid: str, shard_id: str, n: int) -> list[str]:
377
+ driver = cast(PGDriver, driver)
378
+ async with driver._get_connection() as conn:
379
+ cur = conn.cursor("")
380
+ await cur.execute(
381
+ """
382
+ SELECT split_part(key, '/', 5) FROM resources WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$' AND key ~ %s AND encode(value, 'escape') LIKE %s limit %s;
383
+ """,
384
+ (f"/kbs/{kbid}/r/[^/]*/shard$", shard_id, n),
385
+ )
386
+ records = await cur.fetchall()
387
+ rids: list[str] = [r[0] for r in records]
388
+ return rids
389
+
390
+
359
391
  async def get_resource_paragraphs_count(resource_id: str, nidx_shard_id: str) -> int:
360
392
  # Do a search on the fields (paragraph) index and return the number of paragraphs this resource has
361
393
  try:
@@ -25,7 +25,7 @@ from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitMet
25
25
  from nucliadb_protos.resources_pb2 import Conversation as PBConversation
26
26
  from nucliadb_utils.storages.storage import StorageField
27
27
 
28
- MAX_CONVERSATION_MESSAGES = 50 * 1024
28
+ MAX_CONVERSATION_MESSAGES = None # No limit
29
29
 
30
30
  PAGE_SIZE = 200
31
31
 
@@ -564,7 +564,10 @@ async def _conversation_append_checks(
564
564
 
565
565
  # Make sure that the max number of messages is not exceeded
566
566
  current_message_count = (await conv.get_metadata()).total
567
- if len(input.messages) + current_message_count > MAX_CONVERSATION_MESSAGES:
567
+ if (
568
+ MAX_CONVERSATION_MESSAGES is not None
569
+ and (len(input.messages) + current_message_count) > MAX_CONVERSATION_MESSAGES
570
+ ):
568
571
  raise HTTPException(
569
572
  status_code=422,
570
573
  detail=f"Conversation fields cannot have more than {MAX_CONVERSATION_MESSAGES} messages.",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.9.2.post5276
3
+ Version: 6.9.3.post5290
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -18,11 +18,11 @@ Classifier: Programming Language :: Python :: 3.12
18
18
  Classifier: Programming Language :: Python :: 3 :: Only
19
19
  Requires-Python: <4,>=3.10
20
20
  Description-Content-Type: text/markdown
21
- Requires-Dist: nucliadb-telemetry[all]>=6.9.2.post5276
22
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.2.post5276
23
- Requires-Dist: nucliadb-protos>=6.9.2.post5276
24
- Requires-Dist: nucliadb-models>=6.9.2.post5276
25
- Requires-Dist: nidx-protos>=6.9.2.post5276
21
+ Requires-Dist: nucliadb-telemetry[all]>=6.9.3.post5290
22
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.3.post5290
23
+ Requires-Dist: nucliadb-protos>=6.9.3.post5290
24
+ Requires-Dist: nucliadb-models>=6.9.3.post5290
25
+ Requires-Dist: nidx-protos>=6.9.3.post5290
26
26
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
27
27
  Requires-Dist: nuclia-models>=0.50.0
28
28
  Requires-Dist: uvicorn[standard]
@@ -46,6 +46,7 @@ migrations/pg/0006_catalog_title_indexes.py,sha256=n2OGxwE4oeCwHAYaxBkja4t10BmwT
46
46
  migrations/pg/0007_catalog_slug.py,sha256=mArzZCBO-RD5DkWxRIyDKgEzrnAcis1TOGvSNUe7Kgg,1150
47
47
  migrations/pg/0008_catalog_facets.py,sha256=dxIUdHJHtI_Gyk2dpP7tjHEnL2iPzAufi6ajYm2FVMI,1595
48
48
  migrations/pg/0009_extract_facets_safety.py,sha256=k9Appx7ipp3wDyLy70qgw9oLjN7N6BEadE-N5Fhan-4,1066
49
+ migrations/pg/0010_shards_index.py,sha256=7s7c2s5768BkOqexJ2CedzJq8SigGKC9rfOigtQOraA,1417
49
50
  migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
50
51
  nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
51
52
  nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
@@ -86,7 +87,7 @@ nucliadb/common/cluster/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIX
86
87
  nucliadb/common/cluster/exceptions.py,sha256=t7v_l93t44l2tQpdQXgO_w-c4YZRcaayOz1A2i0w4RQ,1258
87
88
  nucliadb/common/cluster/grpc_node_dummy.py,sha256=JkufazWzMA4KFEU8EBkMbiiDW4C8lLcRhiiCxP7aCQY,2949
88
89
  nucliadb/common/cluster/manager.py,sha256=p-haaGEnCa-20t-I2XEo4QJ5ZC1QgJ6p2jzXFYVB6nQ,12346
89
- nucliadb/common/cluster/rebalance.py,sha256=0RIyrv8kkTHJFpqC5H3fPukypF0HpRNiN5f4WRiFvh8,22541
90
+ nucliadb/common/cluster/rebalance.py,sha256=XT8APmvvjwr5pbMPqnVcFj1ZehSQwOP5US-cKkEyEag,23559
90
91
  nucliadb/common/cluster/rollover.py,sha256=kmVCdyjJ1dilnSodHMqf0EUkTjPC5H0aA4JqW9KsEa4,27168
91
92
  nucliadb/common/cluster/settings.py,sha256=f6Y5K0PGahkedwe5wtkWMnbqwDFJgOOwX_MOIGwH9Dg,2271
92
93
  nucliadb/common/cluster/utils.py,sha256=E4GqidwTKczJX_lTnncBCof2fL4CFVVF1eLiz9NWjlc,5494
@@ -155,7 +156,7 @@ nucliadb/ingest/consumer/shard_creator.py,sha256=qUEpxZLE1etw1nL8L3O9HvZBx5NNql7
155
156
  nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
156
157
  nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
157
158
  nucliadb/ingest/fields/base.py,sha256=Yk6b7OQ96YAAaDEyckry6-dBhzXASzuJucjTpEvFUZ4,23715
158
- nucliadb/ingest/fields/conversation.py,sha256=KkOvNM1rZFQRg2RsfGd3Jrz3lpx0HpGpN1cmlpz_mZw,9563
159
+ nucliadb/ingest/fields/conversation.py,sha256=QbHBtjOV_KWeJu_nrP8DTQ6uTc-yDAXliiJ1f2LienY,9570
159
160
  nucliadb/ingest/fields/exceptions.py,sha256=sZBk21BSrXFdOdo1qUdCAyD-9YMYakSLdn4_WdIPCIQ,1217
160
161
  nucliadb/ingest/fields/file.py,sha256=1v4jLg3balUua2VmSV8hHkAwPFShTUCOzufZvIUQcQw,4740
161
162
  nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54Y4Ig,1547
@@ -374,7 +375,7 @@ nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXx
374
375
  nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
375
376
  nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
376
377
  nucliadb/writer/resource/basic.py,sha256=44GK8M9EEVoAUfGiabdLrrpENqeFwNn7qwxF2AHhQGg,10504
377
- nucliadb/writer/resource/field.py,sha256=kJFxOgmizGbEuTRPb5o0cNqonZ8sa9ehVlSfRk-ektY,22866
378
+ nucliadb/writer/resource/field.py,sha256=FF1cDuvDPZN6TrwiMQiyGqxNU4bGxXYpVcHsM-BNpKs,22946
378
379
  nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
379
380
  nucliadb/writer/tus/__init__.py,sha256=Kera0BtxoDX0ngPftXiMjNgjrhtQ3l2XFc5nJqSBOJY,5498
380
381
  nucliadb/writer/tus/azure.py,sha256=yxoRi4PhGDikTqVK3PiuVyguy8H9DOS66JpZCY4hpUY,4177
@@ -385,8 +386,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
385
386
  nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
386
387
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
387
388
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
388
- nucliadb-6.9.2.post5276.dist-info/METADATA,sha256=92ehiGl-d963I9_9rxJo1lyNxd7om9lcGXMtunuwhc8,4118
389
- nucliadb-6.9.2.post5276.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
390
- nucliadb-6.9.2.post5276.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
391
- nucliadb-6.9.2.post5276.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
392
- nucliadb-6.9.2.post5276.dist-info/RECORD,,
389
+ nucliadb-6.9.3.post5290.dist-info/METADATA,sha256=4vQ4BF0XZCMBSLDcbFox0HLi4sYZxqM3sOzsdM1xH5w,4118
390
+ nucliadb-6.9.3.post5290.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
391
+ nucliadb-6.9.3.post5290.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
392
+ nucliadb-6.9.3.post5290.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
393
+ nucliadb-6.9.3.post5290.dist-info/RECORD,,