nucliadb 6.9.2.post5276__py3-none-any.whl → 6.9.3.post5290__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/common/cluster/rebalance.py +57 -25
- nucliadb/ingest/fields/conversation.py +1 -1
- nucliadb/writer/resource/field.py +4 -1
- {nucliadb-6.9.2.post5276.dist-info → nucliadb-6.9.3.post5290.dist-info}/METADATA +6 -6
- {nucliadb-6.9.2.post5276.dist-info → nucliadb-6.9.3.post5290.dist-info}/RECORD +9 -8
- {nucliadb-6.9.2.post5276.dist-info → nucliadb-6.9.3.post5290.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.2.post5276.dist-info → nucliadb-6.9.3.post5290.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.2.post5276.dist-info → nucliadb-6.9.3.post5290.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
|
|
25
|
+
# Concurrent index must be created outside of a transaction but psycopg automatically
|
|
26
|
+
# creates transactions. We temporarily disable this for building indexes.
|
|
27
|
+
await txn.connection.commit()
|
|
28
|
+
try:
|
|
29
|
+
await txn.connection.set_autocommit(True)
|
|
30
|
+
await txn.connection.execute(
|
|
31
|
+
"CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
|
|
32
|
+
)
|
|
33
|
+
finally:
|
|
34
|
+
await txn.connection.set_autocommit(False)
|
|
@@ -22,9 +22,8 @@ import dataclasses
|
|
|
22
22
|
import logging
|
|
23
23
|
import math
|
|
24
24
|
import random
|
|
25
|
-
from typing import Optional
|
|
25
|
+
from typing import Optional, cast
|
|
26
26
|
|
|
27
|
-
import aioitertools
|
|
28
27
|
from grpc import StatusCode
|
|
29
28
|
from grpc.aio import AioRpcError
|
|
30
29
|
from nidx_protos import nodereader_pb2, noderesources_pb2
|
|
@@ -32,7 +31,8 @@ from nidx_protos import nodereader_pb2, noderesources_pb2
|
|
|
32
31
|
from nucliadb.common import datamanagers, locking
|
|
33
32
|
from nucliadb.common.cluster.utils import get_shard_manager
|
|
34
33
|
from nucliadb.common.context import ApplicationContext
|
|
35
|
-
from nucliadb.common.
|
|
34
|
+
from nucliadb.common.maindb.driver import Driver
|
|
35
|
+
from nucliadb.common.maindb.pg import PGDriver
|
|
36
36
|
from nucliadb.common.nidx import get_nidx_api_client, get_nidx_searcher_client
|
|
37
37
|
from nucliadb_protos import writer_pb2
|
|
38
38
|
from nucliadb_telemetry import errors
|
|
@@ -68,7 +68,7 @@ class Rebalancer:
|
|
|
68
68
|
self.context = context
|
|
69
69
|
self.kbid = kbid
|
|
70
70
|
self.kb_shards: Optional[writer_pb2.Shards] = None
|
|
71
|
-
self.index: dict[str, set[str]] = {}
|
|
71
|
+
self.index: dict[str, int] = {}
|
|
72
72
|
|
|
73
73
|
async def get_rebalance_shards(self) -> list[RebalanceShard]:
|
|
74
74
|
"""
|
|
@@ -93,16 +93,7 @@ class Rebalancer:
|
|
|
93
93
|
)
|
|
94
94
|
|
|
95
95
|
async def build_shard_resources_index(self):
|
|
96
|
-
|
|
97
|
-
iterable = datamanagers.resources.iterate_resource_ids(kbid=self.kbid)
|
|
98
|
-
async for resources_batch in aioitertools.batched(iterable, n=200):
|
|
99
|
-
shards = await txn.batch_get(
|
|
100
|
-
keys=[KB_RESOURCE_SHARD.format(kbid=self.kbid, uuid=rid) for rid in resources_batch],
|
|
101
|
-
for_update=False,
|
|
102
|
-
)
|
|
103
|
-
for rid, shard_bytes in zip(resources_batch, shards):
|
|
104
|
-
if shard_bytes is not None:
|
|
105
|
-
self.index.setdefault(shard_bytes.decode(), set()).add(rid)
|
|
96
|
+
self.index = await build_shard_resources_index(self.context.kv_driver, self.kbid)
|
|
106
97
|
|
|
107
98
|
async def move_paragraphs(
|
|
108
99
|
self, from_shard: RebalanceShard, to_shard: RebalanceShard, max_paragraphs: int
|
|
@@ -113,13 +104,19 @@ class Rebalancer:
|
|
|
113
104
|
"""
|
|
114
105
|
moved_paragraphs = 0
|
|
115
106
|
|
|
107
|
+
resources_batch: list[str] = []
|
|
108
|
+
|
|
116
109
|
while moved_paragraphs < max_paragraphs:
|
|
110
|
+
if len(resources_batch) == 0:
|
|
111
|
+
resources_batch = await get_resources_from_shard(
|
|
112
|
+
self.context.kv_driver, self.kbid, from_shard.id, n=50
|
|
113
|
+
)
|
|
114
|
+
if len(resources_batch) == 0:
|
|
115
|
+
# No more resources to move or shard not found
|
|
116
|
+
break
|
|
117
|
+
|
|
117
118
|
# Take a random resource to move
|
|
118
|
-
try:
|
119
|
-
resource_id = random.choice(tuple(self.index[from_shard.id]))
|
|
120
|
-
except (KeyError, IndexError):
|
|
121
|
-
# No more resources in shard or shard not found
|
|
122
|
-
break
|
|
119
|
+
resource_id = random.choice(resources_batch)
|
|
123
120
|
|
|
124
121
|
assert self.kb_shards is not None
|
|
125
122
|
from_shard_obj = next(s for s in self.kb_shards.shards if s.shard == from_shard.id)
|
|
@@ -129,8 +126,9 @@ class Rebalancer:
|
|
|
129
126
|
self.context, self.kbid, resource_id, from_shard_obj, to_shard_obj
|
|
130
127
|
)
|
|
131
128
|
if moved:
|
|
132
|
-
|
|
133
|
-
self.index.
|
|
129
|
+
resources_batch.remove(resource_id)
|
|
130
|
+
self.index[from_shard.id] = self.index.get(from_shard.id, 1) - 1
|
|
131
|
+
self.index[to_shard.id] = self.index.get(to_shard.id, 0) + 1
|
|
134
132
|
moved_paragraphs += paragraphs_count
|
|
135
133
|
|
|
136
134
|
return moved_paragraphs
|
|
@@ -261,7 +259,7 @@ class Rebalancer:
|
|
|
261
259
|
empty_shard = False
|
|
262
260
|
|
|
263
261
|
for _ in range(MAX_MOVES_PER_SHARD):
|
|
264
|
-
resources_count =
|
|
262
|
+
resources_count = self.index.get(shard_to_merge.id, 0)
|
|
265
263
|
if resources_count == 0:
|
|
266
264
|
logger.info(
|
|
267
265
|
"Shard is now empty",
|
|
@@ -306,14 +304,14 @@ class Rebalancer:
|
|
|
306
304
|
if empty_shard:
|
|
307
305
|
# Build the index again, and make sure there is no resource assigned to this shard
|
|
308
306
|
await self.build_shard_resources_index()
|
|
309
|
-
shard_resources = self.index.get(shard_to_merge.id,
|
|
310
|
-
if
|
|
307
|
+
shard_resources = self.index.get(shard_to_merge.id, 0)
|
|
308
|
+
if shard_resources > 0:
|
|
311
309
|
logger.error(
|
|
312
310
|
f"Shard expected to be empty, but it isn't. Won't be deleted.",
|
|
313
311
|
extra={
|
|
314
312
|
"kbid": self.kbid,
|
|
315
313
|
"shard": shard_to_merge.id,
|
|
316
|
-
"resources":
|
|
314
|
+
"resources": shard_resources,
|
|
317
315
|
},
|
|
318
316
|
)
|
|
319
317
|
return
|
|
@@ -356,6 +354,40 @@ class Rebalancer:
|
|
|
356
354
|
)
|
|
357
355
|
|
|
358
356
|
|
|
357
|
+
async def build_shard_resources_index(driver: Driver, kbid: str) -> dict[str, int]:
|
|
358
|
+
index: dict[str, int] = {}
|
|
359
|
+
driver = cast(PGDriver, driver)
|
|
360
|
+
async with driver._get_connection() as conn:
|
|
361
|
+
cur = conn.cursor("")
|
|
362
|
+
await cur.execute(
|
|
363
|
+
"""
|
|
364
|
+
SELECT encode(value, 'escape'), COUNT(*) FROM resources WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$' AND key ~ %s GROUP BY value;
|
|
365
|
+
""",
|
|
366
|
+
(f"/kbs/{kbid}/r/[^/]*/shard$",),
|
|
367
|
+
)
|
|
368
|
+
records = await cur.fetchall()
|
|
369
|
+
shard: str
|
|
370
|
+
resources_count: int
|
|
371
|
+
for shard, resources_count in records:
|
|
372
|
+
index[shard] = resources_count
|
|
373
|
+
return index
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
async def get_resources_from_shard(driver: Driver, kbid: str, shard_id: str, n: int) -> list[str]:
|
|
377
|
+
driver = cast(PGDriver, driver)
|
|
378
|
+
async with driver._get_connection() as conn:
|
|
379
|
+
cur = conn.cursor("")
|
|
380
|
+
await cur.execute(
|
|
381
|
+
"""
|
|
382
|
+
SELECT split_part(key, '/', 5) FROM resources WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$' AND key ~ %s AND encode(value, 'escape') LIKE %s limit %s;
|
|
383
|
+
""",
|
|
384
|
+
(f"/kbs/{kbid}/r/[^/]*/shard$", shard_id, n),
|
|
385
|
+
)
|
|
386
|
+
records = await cur.fetchall()
|
|
387
|
+
rids: list[str] = [r[0] for r in records]
|
|
388
|
+
return rids
|
|
389
|
+
|
|
390
|
+
|
|
359
391
|
async def get_resource_paragraphs_count(resource_id: str, nidx_shard_id: str) -> int:
|
|
360
392
|
# Do a search on the fields (paragraph) index and return the number of paragraphs this resource has
|
|
361
393
|
try:
|
|
@@ -25,7 +25,7 @@ from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitMet
|
|
|
25
25
|
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
|
26
26
|
from nucliadb_utils.storages.storage import StorageField
|
|
27
27
|
|
|
28
|
-
MAX_CONVERSATION_MESSAGES =
|
|
28
|
+
MAX_CONVERSATION_MESSAGES = None # No limit
|
|
29
29
|
|
|
30
30
|
PAGE_SIZE = 200
|
|
31
31
|
|
|
@@ -564,7 +564,10 @@ async def _conversation_append_checks(
|
|
|
564
564
|
|
|
565
565
|
# Make sure that the max number of messages is not exceeded
|
|
566
566
|
current_message_count = (await conv.get_metadata()).total
|
|
567
|
-
if (len(input.messages) + current_message_count) > MAX_CONVERSATION_MESSAGES:
|
|
567
|
+
if (
|
|
568
|
+
MAX_CONVERSATION_MESSAGES is not None
|
|
569
|
+
and (len(input.messages) + current_message_count) > MAX_CONVERSATION_MESSAGES
|
|
570
|
+
):
|
|
568
571
|
raise HTTPException(
|
|
569
572
|
status_code=422,
|
|
570
573
|
detail=f"Conversation fields cannot have more than {MAX_CONVERSATION_MESSAGES} messages.",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nucliadb
|
|
3
|
-
Version: 6.9.2.post5276
|
|
3
|
+
Version: 6.9.3.post5290
|
|
4
4
|
Summary: NucliaDB
|
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
|
@@ -18,11 +18,11 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
18
18
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
19
|
Requires-Python: <4,>=3.10
|
|
20
20
|
Description-Content-Type: text/markdown
|
|
21
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.9.2.post5276
|
|
22
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.2.post5276
|
|
23
|
-
Requires-Dist: nucliadb-protos>=6.9.2.post5276
|
|
24
|
-
Requires-Dist: nucliadb-models>=6.9.2.post5276
|
|
25
|
-
Requires-Dist: nidx-protos>=6.9.2.post5276
|
|
21
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.9.3.post5290
|
|
22
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.3.post5290
|
|
23
|
+
Requires-Dist: nucliadb-protos>=6.9.3.post5290
|
|
24
|
+
Requires-Dist: nucliadb-models>=6.9.3.post5290
|
|
25
|
+
Requires-Dist: nidx-protos>=6.9.3.post5290
|
|
26
26
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
|
27
27
|
Requires-Dist: nuclia-models>=0.50.0
|
|
28
28
|
Requires-Dist: uvicorn[standard]
|
|
@@ -46,6 +46,7 @@ migrations/pg/0006_catalog_title_indexes.py,sha256=n2OGxwE4oeCwHAYaxBkja4t10BmwT
|
|
|
46
46
|
migrations/pg/0007_catalog_slug.py,sha256=mArzZCBO-RD5DkWxRIyDKgEzrnAcis1TOGvSNUe7Kgg,1150
|
|
47
47
|
migrations/pg/0008_catalog_facets.py,sha256=dxIUdHJHtI_Gyk2dpP7tjHEnL2iPzAufi6ajYm2FVMI,1595
|
|
48
48
|
migrations/pg/0009_extract_facets_safety.py,sha256=k9Appx7ipp3wDyLy70qgw9oLjN7N6BEadE-N5Fhan-4,1066
|
|
49
|
+
migrations/pg/0010_shards_index.py,sha256=7s7c2s5768BkOqexJ2CedzJq8SigGKC9rfOigtQOraA,1417
|
|
49
50
|
migrations/pg/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
|
50
51
|
nucliadb/__init__.py,sha256=_abCmDJ_0ku483Os4UAjPX7Nywm39cQgAV_DiyjsKeQ,891
|
|
51
52
|
nucliadb/health.py,sha256=UIxxA4oms4HIsCRZM_SZsdkIZIlgzmOxw-qSHLlWuak,3465
|
|
@@ -86,7 +87,7 @@ nucliadb/common/cluster/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIX
|
|
|
86
87
|
nucliadb/common/cluster/exceptions.py,sha256=t7v_l93t44l2tQpdQXgO_w-c4YZRcaayOz1A2i0w4RQ,1258
|
|
87
88
|
nucliadb/common/cluster/grpc_node_dummy.py,sha256=JkufazWzMA4KFEU8EBkMbiiDW4C8lLcRhiiCxP7aCQY,2949
|
|
88
89
|
nucliadb/common/cluster/manager.py,sha256=p-haaGEnCa-20t-I2XEo4QJ5ZC1QgJ6p2jzXFYVB6nQ,12346
|
|
89
|
-
nucliadb/common/cluster/rebalance.py,sha256=
|
|
90
|
+
nucliadb/common/cluster/rebalance.py,sha256=XT8APmvvjwr5pbMPqnVcFj1ZehSQwOP5US-cKkEyEag,23559
|
|
90
91
|
nucliadb/common/cluster/rollover.py,sha256=kmVCdyjJ1dilnSodHMqf0EUkTjPC5H0aA4JqW9KsEa4,27168
|
|
91
92
|
nucliadb/common/cluster/settings.py,sha256=f6Y5K0PGahkedwe5wtkWMnbqwDFJgOOwX_MOIGwH9Dg,2271
|
|
92
93
|
nucliadb/common/cluster/utils.py,sha256=E4GqidwTKczJX_lTnncBCof2fL4CFVVF1eLiz9NWjlc,5494
|
|
@@ -155,7 +156,7 @@ nucliadb/ingest/consumer/shard_creator.py,sha256=qUEpxZLE1etw1nL8L3O9HvZBx5NNql7
|
|
|
155
156
|
nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
|
|
156
157
|
nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
|
157
158
|
nucliadb/ingest/fields/base.py,sha256=Yk6b7OQ96YAAaDEyckry6-dBhzXASzuJucjTpEvFUZ4,23715
|
|
158
|
-
nucliadb/ingest/fields/conversation.py,sha256=
|
|
159
|
+
nucliadb/ingest/fields/conversation.py,sha256=QbHBtjOV_KWeJu_nrP8DTQ6uTc-yDAXliiJ1f2LienY,9570
|
|
159
160
|
nucliadb/ingest/fields/exceptions.py,sha256=sZBk21BSrXFdOdo1qUdCAyD-9YMYakSLdn4_WdIPCIQ,1217
|
|
160
161
|
nucliadb/ingest/fields/file.py,sha256=1v4jLg3balUua2VmSV8hHkAwPFShTUCOzufZvIUQcQw,4740
|
|
161
162
|
nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54Y4Ig,1547
|
|
@@ -374,7 +375,7 @@ nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXx
|
|
|
374
375
|
nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
|
|
375
376
|
nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
|
|
376
377
|
nucliadb/writer/resource/basic.py,sha256=44GK8M9EEVoAUfGiabdLrrpENqeFwNn7qwxF2AHhQGg,10504
|
|
377
|
-
nucliadb/writer/resource/field.py,sha256=
|
|
378
|
+
nucliadb/writer/resource/field.py,sha256=FF1cDuvDPZN6TrwiMQiyGqxNU4bGxXYpVcHsM-BNpKs,22946
|
|
378
379
|
nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
|
|
379
380
|
nucliadb/writer/tus/__init__.py,sha256=Kera0BtxoDX0ngPftXiMjNgjrhtQ3l2XFc5nJqSBOJY,5498
|
|
380
381
|
nucliadb/writer/tus/azure.py,sha256=yxoRi4PhGDikTqVK3PiuVyguy8H9DOS66JpZCY4hpUY,4177
|
|
@@ -385,8 +386,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
|
|
|
385
386
|
nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
|
|
386
387
|
nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
|
|
387
388
|
nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
|
|
388
|
-
nucliadb-6.9.
|
|
389
|
-
nucliadb-6.9.2.post5276.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
390
|
-
nucliadb-6.9.2.post5276.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
|
391
|
-
nucliadb-6.9.2.post5276.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
|
392
|
-
nucliadb-6.9.2.post5276.dist-info/RECORD,,
|
|
389
|
+
nucliadb-6.9.3.post5290.dist-info/METADATA,sha256=4vQ4BF0XZCMBSLDcbFox0HLi4sYZxqM3sOzsdM1xH5w,4118
|
|
390
|
+
nucliadb-6.9.3.post5290.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
391
|
+
nucliadb-6.9.3.post5290.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
|
|
392
|
+
nucliadb-6.9.3.post5290.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
|
|
393
|
+
nucliadb-6.9.3.post5290.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|