nucliadb 6.9.1.post5180__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/filter_expression.py +15 -32
- nucliadb/ingest/consumer/service.py +1 -2
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/fields/base.py +0 -17
- nucliadb/ingest/orm/knowledgebox.py +78 -29
- nucliadb/ingest/orm/processor/processor.py +21 -16
- nucliadb/ingest/service/writer.py +12 -5
- nucliadb/migrator/datamanager.py +1 -7
- nucliadb/purge/__init__.py +2 -7
- nucliadb/reader/api/v1/learning_config.py +21 -0
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/search/chat/ask.py +0 -1
- nucliadb/search/search/chat/prompt.py +45 -13
- nucliadb/search/search/chat/query.py +0 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/models.py +0 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +2 -2
- nucliadb/search/search/query_parser/parsers/find.py +0 -8
- nucliadb/search/search/query_parser/parsers/search.py +0 -8
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/writer/api/v1/knowledgebox.py +15 -22
- {nucliadb-6.9.1.post5180.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +8 -9
- {nucliadb-6.9.1.post5180.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +35 -34
- {nucliadb-6.9.1.post5180.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5180.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5180.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
|
@@ -34,8 +34,10 @@ from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
|
|
34
34
|
from nucliadb.common.external_index_providers.manager import (
|
|
35
35
|
get_external_index_manager,
|
|
36
36
|
)
|
|
37
|
+
from nucliadb.common.maindb.utils import get_driver
|
|
37
38
|
from nucliadb.common.nidx import get_nidx_api_client
|
|
38
39
|
from nucliadb.common.vector_index_config import nucliadb_index_config_to_nidx
|
|
40
|
+
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
|
39
41
|
from nucliadb.migrator.settings import settings
|
|
40
42
|
from nucliadb_protos import utils_pb2, writer_pb2
|
|
41
43
|
from nucliadb_telemetry import errors
|
|
@@ -45,6 +47,7 @@ from .utils import (
|
|
|
45
47
|
get_resource,
|
|
46
48
|
get_rollover_resource_index_message,
|
|
47
49
|
index_resource_to_shard,
|
|
50
|
+
wait_for_nidx,
|
|
48
51
|
)
|
|
49
52
|
|
|
50
53
|
logger = logging.getLogger(__name__)
|
|
@@ -254,6 +257,7 @@ async def index_to_rollover_index(
|
|
|
254
257
|
for rid in resource_ids
|
|
255
258
|
]
|
|
256
259
|
await asyncio.gather(*batch)
|
|
260
|
+
await wait_for_indexing_to_catch_up(app_context)
|
|
257
261
|
|
|
258
262
|
async with datamanagers.with_transaction() as txn:
|
|
259
263
|
state.resources_indexed = True
|
|
@@ -262,6 +266,22 @@ async def index_to_rollover_index(
|
|
|
262
266
|
await txn.commit()
|
|
263
267
|
|
|
264
268
|
|
|
269
|
+
async def wait_for_indexing_to_catch_up(app_context: ApplicationContext) -> None:
    """Throttle the rollover until nidx has fewer than 1000 pending messages.

    If reading ``app_context.nats_manager`` raises ``AssertionError`` a
    warning is logged and the function returns without waiting.

    Otherwise ``wait_for_nidx`` is called with a 60 second window. Each time
    that window times out a warning is logged, the coroutine sleeps 30
    seconds and tries again. There is no upper bound on the number of
    retries.
    """
    try:
        # Bind the manager once: the attribute access is what may raise.
        nats_manager = app_context.nats_manager
    except AssertionError:
        logger.warning("Nats manager not initialized. Cannot wait for indexing to catch up")
        return

    max_pending = 1000
    while True:
        try:
            await wait_for_nidx(nats_manager, max_wait_seconds=60, max_pending=max_pending)
            return
        except asyncio.TimeoutError:
            logger.warning(f"Nidx is behind more than {max_pending} messages. Throttling rollover.")
            await asyncio.sleep(30)
|
|
283
|
+
|
|
284
|
+
|
|
265
285
|
async def _index_resource_to_rollover_index(
|
|
266
286
|
app_context: ApplicationContext,
|
|
267
287
|
rollover_shards: writer_pb2.Shards,
|
|
@@ -415,6 +435,15 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:
|
|
|
415
435
|
|
|
416
436
|
await txn.commit()
|
|
417
437
|
|
|
438
|
+
# For KBs with pre-warm enabled, we must configure the new shards. There may
|
|
439
|
+
# be some small delay between this call and the shards being actually
|
|
440
|
+
# prewarmed, but rollovers are quite unusual and we prefer this rather than
|
|
441
|
+
# prewarming old and new shards at the same time
|
|
442
|
+
kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
|
|
443
|
+
if kb_config is not None and kb_config.prewarm_enabled:
|
|
444
|
+
driver = get_driver()
|
|
445
|
+
await KnowledgeBox.configure_shards(driver, kbid, prewarm=True)
|
|
446
|
+
|
|
418
447
|
|
|
419
448
|
async def validate_indexed_data(
|
|
420
449
|
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
nucliadb/common/cluster/utils.py
CHANGED
|
@@ -32,6 +32,7 @@ from nucliadb.common.cluster.settings import settings
|
|
|
32
32
|
from nucliadb.ingest.orm import index_message
|
|
33
33
|
from nucliadb.ingest.orm.resource import Resource
|
|
34
34
|
from nucliadb_protos import writer_pb2
|
|
35
|
+
from nucliadb_utils.nats import NatsConnectionManager
|
|
35
36
|
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
|
36
37
|
|
|
37
38
|
if TYPE_CHECKING: # pragma: no cover
|
|
@@ -125,3 +126,28 @@ async def delete_resource_from_shard(
|
|
|
125
126
|
partition = partitioning.generate_partition(kbid, resource_id)
|
|
126
127
|
|
|
127
128
|
await sm.delete_resource(shard, resource_id, 0, str(partition), kbid)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def get_nats_consumer_pending_messages(
    nats_manager: NatsConnectionManager, *, stream: str, consumer: str
) -> int:
    """Return ``num_pending`` from the consumer info of ``consumer`` on ``stream``.

    The value is read through the manager's raw JetStream client
    (``nats_manager.js``).
    """
    # get raw js client
    js = nats_manager.js
    consumer_info = await js.consumer_info(stream, consumer)
    return consumer_info.num_pending
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
async def wait_for_nidx(
    nats_manager: NatsConnectionManager,
    max_pending: int,
    poll_interval_seconds: int = 5,
    max_wait_seconds: int = 60,
) -> None:
    """Wait until the "nidx" consumer has fewer than ``max_pending`` pending messages.

    The pending count of consumer "nidx" on stream "nidx" is polled every
    ``poll_interval_seconds`` seconds. The whole wait runs inside
    ``asyncio.timeout(max_wait_seconds)``, so a timeout error propagates to
    the caller when the backlog has not dropped below the threshold in time.
    """
    async with asyncio.timeout(max_wait_seconds):  # type: ignore
        while True:
            pending = await get_nats_consumer_pending_messages(
                nats_manager, stream="nidx", consumer="nidx"
            )
            if pending < max_pending:
                return
            await asyncio.sleep(poll_interval_seconds)
|
|
@@ -42,6 +42,7 @@ from typing_extensions import Concatenate, ParamSpec
|
|
|
42
42
|
|
|
43
43
|
from nucliadb.common.maindb.driver import Transaction
|
|
44
44
|
|
|
45
|
+
from . import cluster as cluster_dm
|
|
45
46
|
from . import kb as kb_dm
|
|
46
47
|
from . import labels as labels_dm
|
|
47
48
|
from . import resources as resources_dm
|
|
@@ -73,6 +74,10 @@ def rw_txn_wrap(fun: Callable[Concatenate[Transaction, P], Awaitable[T]]) -> Cal
|
|
|
73
74
|
return wrapper
|
|
74
75
|
|
|
75
76
|
|
|
77
|
+
class cluster:
    # Namespace class: exposes cluster_dm.get_kb_shards wrapped by ro_txn_wrap.
    get_kb_shards = ro_txn_wrap(cluster_dm.get_kb_shards)
|
|
79
|
+
|
|
80
|
+
|
|
76
81
|
class kb:
|
|
77
82
|
exists_kb = ro_txn_wrap(kb_dm.exists_kb)
|
|
78
83
|
get_config = ro_txn_wrap(kb_dm.get_config)
|
|
@@ -83,6 +88,7 @@ class resources:
|
|
|
83
88
|
get_resource_uuid_from_slug = ro_txn_wrap(resources_dm.get_resource_uuid_from_slug)
|
|
84
89
|
resource_exists = ro_txn_wrap(resources_dm.resource_exists)
|
|
85
90
|
slug_exists = ro_txn_wrap(resources_dm.slug_exists)
|
|
91
|
+
get_all_field_ids = ro_txn_wrap(resources_dm.get_all_field_ids)
|
|
86
92
|
|
|
87
93
|
|
|
88
94
|
class labelset:
|
|
@@ -52,37 +52,20 @@ from nucliadb_models.filters import (
|
|
|
52
52
|
)
|
|
53
53
|
|
|
54
54
|
# Filters that end up as a facet
|
|
55
|
-
FacetFilter =
|
|
56
|
-
OriginTag
|
|
57
|
-
Label
|
|
58
|
-
ResourceMimetype
|
|
59
|
-
FieldMimetype
|
|
60
|
-
Entity
|
|
61
|
-
Language
|
|
62
|
-
OriginMetadata
|
|
63
|
-
OriginPath
|
|
64
|
-
Generated
|
|
65
|
-
Kind
|
|
66
|
-
OriginCollaborator
|
|
67
|
-
OriginSource
|
|
68
|
-
Status
|
|
69
|
-
]
|
|
70
|
-
# In Python 3.9 we cannot do isinstance against an union
|
|
71
|
-
# Once we support only 3.10+, we can remove this
|
|
72
|
-
FacetFilterTypes = (
|
|
73
|
-
OriginTag,
|
|
74
|
-
Label,
|
|
75
|
-
ResourceMimetype,
|
|
76
|
-
FieldMimetype,
|
|
77
|
-
Entity,
|
|
78
|
-
Language,
|
|
79
|
-
OriginMetadata,
|
|
80
|
-
OriginPath,
|
|
81
|
-
Generated,
|
|
82
|
-
Kind,
|
|
83
|
-
OriginCollaborator,
|
|
84
|
-
OriginSource,
|
|
85
|
-
Status,
|
|
55
|
+
FacetFilter = (
|
|
56
|
+
OriginTag
|
|
57
|
+
| Label
|
|
58
|
+
| ResourceMimetype
|
|
59
|
+
| FieldMimetype
|
|
60
|
+
| Entity
|
|
61
|
+
| Language
|
|
62
|
+
| OriginMetadata
|
|
63
|
+
| OriginPath
|
|
64
|
+
| Generated
|
|
65
|
+
| Kind
|
|
66
|
+
| OriginCollaborator
|
|
67
|
+
| OriginSource
|
|
68
|
+
| Status
|
|
86
69
|
)
|
|
87
70
|
|
|
88
71
|
|
|
@@ -131,7 +114,7 @@ async def parse_expression(
|
|
|
131
114
|
f.date.since.FromDatetime(expr.since)
|
|
132
115
|
if expr.until:
|
|
133
116
|
f.date.until.FromDatetime(expr.until)
|
|
134
|
-
elif isinstance(expr,
|
|
117
|
+
elif isinstance(expr, FacetFilter):
|
|
135
118
|
f.facet.facet = facet_from_filter(expr)
|
|
136
119
|
else:
|
|
137
120
|
# This is a trick so mypy generates an error if this branch can be reached,
|
|
@@ -140,9 +140,8 @@ async def start_shard_creator() -> Callable[[], Awaitable[None]]:
|
|
|
140
140
|
driver = await setup_driver()
|
|
141
141
|
pubsub = await get_pubsub()
|
|
142
142
|
assert pubsub is not None, "Pubsub is not configured"
|
|
143
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
|
144
143
|
|
|
145
|
-
shard_creator = ShardCreatorHandler(driver=driver,
|
|
144
|
+
shard_creator = ShardCreatorHandler(driver=driver, pubsub=pubsub)
|
|
146
145
|
await shard_creator.initialize()
|
|
147
146
|
|
|
148
147
|
return shard_creator.finalize
|
|
@@ -25,14 +25,14 @@ from typing import Any
|
|
|
25
25
|
|
|
26
26
|
from nidx_protos import nodereader_pb2, noderesources_pb2
|
|
27
27
|
|
|
28
|
-
from nucliadb.common import locking
|
|
28
|
+
from nucliadb.common import datamanagers, locking
|
|
29
|
+
from nucliadb.common.cluster.settings import settings
|
|
29
30
|
from nucliadb.common.cluster.utils import get_shard_manager
|
|
30
31
|
from nucliadb.common.maindb.driver import Driver
|
|
31
32
|
from nucliadb.common.nidx import get_nidx_api_client
|
|
32
33
|
from nucliadb_protos import writer_pb2
|
|
33
34
|
from nucliadb_utils import const
|
|
34
35
|
from nucliadb_utils.cache.pubsub import PubSubDriver
|
|
35
|
-
from nucliadb_utils.storages.storage import Storage
|
|
36
36
|
|
|
37
37
|
from . import metrics
|
|
38
38
|
from .utils import DelayedTaskHandler
|
|
@@ -52,12 +52,10 @@ class ShardCreatorHandler:
|
|
|
52
52
|
self,
|
|
53
53
|
*,
|
|
54
54
|
driver: Driver,
|
|
55
|
-
storage: Storage,
|
|
56
55
|
pubsub: PubSubDriver,
|
|
57
56
|
check_delay: float = 10.0,
|
|
58
57
|
):
|
|
59
58
|
self.driver = driver
|
|
60
|
-
self.storage = storage
|
|
61
59
|
self.pubsub = pubsub
|
|
62
60
|
self.shard_manager = get_shard_manager()
|
|
63
61
|
self.task_handler = DelayedTaskHandler(check_delay)
|
|
@@ -111,4 +109,17 @@ class ShardCreatorHandler:
|
|
|
111
109
|
shard_id=noderesources_pb2.ShardId(id=current_shard.nidx_shard_id)
|
|
112
110
|
) # type: ignore
|
|
113
111
|
)
|
|
114
|
-
|
|
112
|
+
|
|
113
|
+
if not should_create_new_shard(shard.paragraphs):
|
|
114
|
+
return
|
|
115
|
+
|
|
116
|
+
logger.info({"message": "Adding shard", "kbid": kbid})
|
|
117
|
+
async with datamanagers.with_rw_transaction() as txn:
|
|
118
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
|
119
|
+
prewarm = kb_config is not None and kb_config.prewarm_enabled
|
|
120
|
+
await self.shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm)
|
|
121
|
+
await txn.commit()
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def should_create_new_shard(num_paragraphs: int) -> bool:
    """Return True when ``num_paragraphs`` is strictly greater than ``settings.max_shard_paragraphs``."""
    return num_paragraphs > settings.max_shard_paragraphs
|
nucliadb/ingest/fields/base.py
CHANGED
|
@@ -47,10 +47,8 @@ from nucliadb_protos.resources_pb2 import (
|
|
|
47
47
|
)
|
|
48
48
|
from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
|
|
49
49
|
from nucliadb_protos.writer_pb2 import Error, FieldStatus
|
|
50
|
-
from nucliadb_utils import const
|
|
51
50
|
from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
|
|
52
51
|
from nucliadb_utils.storages.storage import Storage, StorageField
|
|
53
|
-
from nucliadb_utils.utilities import has_feature
|
|
54
52
|
|
|
55
53
|
logger = logging.getLogger(__name__)
|
|
56
54
|
|
|
@@ -224,21 +222,6 @@ class Field(Generic[PbType]):
|
|
|
224
222
|
) -> None:
|
|
225
223
|
# Try delete vectors
|
|
226
224
|
sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
|
|
227
|
-
|
|
228
|
-
if has_feature(const.Features.DEBUG_MISSING_VECTORS):
|
|
229
|
-
# This is a very chatty log. It is just a temporary hint while debugging an issue.
|
|
230
|
-
logger.info(
|
|
231
|
-
"Deleting vectors from storage",
|
|
232
|
-
extra={
|
|
233
|
-
"kbid": self.kbid,
|
|
234
|
-
"rid": self.resource.uuid,
|
|
235
|
-
"field": f"{self.type}/{self.id}",
|
|
236
|
-
"vectorset": vectorset,
|
|
237
|
-
"storage_key_kind": storage_key_kind,
|
|
238
|
-
"key": sf.key,
|
|
239
|
-
"bucket": sf.bucket,
|
|
240
|
-
},
|
|
241
|
-
)
|
|
242
225
|
try:
|
|
243
226
|
await self.storage.delete_upload(sf.key, sf.bucket)
|
|
244
227
|
except KeyError:
|
|
@@ -24,7 +24,7 @@ from uuid import uuid4
|
|
|
24
24
|
|
|
25
25
|
from grpc import StatusCode
|
|
26
26
|
from grpc.aio import AioRpcError
|
|
27
|
-
from nidx_protos import noderesources_pb2
|
|
27
|
+
from nidx_protos import nidx_pb2, noderesources_pb2
|
|
28
28
|
|
|
29
29
|
from nucliadb.common import datamanagers
|
|
30
30
|
from nucliadb.common.cluster.exceptions import ShardNotFound
|
|
@@ -108,6 +108,7 @@ class KnowledgeBox:
|
|
|
108
108
|
external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
|
|
109
109
|
hidden_resources_enabled: bool = False,
|
|
110
110
|
hidden_resources_hide_on_creation: bool = False,
|
|
111
|
+
prewarm_enabled: bool = False,
|
|
111
112
|
) -> tuple[str, str]:
|
|
112
113
|
"""Creates a new knowledge box and return its id and slug."""
|
|
113
114
|
|
|
@@ -194,6 +195,7 @@ class KnowledgeBox:
|
|
|
194
195
|
migration_version=get_latest_version(),
|
|
195
196
|
hidden_resources_enabled=hidden_resources_enabled,
|
|
196
197
|
hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
|
|
198
|
+
prewarm_enabled=prewarm_enabled,
|
|
197
199
|
)
|
|
198
200
|
config.external_index_provider.CopyFrom(stored_external_index_provider)
|
|
199
201
|
await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
|
|
@@ -220,7 +222,7 @@ class KnowledgeBox:
|
|
|
220
222
|
shard_manager = get_shard_manager()
|
|
221
223
|
# XXX creating a shard is a slow IO operation that requires a write
|
|
222
224
|
# txn to be open!
|
|
223
|
-
await shard_manager.create_shard_by_kbid(txn, kbid)
|
|
225
|
+
await shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm_enabled)
|
|
224
226
|
# shards don't need a rollback as they will be eventually purged
|
|
225
227
|
|
|
226
228
|
await txn.commit()
|
|
@@ -243,39 +245,86 @@ class KnowledgeBox:
|
|
|
243
245
|
@classmethod
|
|
244
246
|
async def update(
|
|
245
247
|
cls,
|
|
246
|
-
|
|
247
|
-
|
|
248
|
+
driver: Driver,
|
|
249
|
+
kbid: str,
|
|
250
|
+
*,
|
|
248
251
|
slug: Optional[str] = None,
|
|
249
|
-
|
|
252
|
+
title: Optional[str] = None,
|
|
253
|
+
description: Optional[str] = None,
|
|
254
|
+
migration_version: Optional[int] = None,
|
|
255
|
+
external_index_provider: Optional[StoredExternalIndexProviderMetadata] = None,
|
|
256
|
+
hidden_resources_enabled: Optional[bool] = None,
|
|
257
|
+
hidden_resources_hide_on_creation: Optional[bool] = None,
|
|
258
|
+
prewarm_enabled: Optional[bool] = None,
|
|
250
259
|
) -> str:
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
else:
|
|
264
|
-
exist.slug = slug
|
|
260
|
+
async with driver.rw_transaction() as txn:
|
|
261
|
+
stored = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
|
|
262
|
+
if not stored:
|
|
263
|
+
raise datamanagers.exceptions.KnowledgeBoxNotFound()
|
|
264
|
+
|
|
265
|
+
if slug:
|
|
266
|
+
await txn.delete(datamanagers.kb.KB_SLUGS.format(slug=stored.slug))
|
|
267
|
+
await txn.set(
|
|
268
|
+
datamanagers.kb.KB_SLUGS.format(slug=slug),
|
|
269
|
+
kbid.encode(),
|
|
270
|
+
)
|
|
271
|
+
stored.slug = slug
|
|
265
272
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
273
|
+
if title is not None:
|
|
274
|
+
stored.title = title
|
|
275
|
+
if description is not None:
|
|
276
|
+
stored.description = description
|
|
270
277
|
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
278
|
+
if migration_version is not None:
|
|
279
|
+
stored.migration_version = migration_version
|
|
280
|
+
|
|
281
|
+
if external_index_provider is not None:
|
|
282
|
+
stored.external_index_provider.MergeFrom(external_index_provider)
|
|
275
283
|
|
|
276
|
-
|
|
284
|
+
if hidden_resources_enabled is not None:
|
|
285
|
+
stored.hidden_resources_enabled = hidden_resources_enabled
|
|
286
|
+
if hidden_resources_hide_on_creation is not None:
|
|
287
|
+
stored.hidden_resources_hide_on_creation = hidden_resources_hide_on_creation
|
|
288
|
+
|
|
289
|
+
update_nidx_prewarm = None
|
|
290
|
+
if prewarm_enabled is not None:
|
|
291
|
+
if stored.prewarm_enabled != prewarm_enabled:
|
|
292
|
+
update_nidx_prewarm = prewarm_enabled
|
|
293
|
+
stored.prewarm_enabled = prewarm_enabled
|
|
294
|
+
|
|
295
|
+
if stored.hidden_resources_hide_on_creation and not stored.hidden_resources_enabled:
|
|
296
|
+
raise KnowledgeBoxCreationError(
|
|
297
|
+
"Cannot hide new resources if the hidden resources feature is disabled"
|
|
298
|
+
)
|
|
277
299
|
|
|
278
|
-
|
|
300
|
+
await datamanagers.kb.set_config(txn, kbid=kbid, config=stored)
|
|
301
|
+
|
|
302
|
+
await txn.commit()
|
|
303
|
+
|
|
304
|
+
if update_nidx_prewarm is not None:
|
|
305
|
+
await cls.configure_shards(driver, kbid, prewarm=update_nidx_prewarm)
|
|
306
|
+
|
|
307
|
+
return kbid
|
|
308
|
+
|
|
309
|
+
@classmethod
async def configure_shards(cls, driver: Driver, kbid: str, *, prewarm: bool) -> None:
    """Send a ``prewarm_enabled`` shard configuration to nidx for every shard of a KB.

    Shards are looked up with ``datamanagers.atomic.cluster.get_kb_shards``.
    If none are stored a warning is logged and nothing else happens. The
    ``ConfigureShards`` call is skipped when no nidx API client is available
    or the KB has no shards.

    ``driver`` is accepted but not used by this method body.
    """
    shards_obj = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid)
    if shards_obj is None:
        logger.warning("Shards not found for KB while updating pre-warm flag", extra={"kbid": kbid})
        return

    nidx_shard_ids = [shard.nidx_shard_id for shard in shards_obj.shards]

    nidx_api = get_nidx_api_client()
    if nidx_api is not None and nidx_shard_ids:
        configs = [
            nidx_pb2.ShardConfig(
                shard_id=shard_id,
                prewarm_enabled=prewarm,
            )
            for shard_id in nidx_shard_ids
        ]
        await nidx_api.ConfigureShards(nidx_pb2.ShardsConfig(configs=configs))
|
|
279
328
|
|
|
280
329
|
@classmethod
|
|
281
330
|
async def delete(cls, driver: Driver, kbid: str):
|
|
@@ -446,22 +446,27 @@ class Processor:
|
|
|
446
446
|
# a resource was move to another shard while it was being indexed
|
|
447
447
|
shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)
|
|
448
448
|
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
449
|
+
shard = None
|
|
450
|
+
if shard_id is not None:
|
|
451
|
+
# Resource already has a shard assigned
|
|
452
|
+
shard = await kb.get_resource_shard(shard_id)
|
|
453
|
+
if shard is None:
|
|
454
|
+
raise AttributeError("Shard not available")
|
|
455
|
+
else:
|
|
456
|
+
# It's a new resource, get KB's current active shard to place new resource on
|
|
457
|
+
shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
|
|
458
|
+
if shard is None:
|
|
459
|
+
# No current shard available, create a new one
|
|
460
|
+
async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
|
|
461
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
|
462
|
+
prewarm = kb_config is not None and kb_config.prewarm_enabled
|
|
463
|
+
shard = await self.index_node_shard_manager.create_shard_by_kbid(
|
|
464
|
+
txn, kbid, prewarm_enabled=prewarm
|
|
465
|
+
)
|
|
466
|
+
await datamanagers.resources.set_resource_shard_id(
|
|
467
|
+
txn, kbid=kbid, rid=uuid, shard=shard.shard
|
|
468
|
+
)
|
|
469
|
+
return shard
|
|
465
470
|
|
|
466
471
|
@processor_observer.wrap({"type": "index_resource"})
|
|
467
472
|
async def index_resource(
|
|
@@ -126,6 +126,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
|
126
126
|
external_index_provider=request.external_index_provider,
|
|
127
127
|
hidden_resources_enabled=request.hidden_resources_enabled,
|
|
128
128
|
hidden_resources_hide_on_creation=request.hidden_resources_hide_on_creation,
|
|
129
|
+
prewarm_enabled=request.prewarm_enabled,
|
|
129
130
|
)
|
|
130
131
|
|
|
131
132
|
except KnowledgeBoxConflict:
|
|
@@ -167,11 +168,17 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
|
167
168
|
)
|
|
168
169
|
|
|
169
170
|
try:
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
171
|
+
kbid = await KnowledgeBoxORM.update(
|
|
172
|
+
self.driver,
|
|
173
|
+
kbid=request.uuid,
|
|
174
|
+
slug=request.slug,
|
|
175
|
+
title=request.config.title or None,
|
|
176
|
+
description=request.config.description or None,
|
|
177
|
+
external_index_provider=request.config.external_index_provider or None,
|
|
178
|
+
hidden_resources_enabled=request.config.hidden_resources_enabled,
|
|
179
|
+
hidden_resources_hide_on_creation=request.config.hidden_resources_hide_on_creation,
|
|
180
|
+
prewarm_enabled=request.config.prewarm_enabled,
|
|
181
|
+
)
|
|
175
182
|
except KnowledgeBoxNotFound:
|
|
176
183
|
return UpdateKnowledgeBoxResponse(status=KnowledgeBoxResponseStatus.NOTFOUND)
|
|
177
184
|
except Exception:
|
nucliadb/migrator/datamanager.py
CHANGED
|
@@ -77,13 +77,7 @@ class MigrationsDataManager:
|
|
|
77
77
|
return KnowledgeBoxInfo(current_version=kb_config.migration_version)
|
|
78
78
|
|
|
79
79
|
async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
|
|
80
|
-
|
|
81
|
-
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
|
|
82
|
-
if kb_config is None:
|
|
83
|
-
raise Exception(f"KB {kbid} does not exist")
|
|
84
|
-
kb_config.migration_version = current_version
|
|
85
|
-
await KnowledgeBoxORM.update(txn, kbid, config=kb_config)
|
|
86
|
-
await txn.commit()
|
|
80
|
+
await KnowledgeBoxORM.update(self.driver, kbid, migration_version=current_version)
|
|
87
81
|
|
|
88
82
|
async def get_global_info(self) -> GlobalInfo:
|
|
89
83
|
async with self.driver.ro_transaction() as txn:
|
nucliadb/purge/__init__.py
CHANGED
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
import importlib.metadata
|
|
22
|
+
from itertools import batched # type: ignore
|
|
22
23
|
from typing import AsyncGenerator
|
|
23
24
|
|
|
24
25
|
from nucliadb.common import datamanagers
|
|
@@ -233,7 +234,7 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
|
|
|
233
234
|
fields.extend((await resource.get_fields(force=True)).values())
|
|
234
235
|
|
|
235
236
|
logger.info(f"Purging {len(fields)} fields for vectorset {vectorset}", extra={"kbid": kbid})
|
|
236
|
-
for fields_batch in
|
|
237
|
+
for fields_batch in batched(fields, n=20):
|
|
237
238
|
tasks = []
|
|
238
239
|
for field in fields_batch:
|
|
239
240
|
if purge_payload.storage_key_kind == VectorSetConfig.StorageKeyKind.UNSET:
|
|
@@ -317,9 +318,3 @@ def run() -> int: # pragma: no cover
|
|
|
317
318
|
setup_logging()
|
|
318
319
|
errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
|
319
320
|
return asyncio.run(main())
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
def batchify(iterable, n=1):
|
|
323
|
-
"""Yield successive n-sized chunks from iterable."""
|
|
324
|
-
for i in range(0, len(iterable), n):
|
|
325
|
-
yield iterable[i : i + n]
|
|
@@ -128,6 +128,27 @@ async def get_schema_for_configuration_updates(
|
|
|
128
128
|
)
|
|
129
129
|
|
|
130
130
|
|
|
131
|
+
@api.get(
    path=f"/{KB_PREFIX}/{{kbid}}/generative_providers",
    status_code=200,
    summary="Available models for a knowledge box",
    description="Get all available models for a knowledge box grouped by provider",
    response_model=None,
    tags=["Models"],
)
@requires_one([NucliaDBRoles.READER, NucliaDBRoles.MANAGER])
@version(1)
async def get_models_group_by_providers(
    request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
):
    # Forward the request through learning_config_proxy as a GET on
    # /generative_providers/{kbid}, passing the account header as "account-id".
    return await learning_config_proxy(
        request,
        "GET",
        f"/generative_providers/{kbid}",
        headers={"account-id": x_nucliadb_account},
    )
|
|
150
|
+
|
|
151
|
+
|
|
131
152
|
@api.get(
|
|
132
153
|
path=f"/nua/schema",
|
|
133
154
|
status_code=200,
|
nucliadb/search/api/v1/find.py
CHANGED
|
@@ -46,7 +46,6 @@ from nucliadb_models.search import (
|
|
|
46
46
|
KnowledgeboxFindResults,
|
|
47
47
|
NucliaDBClientType,
|
|
48
48
|
RankFusionName,
|
|
49
|
-
Reranker,
|
|
50
49
|
RerankerName,
|
|
51
50
|
ResourceProperties,
|
|
52
51
|
SearchParamDefaults,
|
|
@@ -127,11 +126,10 @@ async def find_knowledgebox(
|
|
|
127
126
|
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
|
128
127
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
|
129
128
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
|
130
|
-
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
|
131
129
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
|
132
130
|
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
|
133
131
|
rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
|
|
134
|
-
reranker:
|
|
132
|
+
reranker: RerankerName = fastapi_query(SearchParamDefaults.reranker),
|
|
135
133
|
search_configuration: Optional[str] = Query(
|
|
136
134
|
default=None,
|
|
137
135
|
description="Load find parameters from this configuration. Parameters in the request override parameters from the configuration.",
|
|
@@ -166,7 +164,6 @@ async def find_knowledgebox(
|
|
|
166
164
|
extracted=extracted,
|
|
167
165
|
with_duplicates=with_duplicates,
|
|
168
166
|
with_synonyms=with_synonyms,
|
|
169
|
-
autofilter=autofilter,
|
|
170
167
|
security=security,
|
|
171
168
|
show_hidden=show_hidden,
|
|
172
169
|
rank_fusion=rank_fusion,
|
|
@@ -28,7 +28,8 @@ from nucliadb.search.api.v1.resource.utils import get_resource_uuid_by_slug
|
|
|
28
28
|
from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
|
|
29
29
|
from nucliadb_models.resource import NucliaDBRoles
|
|
30
30
|
from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
|
|
31
|
-
from
|
|
31
|
+
from nucliadb_models.security import RequestSecurity
|
|
32
|
+
from nucliadb_utils.authentication import NucliaUser, requires
|
|
32
33
|
|
|
33
34
|
from ..ask import create_ask_response
|
|
34
35
|
|
|
@@ -58,6 +59,15 @@ async def resource_ask_endpoint_by_uuid(
|
|
|
58
59
|
"This is slower and requires waiting for entire answer to be ready.",
|
|
59
60
|
),
|
|
60
61
|
) -> Union[StreamingResponse, HTTPClientError, Response]:
|
|
62
|
+
current_user: NucliaUser = request.user
|
|
63
|
+
# If present, security groups from AuthorizationBackend overrides any
|
|
64
|
+
# security group of the payload
|
|
65
|
+
if current_user.security_groups:
|
|
66
|
+
if item.security is None:
|
|
67
|
+
item.security = RequestSecurity(groups=current_user.security_groups)
|
|
68
|
+
else:
|
|
69
|
+
item.security.groups = current_user.security_groups
|
|
70
|
+
|
|
61
71
|
return await create_ask_response(
|
|
62
72
|
kbid=kbid,
|
|
63
73
|
ask_request=item,
|
|
@@ -98,6 +108,16 @@ async def resource_ask_endpoint_by_slug(
|
|
|
98
108
|
resource_id = await get_resource_uuid_by_slug(kbid, slug)
|
|
99
109
|
if resource_id is None:
|
|
100
110
|
return HTTPClientError(status_code=404, detail="Resource not found")
|
|
111
|
+
|
|
112
|
+
current_user: NucliaUser = request.user
|
|
113
|
+
# If present, security groups from AuthorizationBackend overrides any
|
|
114
|
+
# security group of the payload
|
|
115
|
+
if current_user.security_groups:
|
|
116
|
+
if item.security is None:
|
|
117
|
+
item.security = RequestSecurity(groups=current_user.security_groups)
|
|
118
|
+
else:
|
|
119
|
+
item.security.groups = current_user.security_groups
|
|
120
|
+
|
|
101
121
|
return await create_ask_response(
|
|
102
122
|
kbid=kbid,
|
|
103
123
|
ask_request=item,
|