nucliadb 6.2.1.post2864__py3-none-any.whl → 6.2.1.post2869__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/ingest/fields/base.py +43 -18
- nucliadb/ingest/orm/brain.py +11 -21
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/knowledgebox.py +15 -4
- nucliadb/ingest/orm/resource.py +62 -396
- nucliadb/ingest/serialize.py +13 -2
- nucliadb/ingest/service/writer.py +4 -0
- nucliadb/purge/__init__.py +32 -12
- nucliadb/train/nodes.py +13 -7
- nucliadb/train/resource.py +380 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2869.dist-info}/METADATA +5 -5
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2869.dist-info}/RECORD +17 -16
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2869.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2869.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2869.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.1.post2864.dist-info → nucliadb-6.2.1.post2869.dist-info}/zip-safe +0 -0
nucliadb/common/datamanagers/vectorsets.py
CHANGED
@@ -58,6 +58,11 @@ async def iter(
         yield config.vectorset_id, config


+async def count(txn: Transaction, *, kbid: str) -> int:
+    kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
+    return len(kb_vectorsets.vectorsets)
+
+
 async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
     """Create or update a vectorset configuration"""
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
@@ -73,16 +78,20 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
     await txn.set(key, kb_vectorsets.SerializeToString())


-async def delete(txn: Transaction, *, kbid: str, vectorset_id: str):
+async def delete(
+    txn: Transaction, *, kbid: str, vectorset_id: str
+) -> Optional[knowledgebox_pb2.VectorSetConfig]:
     kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
     index = _find_vectorset(kb_vectorsets, vectorset_id)
     if index is None:
         # already deleted
-        return
+        return None

+    deleted = kb_vectorsets.vectorsets[index]
     del kb_vectorsets.vectorsets[index]
     key = KB_VECTORSETS.format(kbid=kbid)
     await txn.set(key, kb_vectorsets.SerializeToString())
+    return deleted


 # XXX At some point in the vectorset epic, we should make this key mandatory and
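Taken together, the new `count` helper and the `VectorSetConfig` now returned by `delete` let callers guard against removing a KB's last vectorset and keep the deleted configuration around for follow-up cleanup. A minimal sketch of the intended call pattern (the `remove_vectorset` wrapper is hypothetical; `txn` is assumed to be an open `Transaction`):

    from nucliadb.common import datamanagers

    async def remove_vectorset(txn, kbid: str, vectorset_id: str):
        # Refuse to delete the only remaining vectorset (mirrors the
        # KnowledgeBox.delete_vectorset guard later in this diff)
        if await datamanagers.vectorsets.count(txn, kbid=kbid) == 1:
            raise RuntimeError("refusing to delete the last vectorset")
        deleted = await datamanagers.vectorsets.delete(txn, kbid=kbid, vectorset_id=vectorset_id)
        if deleted is not None:
            # `deleted` is the removed VectorSetConfig; its storage_key_kind
            # tells later purge code which storage layout to clean up
            print(deleted.vectorset_id, deleted.storage_key_kind)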
nucliadb/ingest/fields/base.py
CHANGED
@@ -21,12 +21,13 @@ from __future__ import annotations

 import enum
 from datetime import datetime
-from typing import Any, Generic, Optional, Type, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar

 from google.protobuf.message import DecodeError, Message

 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
     CloudFile,
     ExtractedTextWrapper,
@@ -44,9 +45,15 @@ from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
 from nucliadb_protos.writer_pb2 import Error, FieldStatus
 from nucliadb_utils.storages.storage import Storage, StorageField

+if TYPE_CHECKING:  # pragma: no cover
+    from nucliadb.ingest.orm.resource import Resource
+
+
 SUBFIELDFIELDS = ("c",)


+# NOTE extracted vectors key is no longer a static key, it is stored in each
+# vectorset
 class FieldTypes(str, enum.Enum):
     FIELD_TEXT = "extracted_text"
     FIELD_VECTORS = "extracted_vectors"
@@ -73,7 +80,7 @@ class Field(Generic[PbType]):
     def __init__(
         self,
         id: str,
-        resource: Any,
+        resource: Resource,
         pb: Optional[Any] = None,
         value: Optional[Any] = None,
     ):
@@ -88,7 +95,7 @@ class Field(Generic[PbType]):
         self.question_answers = None

         self.id: str = id
-        self.resource: Any = resource
+        self.resource = resource

         if value is not None:
             newpb = self.pbklass()
@@ -119,11 +126,20 @@ class Field(Generic[PbType]):
     def get_storage_field(self, field_type: FieldTypes) -> StorageField:
         return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)

-    def _get_extracted_vectors_storage_field(self, vectorset: Optional[str] = None) -> StorageField:
-        if vectorset:
+    def _get_extracted_vectors_storage_field(
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> StorageField:
+        if storage_key_kind == VectorSetConfig.StorageKeyKind.LEGACY:
+            key = FieldTypes.FIELD_VECTORS.value
+        elif storage_key_kind == VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX:
             key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
         else:
-            key = FieldTypes.FIELD_VECTORS.value
+            raise ValueError(
+                f"Can't do anything with UNSET or unknown vectorset storage key kind: {storage_key_kind}"
+            )
+
         return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)

     async def db_get_value(self) -> Optional[PbType]:
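The new branching means a field's extracted vectors can live under two different storage keys depending on how its vectorset was created. A rough illustration of the resulting key suffixes (the exact `FIELD_VECTORSET` template is an assumption based on the `.format(vectorset=...)` call above):

    from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig

    def vectors_key_suffix(kind: "VectorSetConfig.StorageKeyKind.ValueType", vectorset: str) -> str:
        # Mirrors the dispatch in _get_extracted_vectors_storage_field
        if kind == VectorSetConfig.StorageKeyKind.LEGACY:
            return "extracted_vectors"  # FieldTypes.FIELD_VECTORS, pre-vectorset layout
        elif kind == VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX:
            # assumed template: the vectorset id namespaces the key
            return f"{vectorset}/extracted_vectors"
        raise ValueError(f"unknown storage key kind: {kind}")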
@@ -163,7 +179,8 @@ class Field(Generic[PbType]):
             field_id=self.id,
         )
         await self.delete_extracted_text()
-        await self.delete_vectors()
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(self.resource.txn, kbid=self.kbid):
+            await self.delete_vectors(vectorset_id, vs.storage_key_kind)
         await self.delete_metadata()
         await self.delete_question_answers()
@@ -181,9 +198,13 @@ class Field(Generic[PbType]):
         except KeyError:
             pass

-    async def delete_vectors(self, vectorset: Optional[str] = None) -> None:
+    async def delete_vectors(
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> None:
         # Try delete vectors
-        sf = self._get_extracted_vectors_storage_field(vectorset)
+        sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
         try:
             await self.storage.delete_upload(sf.key, sf.bucket)
         except KeyError:
@@ -328,12 +349,17 @@ class Field(Generic[PbType]):
         self.extracted_text = payload
         return self.extracted_text

-    async def set_vectors(
-        self, payload: ExtractedVectorsWrapper, vectorset: Optional[str] = None
-    ) -> Optional[VectorObject]:
+    async def set_vectors(
+        self,
+        payload: ExtractedVectorsWrapper,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> Optional[VectorObject]:
         if self.type in SUBFIELDFIELDS:
             try:
                 actual_payload: Optional[VectorObject] = await self.get_vectors(
                     vectorset=vectorset,
+                    storage_key_kind=storage_key_kind,
                     force=True,
                 )
             except KeyError:
@@ -341,7 +367,7 @@ class Field(Generic[PbType]):
         else:
             actual_payload = None

-        sf = self._get_extracted_vectors_storage_field(vectorset)
+        sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
         vo: Optional[VectorObject] = None
         if actual_payload is None:
             # Its first extracted text
@@ -373,14 +399,13 @@ class Field(Generic[PbType]):
         return vo

     async def get_vectors(
-        self, vectorset: Optional[str] = None, force: bool = False
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+        force: bool = False,
     ) -> Optional[VectorObject]:
-        # compat with vectorsets coming from protobuffers where no value is
-        # empty string instead of None. This shouldn't be handled here but we
-        # have to make sure it gets the correct vectorset
-        vectorset = vectorset or None
         if self.extracted_vectors.get(vectorset, None) is None or force:
-            sf = self._get_extracted_vectors_storage_field(vectorset)
+            sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
             payload = await self.storage.download_pb(sf, VectorObject)
             if payload is not None:
                 self.extracted_vectors[vectorset] = payload
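With `Optional[str]` vectorsets and the `vectorset or None` compat shim gone, every read or write of extracted vectors must name a vectorset and its storage key kind, both of which come from the vectorsets datamanager. A hedged sketch of the new call shape (`read_all_vectors` is illustrative, not part of the diff):

    from nucliadb.common import datamanagers

    async def read_all_vectors(field):
        # `field.resource.txn` is the same access pattern Field.delete() uses above
        async for vectorset_id, vs in datamanagers.vectorsets.iter(
            field.resource.txn, kbid=field.kbid
        ):
            vo = await field.get_vectors(vectorset_id, vs.storage_key_kind)
            if vo is not None:
                yield vectorset_id, vo  # one VectorObject per configured vectorset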
nucliadb/ingest/orm/brain.py
CHANGED
@@ -246,9 +246,10 @@ class ResourceBrain:
         field_id: str,
         vo: utils_pb2.VectorObject,
         *,
-        vectorset: Optional[str] = None,
+        vectorset: str,
         replace_field: bool = False,
-        matryoshka_vector_dimension: Optional[int] = None,
+        # cut to specific dimension if specified
+        vector_dimension: Optional[int] = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
@@ -277,7 +278,7 @@ class ResourceBrain:
                 sentence_key,
                 vector,
                 vectorset=vectorset,
-                matryoshka_vector_dimension=matryoshka_vector_dimension,
+                vector_dimension=vector_dimension,
             )

         _field_id = ids.FieldId(
@@ -303,16 +304,12 @@ class ResourceBrain:
                 sentence_key,
                 vector,
                 vectorset=vectorset,
-                matryoshka_vector_dimension=matryoshka_vector_dimension,
+                vector_dimension=vector_dimension,
             )

         if replace_field:
             full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
-            if vectorset is None:
-                # DEPRECATED
-                self.brain.sentences_to_delete.append(full_field_id)
-            else:
-                self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
+            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)

     def _apply_field_vector(
         self,
@@ -321,22 +318,15 @@ class ResourceBrain:
         sentence_key: ids.VectorId,
         vector: utils_pb2.Vector,
         *,
-        vectorset: Optional[str] = None,
-        matryoshka_vector_dimension: Optional[int] = None,
+        vectorset: str,
+        # cut vectors if a specific dimension is specified
+        vector_dimension: Optional[int] = None,
     ):
         paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
-        if vectorset:
-            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
-        else:
-            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
+        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]

         sentence_pb.ClearField("vector")  # clear first to prevent duplicates
-
-        # cut vectors if a specific dimension is specified
-        if matryoshka_vector_dimension is not None:
-            sentence_pb.vector.extend(vector.vector[:matryoshka_vector_dimension])
-        else:
-            sentence_pb.vector.extend(vector.vector)
+        sentence_pb.vector.extend(vector.vector[:vector_dimension])

         # we only care about start/stop position of the paragraph for a given sentence here
         # the key has the sentence position
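Collapsing the old `if matryoshka_vector_dimension is not None` branch into a single `extend(vector.vector[:vector_dimension])` works because Python slices treat a `None` bound as "to the end", so behavior is unchanged in both cases:

    vec = [0.1, 0.2, 0.3, 0.4]
    assert vec[:None] == [0.1, 0.2, 0.3, 0.4]  # vector_dimension=None keeps the full vector
    assert vec[:2] == [0.1, 0.2]               # matryoshka-style cut to the first 2 dimensions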
nucliadb/ingest/orm/broker_message.py
CHANGED
@@ -20,11 +20,13 @@

 from typing import cast

+from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.orm.resource import Resource
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
     ExtractedTextWrapper,
     ExtractedVectorsWrapper,
@@ -90,7 +92,12 @@ class _BrokerMessageBuilder:
             self.bm.link_extracted_data.append(link_extracted_data)

         # Field vectors
-        await self.generate_field_vectors(type_id, field_id, field)
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(
+            resource.txn, kbid=resource.kb.kbid
+        ):
+            await self.generate_field_vectors(
+                type_id, field_id, field, vectorset_id, vs.storage_key_kind
+            )

         # Large metadata
         await self.generate_field_large_computed_metadata(type_id, field_id, field)
@@ -155,13 +162,16 @@ class _BrokerMessageBuilder:
         type_id: FieldType.ValueType,
         field_id: str,
         field: Field,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
     ):
-        vo = await field.get_vectors()
+        vo = await field.get_vectors(vectorset, storage_key_kind)
         if vo is None:
             return
         evw = ExtractedVectorsWrapper()
         evw.field.field = field_id
         evw.field.field_type = type_id
+        evw.vectorset_id = vectorset
         evw.vectors.CopyFrom(vo)
         self.bm.field_vectors.append(evw)
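Since every `ExtractedVectorsWrapper` emitted by `generate_field_vectors` is now tagged with its `vectorset_id`, one broker message can carry vectors for several vectorsets at once and consumers can demultiplex them. A small illustrative helper (not part of the diff; `bm` is assumed to be a `BrokerMessage`):

    from collections import defaultdict

    def vectors_by_vectorset(bm):
        # Group the broker message's field vectors by their vectorset id
        grouped = defaultdict(list)
        for evw in bm.field_vectors:
            grouped[evw.vectorset_id].append(evw.vectors)
        return grouped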
nucliadb/ingest/orm/knowledgebox.py
CHANGED
@@ -57,6 +57,7 @@ from nucliadb_protos.knowledgebox_pb2 import (
     KnowledgeBoxConfig,
     SemanticModelMetadata,
     StoredExternalIndexProviderMetadata,
+    VectorSetPurge,
 )
 from nucliadb_protos.resources_pb2 import Basic
 from nucliadb_utils.settings import is_onprem_nucliadb
@@ -103,9 +104,9 @@ class KnowledgeBox:
         *,
         kbid: str,
         slug: str,
+        semantic_models: dict[str, SemanticModelMetadata],
         title: str = "",
         description: str = "",
-        semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
         external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
         hidden_resources_enabled: bool = False,
         hidden_resources_hide_on_creation: bool = False,
@@ -120,7 +121,7 @@ class KnowledgeBox:
             raise KnowledgeBoxCreationError(
                 "Cannot hide new resources if the hidden resources feature is disabled"
             )
-        if semantic_models is None or len(semantic_models) == 0:
+        if len(semantic_models) == 0:
             raise KnowledgeBoxCreationError("KB must define at least one semantic model")

         rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
@@ -523,11 +524,21 @@ class KnowledgeBox:
             await shard_manager.create_vectorset(self.kbid, config)

     async def delete_vectorset(self, vectorset_id: str):
-        await datamanagers.vectorsets.delete(self.txn, kbid=self.kbid, vectorset_id=vectorset_id)
+        vectorset_count = await datamanagers.vectorsets.count(self.txn, kbid=self.kbid)
+        if vectorset_count == 1:
+            raise VectorSetConflict("Deletion of your last vectorset is not allowed")
+
+        deleted = await datamanagers.vectorsets.delete(
+            self.txn, kbid=self.kbid, vectorset_id=vectorset_id
+        )
+        if deleted is None:
+            # already deleted
+            return

         # mark vectorset for async deletion
         deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
-        await self.txn.set(deletion_mark_key, b"")
+        payload = VectorSetPurge(storage_key_kind=deleted.storage_key_kind)
+        await self.txn.set(deletion_mark_key, payload.SerializeToString())

         shard_manager = get_shard_manager()
         await shard_manager.delete_vectorset(self.kbid, vectorset_id)
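The deletion mark written by `delete_vectorset` is no longer an empty value: it now serializes a `VectorSetPurge` carrying the deleted vectorset's `storage_key_kind`, so the asynchronous purge task can tell whether it must clean up a legacy key or a vectorset-prefixed one. A sketch of the decoding side (the helper name is hypothetical; `ParseFromString` is the standard protobuf API):

    from nucliadb_protos.knowledgebox_pb2 import VectorSetPurge

    def decode_purge_mark(raw: bytes) -> VectorSetPurge:
        payload = VectorSetPurge()
        payload.ParseFromString(raw)
        # storage_key_kind distinguishes LEGACY keys (old shared layout)
        # from VECTORSET_PREFIX keys (namespaced per vectorset)
        return payload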