nucliadb 6.2.1.post2864__py3-none-any.whl → 6.2.1.post2869__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -58,6 +58,11 @@ async def iter(
58
58
  yield config.vectorset_id, config
59
59
 
60
60
 
61
+ async def count(txn: Transaction, *, kbid: str) -> int:
62
+ kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
63
+ return len(kb_vectorsets.vectorsets)
64
+
65
+
61
66
  async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
62
67
  """Create or update a vectorset configuration"""
63
68
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
@@ -73,16 +78,20 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
73
78
  await txn.set(key, kb_vectorsets.SerializeToString())
74
79
 
75
80
 
76
- async def delete(txn: Transaction, *, kbid: str, vectorset_id: str):
81
+ async def delete(
82
+ txn: Transaction, *, kbid: str, vectorset_id: str
83
+ ) -> Optional[knowledgebox_pb2.VectorSetConfig]:
77
84
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
78
85
  index = _find_vectorset(kb_vectorsets, vectorset_id)
79
86
  if index is None:
80
87
  # already deleted
81
- return
88
+ return None
82
89
 
90
+ deleted = kb_vectorsets.vectorsets[index]
83
91
  del kb_vectorsets.vectorsets[index]
84
92
  key = KB_VECTORSETS.format(kbid=kbid)
85
93
  await txn.set(key, kb_vectorsets.SerializeToString())
94
+ return deleted
86
95
 
87
96
 
88
97
  # XXX At some point in the vectorset epic, we should make this key mandatory and
@@ -21,12 +21,13 @@ from __future__ import annotations
21
21
 
22
22
  import enum
23
23
  from datetime import datetime
24
- from typing import Any, Generic, Optional, Type, TypeVar
24
+ from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
25
25
 
26
26
  from google.protobuf.message import DecodeError, Message
27
27
 
28
28
  from nucliadb.common import datamanagers
29
29
  from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
30
+ from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
30
31
  from nucliadb_protos.resources_pb2 import (
31
32
  CloudFile,
32
33
  ExtractedTextWrapper,
@@ -44,9 +45,15 @@ from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
44
45
  from nucliadb_protos.writer_pb2 import Error, FieldStatus
45
46
  from nucliadb_utils.storages.storage import Storage, StorageField
46
47
 
48
+ if TYPE_CHECKING: # pragma: no cover
49
+ from nucliadb.ingest.orm.resource import Resource
50
+
51
+
47
52
  SUBFIELDFIELDS = ("c",)
48
53
 
49
54
 
55
+ # NOTE extracted vectors key is no longer a static key, it is stored in each
56
+ # vectorset
50
57
  class FieldTypes(str, enum.Enum):
51
58
  FIELD_TEXT = "extracted_text"
52
59
  FIELD_VECTORS = "extracted_vectors"
@@ -73,7 +80,7 @@ class Field(Generic[PbType]):
73
80
  def __init__(
74
81
  self,
75
82
  id: str,
76
- resource: Any,
83
+ resource: Resource,
77
84
  pb: Optional[Any] = None,
78
85
  value: Optional[Any] = None,
79
86
  ):
@@ -88,7 +95,7 @@ class Field(Generic[PbType]):
88
95
  self.question_answers = None
89
96
 
90
97
  self.id: str = id
91
- self.resource: Any = resource
98
+ self.resource = resource
92
99
 
93
100
  if value is not None:
94
101
  newpb = self.pbklass()
@@ -119,11 +126,20 @@ class Field(Generic[PbType]):
119
126
  def get_storage_field(self, field_type: FieldTypes) -> StorageField:
120
127
  return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)
121
128
 
122
- def _get_extracted_vectors_storage_field(self, vectorset: Optional[str] = None) -> StorageField:
123
- if vectorset:
129
+ def _get_extracted_vectors_storage_field(
130
+ self,
131
+ vectorset: str,
132
+ storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
133
+ ) -> StorageField:
134
+ if storage_key_kind == VectorSetConfig.StorageKeyKind.LEGACY:
135
+ key = FieldTypes.FIELD_VECTORS.value
136
+ elif storage_key_kind == VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX:
124
137
  key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
125
138
  else:
126
- key = FieldTypes.FIELD_VECTORS.value
139
+ raise ValueError(
140
+ f"Can't do anything with UNSET or unknown vectorset storage key kind: {storage_key_kind}"
141
+ )
142
+
127
143
  return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
128
144
 
129
145
  async def db_get_value(self) -> Optional[PbType]:
@@ -163,7 +179,8 @@ class Field(Generic[PbType]):
163
179
  field_id=self.id,
164
180
  )
165
181
  await self.delete_extracted_text()
166
- await self.delete_vectors()
182
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(self.resource.txn, kbid=self.kbid):
183
+ await self.delete_vectors(vectorset_id, vs.storage_key_kind)
167
184
  await self.delete_metadata()
168
185
  await self.delete_question_answers()
169
186
 
@@ -181,9 +198,13 @@ class Field(Generic[PbType]):
181
198
  except KeyError:
182
199
  pass
183
200
 
184
- async def delete_vectors(self, vectorset: Optional[str] = None) -> None:
201
+ async def delete_vectors(
202
+ self,
203
+ vectorset: str,
204
+ storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
205
+ ) -> None:
185
206
  # Try delete vectors
186
- sf = self._get_extracted_vectors_storage_field(vectorset)
207
+ sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
187
208
  try:
188
209
  await self.storage.delete_upload(sf.key, sf.bucket)
189
210
  except KeyError:
@@ -328,12 +349,17 @@ class Field(Generic[PbType]):
328
349
  self.extracted_text = payload
329
350
  return self.extracted_text
330
351
 
331
- async def set_vectors(self, payload: ExtractedVectorsWrapper) -> Optional[VectorObject]:
332
- vectorset = payload.vectorset_id or None
352
+ async def set_vectors(
353
+ self,
354
+ payload: ExtractedVectorsWrapper,
355
+ vectorset: str,
356
+ storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
357
+ ) -> Optional[VectorObject]:
333
358
  if self.type in SUBFIELDFIELDS:
334
359
  try:
335
360
  actual_payload: Optional[VectorObject] = await self.get_vectors(
336
361
  vectorset=vectorset,
362
+ storage_key_kind=storage_key_kind,
337
363
  force=True,
338
364
  )
339
365
  except KeyError:
@@ -341,7 +367,7 @@ class Field(Generic[PbType]):
341
367
  else:
342
368
  actual_payload = None
343
369
 
344
- sf = self._get_extracted_vectors_storage_field(vectorset)
370
+ sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
345
371
  vo: Optional[VectorObject] = None
346
372
  if actual_payload is None:
347
373
  # Its first extracted text
@@ -373,14 +399,13 @@ class Field(Generic[PbType]):
373
399
  return vo
374
400
 
375
401
  async def get_vectors(
376
- self, vectorset: Optional[str] = None, force: bool = False
402
+ self,
403
+ vectorset: str,
404
+ storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
405
+ force: bool = False,
377
406
  ) -> Optional[VectorObject]:
378
- # compat with vectorsets coming from protobuffers where no value is
379
- # empty string instead of None. This shouldn't be handled here but we
380
- # have to make sure it gets the correct vectorset
381
- vectorset = vectorset or None
382
407
  if self.extracted_vectors.get(vectorset, None) is None or force:
383
- sf = self._get_extracted_vectors_storage_field(vectorset)
408
+ sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
384
409
  payload = await self.storage.download_pb(sf, VectorObject)
385
410
  if payload is not None:
386
411
  self.extracted_vectors[vectorset] = payload
@@ -246,9 +246,10 @@ class ResourceBrain:
246
246
  field_id: str,
247
247
  vo: utils_pb2.VectorObject,
248
248
  *,
249
- vectorset: Optional[str] = None,
249
+ vectorset: str,
250
250
  replace_field: bool = False,
251
- matryoshka_vector_dimension: Optional[int] = None,
251
+ # cut to specific dimension if specified
252
+ vector_dimension: Optional[int] = None,
252
253
  ):
253
254
  fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
254
255
  for subfield, vectors in vo.split_vectors.items():
@@ -277,7 +278,7 @@ class ResourceBrain:
277
278
  sentence_key,
278
279
  vector,
279
280
  vectorset=vectorset,
280
- matryoshka_vector_dimension=matryoshka_vector_dimension,
281
+ vector_dimension=vector_dimension,
281
282
  )
282
283
 
283
284
  _field_id = ids.FieldId(
@@ -303,16 +304,12 @@ class ResourceBrain:
303
304
  sentence_key,
304
305
  vector,
305
306
  vectorset=vectorset,
306
- matryoshka_vector_dimension=matryoshka_vector_dimension,
307
+ vector_dimension=vector_dimension,
307
308
  )
308
309
 
309
310
  if replace_field:
310
311
  full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
311
- if vectorset is None:
312
- # DEPRECATED
313
- self.brain.sentences_to_delete.append(full_field_id)
314
- else:
315
- self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
312
+ self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
316
313
 
317
314
  def _apply_field_vector(
318
315
  self,
@@ -321,22 +318,15 @@ class ResourceBrain:
321
318
  sentence_key: ids.VectorId,
322
319
  vector: utils_pb2.Vector,
323
320
  *,
324
- vectorset: Optional[str],
325
- matryoshka_vector_dimension: Optional[int] = None,
321
+ vectorset: str,
322
+ # cut vectors if a specific dimension is specified
323
+ vector_dimension: Optional[int] = None,
326
324
  ):
327
325
  paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
328
- if vectorset:
329
- sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
330
- else:
331
- sentence_pb = paragraph_pb.sentences[sentence_key.full()]
326
+ sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
332
327
 
333
328
  sentence_pb.ClearField("vector") # clear first to prevent duplicates
334
-
335
- # cut vectors if a specific dimension is specified
336
- if matryoshka_vector_dimension is not None:
337
- sentence_pb.vector.extend(vector.vector[:matryoshka_vector_dimension])
338
- else:
339
- sentence_pb.vector.extend(vector.vector)
329
+ sentence_pb.vector.extend(vector.vector[:vector_dimension])
340
330
 
341
331
  # we only care about start/stop position of the paragraph for a given sentence here
342
332
  # the key has the sentence position
@@ -20,11 +20,13 @@
20
20
 
21
21
  from typing import cast
22
22
 
23
+ from nucliadb.common import datamanagers
23
24
  from nucliadb.ingest.fields.base import Field
24
25
  from nucliadb.ingest.fields.conversation import Conversation
25
26
  from nucliadb.ingest.fields.file import File
26
27
  from nucliadb.ingest.fields.link import Link
27
28
  from nucliadb.ingest.orm.resource import Resource
29
+ from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
28
30
  from nucliadb_protos.resources_pb2 import (
29
31
  ExtractedTextWrapper,
30
32
  ExtractedVectorsWrapper,
@@ -90,7 +92,12 @@ class _BrokerMessageBuilder:
90
92
  self.bm.link_extracted_data.append(link_extracted_data)
91
93
 
92
94
  # Field vectors
93
- await self.generate_field_vectors(type_id, field_id, field)
95
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(
96
+ resource.txn, kbid=resource.kb.kbid
97
+ ):
98
+ await self.generate_field_vectors(
99
+ type_id, field_id, field, vectorset_id, vs.storage_key_kind
100
+ )
94
101
 
95
102
  # Large metadata
96
103
  await self.generate_field_large_computed_metadata(type_id, field_id, field)
@@ -155,13 +162,16 @@ class _BrokerMessageBuilder:
155
162
  type_id: FieldType.ValueType,
156
163
  field_id: str,
157
164
  field: Field,
165
+ vectorset: str,
166
+ storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
158
167
  ):
159
- vo = await field.get_vectors()
168
+ vo = await field.get_vectors(vectorset, storage_key_kind)
160
169
  if vo is None:
161
170
  return
162
171
  evw = ExtractedVectorsWrapper()
163
172
  evw.field.field = field_id
164
173
  evw.field.field_type = type_id
174
+ evw.vectorset_id = vectorset
165
175
  evw.vectors.CopyFrom(vo)
166
176
  self.bm.field_vectors.append(evw)
167
177
 
@@ -57,6 +57,7 @@ from nucliadb_protos.knowledgebox_pb2 import (
57
57
  KnowledgeBoxConfig,
58
58
  SemanticModelMetadata,
59
59
  StoredExternalIndexProviderMetadata,
60
+ VectorSetPurge,
60
61
  )
61
62
  from nucliadb_protos.resources_pb2 import Basic
62
63
  from nucliadb_utils.settings import is_onprem_nucliadb
@@ -103,9 +104,9 @@ class KnowledgeBox:
103
104
  *,
104
105
  kbid: str,
105
106
  slug: str,
107
+ semantic_models: dict[str, SemanticModelMetadata],
106
108
  title: str = "",
107
109
  description: str = "",
108
- semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
109
110
  external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
110
111
  hidden_resources_enabled: bool = False,
111
112
  hidden_resources_hide_on_creation: bool = False,
@@ -120,7 +121,7 @@ class KnowledgeBox:
120
121
  raise KnowledgeBoxCreationError(
121
122
  "Cannot hide new resources if the hidden resources feature is disabled"
122
123
  )
123
- if semantic_models is None or len(semantic_models) == 0:
124
+ if len(semantic_models) == 0:
124
125
  raise KnowledgeBoxCreationError("KB must define at least one semantic model")
125
126
 
126
127
  rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
@@ -523,11 +524,21 @@ class KnowledgeBox:
523
524
  await shard_manager.create_vectorset(self.kbid, config)
524
525
 
525
526
  async def delete_vectorset(self, vectorset_id: str):
526
- await datamanagers.vectorsets.delete(self.txn, kbid=self.kbid, vectorset_id=vectorset_id)
527
+ vectorset_count = await datamanagers.vectorsets.count(self.txn, kbid=self.kbid)
528
+ if vectorset_count == 1:
529
+ raise VectorSetConflict("Deletion of your last vectorset is not allowed")
530
+
531
+ deleted = await datamanagers.vectorsets.delete(
532
+ self.txn, kbid=self.kbid, vectorset_id=vectorset_id
533
+ )
534
+ if deleted is None:
535
+ # already deleted
536
+ return
527
537
 
528
538
  # mark vectorset for async deletion
529
539
  deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
530
- await self.txn.set(deletion_mark_key, b"")
540
+ payload = VectorSetPurge(storage_key_kind=deleted.storage_key_kind)
541
+ await self.txn.set(deletion_mark_key, payload.SerializeToString())
531
542
 
532
543
  shard_manager = get_shard_manager()
533
544
  await shard_manager.delete_vectorset(self.kbid, vectorset_id)