nucliadb 6.3.5.post3980__py3-none-any.whl → 6.3.5.post3990__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,6 +29,7 @@ from typing import cast
 
 from nucliadb.common import datamanagers
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
+from nucliadb.ingest.orm.index_message import get_resource_index_message
 from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
 from nucliadb.migrator.context import ExecutionContext
 
@@ -73,8 +74,8 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
                 logger.warning(f"Could not load resource {rid} for kbid {kbid}")
                 continue
 
-            await resource.compute_global_tags(resource.indexer)
-            await pgcatalog_update(txn, kbid, resource)
+            index_message = await get_resource_index_message(resource, reindex=False)
+            await pgcatalog_update(txn, kbid, resource, index_message)
 
         await txn.commit()
         continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
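The catalog backfill above no longer calls resource.compute_global_tags() before updating the catalog: it builds a complete index message through the new nucliadb.ingest.orm.index_message helper and hands it to pgcatalog_update. A minimal sketch of the new flow, assuming a PG transaction and an iterable of loaded resources (only get_resource_index_message and pgcatalog_update come from this diff; the surrounding scaffolding is illustrative):

from nucliadb.ingest.orm.index_message import get_resource_index_message
from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update

async def backfill_catalog_batch(txn, kbid: str, resources) -> None:
    for resource in resources:
        # reindex=False: the catalog rows are being written fresh, not
        # layered on top of an existing index state.
        index_message = await get_resource_index_message(resource, reindex=False)
        await pgcatalog_update(txn, kbid, resource, index_message)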
@@ -38,7 +38,7 @@ from nucliadb_telemetry import errors
 from .utils import (
     delete_resource_from_shard,
     get_resource,
-    get_resource_index_message,
+    get_rollover_resource_index_message,
     index_resource_to_shard,
 )
 
@@ -288,7 +288,7 @@ async def _index_resource_to_rollover_index(
             f"Shard {shard_id} not found. Was a new one created during migration?"
         )
     resource = await get_resource(kbid, resource_id)
-    index_message = await get_resource_index_message(kbid, resource_id)
+    index_message = await get_rollover_resource_index_message(kbid, resource_id)
     if resource is None or index_message is None:
         # resource no longer existing, remove indexing and carry on
         async with datamanagers.with_transaction() as txn:
@@ -503,7 +503,7 @@ async def validate_indexed_data(
             await txn.commit()
             continue
 
-        index_message = await get_resource_index_message(kbid, resource_id)
+        index_message = await get_rollover_resource_index_message(kbid, resource_id)
         if index_message is None:
             logger.error(
                 "Resource index message not found while validating, skipping",
@@ -28,6 +28,7 @@ from nucliadb.common.cluster.manager import (
     StandaloneKBShardManager,
 )
 from nucliadb.common.cluster.settings import settings
+from nucliadb.ingest.orm import index_message
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb_protos import nodereader_pb2, writer_pb2
 from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
@@ -80,7 +81,9 @@ async def get_resource(kbid: str, resource_id: str) -> Optional[Resource]:
 
 
 @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
-async def get_resource_index_message(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
+async def get_rollover_resource_index_message(
+    kbid: str, resource_id: str
+) -> Optional[nodereader_pb2.Resource]:
     async with datamanagers.with_ro_transaction() as txn:
         resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
         if resource is None:
@@ -89,8 +92,9 @@ async def get_resource_index_message(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
                 extra={"kbid": kbid, "resource_id": resource_id},
             )
             return None
-        resource_index_message = (await resource.generate_index_message(reindex=False)).brain
-        return resource_index_message
+        # We set the reindex=False because we are indexing the resource for the first time in the
+        # newly created shards.
+        return await index_message.get_resource_index_message(resource, reindex=False)
 
 
 @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
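The rename to get_rollover_resource_index_message makes the helper's rollover-only purpose explicit and frees the shorter name for the shared builder imported above from nucliadb.ingest.orm. Functionally it is a refactor: the removed inline call and the new module-level helper produce the same nodereader_pb2.Resource ("brain") for a loaded resource. Side by side, as a sketch (both calls are taken from the removed and added lines; resource is assumed to be a loaded Resource object):

# Before: build the full index message on the resource and extract its brain
brain = (await resource.generate_index_message(reindex=False)).brain
# After: the shared builder in nucliadb.ingest.orm.index_message returns it directly
brain = await index_message.get_resource_index_message(resource, reindex=False)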
@@ -106,7 +110,7 @@ async def index_resource_to_shard(
     partitioning = app_context.partitioning
 
     if resource_index_message is None:
-        resource_index_message = await get_resource_index_message(kbid, resource_id)
+        resource_index_message = await get_rollover_resource_index_message(kbid, resource_id)
         if resource_index_message is None:
             return
 
@@ -36,7 +36,7 @@ from nucliadb.common.external_index_providers.base import (
     VectorsetExternalIndex,
 )
 from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
-from nucliadb.common.ids import FieldId, ParagraphId, VectorId
+from nucliadb.common.ids import ParagraphId, VectorId
 from nucliadb_models.search import SCORE_TYPE, TextPosition
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
 from nucliadb_protos import utils_pb2
@@ -418,52 +418,17 @@ class PineconeIndexManager(ExternalIndexManager):
         if len(delete_tasks) > 0:
             await asyncio.gather(*delete_tasks)
 
-    def get_vectorsets_in_resource(self, index_data: Resource) -> set[str]:
-        vectorsets: set[str] = set()
-        for _, paragraph in iter_paragraphs(index_data):
-            if not paragraph.sentences and not paragraph.vectorsets_sentences:
-                continue
-            if paragraph.sentences and self.default_vectorset:
-                vectorsets.add(self.default_vectorset)
-            for vectorset_id, vectorsets_sentences in paragraph.vectorsets_sentences.items():
-                if vectorsets_sentences.sentences:
-                    vectorsets.add(vectorset_id)
-            # Once we have found at least one paragraph with vectors, we can stop iterating
-            return vectorsets
-        return vectorsets
-
     def get_index_host(self, vectorset_id: str, rollover: bool = False) -> str:
         if rollover:
             return self.rollover_indexes[vectorset_id].index_host
         else:
             return self.indexes[vectorset_id].index_host
 
-    def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
-        prefixes_to_delete = set()
-        # TODO: migrate to vector_prefixes_to_delete
-        for field_id in index_data.sentences_to_delete:
-            try:
-                delete_vid = VectorId.from_string(field_id)
-                prefixes_to_delete.add(delete_vid.field_id.full())
-            except ValueError:  # pragma: no cover
-                try:
-                    delete_field = FieldId.from_string(field_id)
-                    prefixes_to_delete.add(delete_field.full())
-                except ValueError:
-                    logger.warning(f"Invalid id to delete sentences from: {field_id}.")
-                    continue
-        for paragraph_id in index_data.paragraphs_to_delete:
-            try:
-                delete_pid = ParagraphId.from_string(paragraph_id)
-                prefixes_to_delete.add(delete_pid.field_id.full())
-            except ValueError:  # pragma: no cover
-                try:
-                    delete_field = FieldId.from_string(paragraph_id)
-                    prefixes_to_delete.add(delete_field.full())
-                except ValueError:
-                    logger.warning(f"Invalid id to delete: {paragraph_id}. ParagraphId expected.")
-                    continue
-        return prefixes_to_delete
+    def get_prefixes_to_delete(self, index_data: Resource) -> dict[str, set[str]]:
+        return {
+            vectorset_id: set(prefixes_list.items)
+            for vectorset_id, prefixes_list in index_data.vector_prefixes_to_delete.items()
+        }
 
     async def _index_resource(
         self, resource_uuid: str, index_data: Resource, to_rollover_indexes: bool = False
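get_prefixes_to_delete now reads the index message's vector_prefixes_to_delete map, already keyed by vectorset, instead of re-deriving field prefixes from sentences_to_delete and paragraphs_to_delete via VectorId/FieldId parsing; that is also why the FieldId import could be dropped earlier in this diff. A sketch of the new return shape (manager and index_data are placeholders, and the vectorset ids and prefixes are invented for illustration):

prefixes = manager.get_prefixes_to_delete(index_data)
# One entry per vectorset with pending deletions, for example:
# {
#     "multilingual-2024": {"rid/f/title", "rid/f/body"},
#     "en-2023": {"rid/f/body"},
# }
for vectorset_id, field_prefixes in prefixes.items():
    ...  # delete vectors under each prefix in that vectorset's index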
@@ -480,10 +445,8 @@ class PineconeIndexManager(ExternalIndexManager):
         metadata with any specific sentence metadata. This is done for each vectorset.
         - Finally, upsert the vectors to each vectorset index in parallel.
         """
-        vectorsets = self.get_vectorsets_in_resource(index_data)
-        prefixes_to_delete = self.get_prefixes_to_delete(index_data)
         delete_tasks = []
-        for vectorset in vectorsets:
+        for vectorset, prefixes_to_delete in self.get_prefixes_to_delete(index_data).items():
             index_host = self.get_index_host(vectorset_id=vectorset, rollover=to_rollover_indexes)
             delete_tasks.append(
                 asyncio.create_task(
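Because the mapping already says which vectorsets have work to do, _index_resource drops the separate get_vectorsets_in_resource pass and drives the deletion fan-out straight off the dict items, gathering the tasks in parallel as the docstring describes. A sketch of the complete pattern the truncated hunk implies (delete_by_prefixes stands in for whatever coroutine the manager actually schedules per index; only get_prefixes_to_delete and get_index_host come from the diff):

delete_tasks = []
for vectorset, prefixes_to_delete in self.get_prefixes_to_delete(index_data).items():
    index_host = self.get_index_host(vectorset_id=vectorset, rollover=to_rollover_indexes)
    delete_tasks.append(
        asyncio.create_task(delete_by_prefixes(index_host, prefixes_to_delete))
    )
if delete_tasks:
    await asyncio.gather(*delete_tasks)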
@@ -29,3 +29,7 @@ class InvalidPBClass(Exception):
         self.source = source
         self.destination = destination
         super().__init__(f"Source and destination does not match {self.source} - {self.destination}")
+
+
+class FieldAuthorNotFound(Exception):
+    pass
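Finally, a new FieldAuthorNotFound exception joins InvalidPBClass in this exceptions module. The diff only introduces the class, so where it gets raised is not shown; a purely hypothetical, self-contained example of the intended pattern:

from dataclasses import dataclass

class FieldAuthorNotFound(Exception):
    pass

@dataclass
class FieldMetadata:  # stand-in for the real field metadata object
    author: str | None = None

def field_author(meta: FieldMetadata) -> str:
    # Raise the new exception when no author was recorded for the field.
    if not meta.author:
        raise FieldAuthorNotFound(f"No author recorded for field {meta!r}")
    return meta.author

try:
    print(field_author(FieldMetadata()))
except FieldAuthorNotFound:
    print("author unknown")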