nucliadb 6.3.5.post3985__py3-none-any.whl → 6.3.5.post3990__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +3 -2
- nucliadb/common/cluster/rollover.py +3 -3
- nucliadb/common/cluster/utils.py +8 -4
- nucliadb/common/external_index_providers/pinecone.py +7 -44
- nucliadb/ingest/fields/exceptions.py +4 -0
- nucliadb/ingest/orm/brain_v2.py +782 -0
- nucliadb/ingest/orm/index_message.py +409 -0
- nucliadb/ingest/orm/metrics.py +1 -1
- nucliadb/ingest/orm/processor/pgcatalog.py +3 -2
- nucliadb/ingest/orm/processor/processor.py +61 -47
- nucliadb/ingest/orm/resource.py +70 -50
- nucliadb/ingest/service/writer.py +2 -2
- nucliadb/writer/api/v1/resource.py +1 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3990.dist-info}/METADATA +6 -6
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3990.dist-info}/RECORD +18 -16
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3990.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3990.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3990.dist-info}/top_level.txt +0 -0
migrations/0023_backfill_pg_catalog.py
CHANGED
@@ -29,6 +29,7 @@ from typing import cast
 
 from nucliadb.common import datamanagers
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
+from nucliadb.ingest.orm.index_message import get_resource_index_message
 from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
 from nucliadb.migrator.context import ExecutionContext
 
@@ -73,8 +74,8 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
                 logger.warning(f"Could not load resource {rid} for kbid {kbid}")
                 continue
 
-            await resource.generate_index_message(reindex=False)
-            await pgcatalog_update(txn, kbid, resource)
+            index_message = await get_resource_index_message(resource, reindex=False)
+            await pgcatalog_update(txn, kbid, resource, index_message)
 
         await txn.commit()
         continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
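Net effect of the migration change: the pgcatalog backfill no longer lets the Resource object drive index-message generation; it builds the message explicitly with the new nucliadb.ingest.orm.index_message module and passes it to pgcatalog_update. A minimal sketch of the new calling convention, assuming an already-loaded Resource and an open PG transaction (backfill_one is a hypothetical wrapper, not a name from the diff):

from nucliadb.ingest.orm.index_message import get_resource_index_message
from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update


async def backfill_one(txn, kbid: str, resource) -> None:
    # Build the index message outside the Resource class; reindex=False since
    # the catalog row is being written for the first time by this backfill.
    index_message = await get_resource_index_message(resource, reindex=False)
    # pgcatalog_update now receives the index message as an explicit argument.
    await pgcatalog_update(txn, kbid, resource, index_message)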
nucliadb/common/cluster/rollover.py
CHANGED
@@ -38,7 +38,7 @@ from nucliadb_telemetry import errors
 from .utils import (
     delete_resource_from_shard,
     get_resource,
-    get_resource_index_message,
+    get_rollover_resource_index_message,
     index_resource_to_shard,
 )
 
@@ -288,7 +288,7 @@ async def _index_resource_to_rollover_index(
             f"Shard {shard_id} not found. Was a new one created during migration?"
         )
     resource = await get_resource(kbid, resource_id)
-    index_message = await get_resource_index_message(kbid, resource_id)
+    index_message = await get_rollover_resource_index_message(kbid, resource_id)
     if resource is None or index_message is None:
         # resource no longer existing, remove indexing and carry on
         async with datamanagers.with_transaction() as txn:
@@ -503,7 +503,7 @@ async def validate_indexed_data(
             await txn.commit()
             continue
 
-        index_message = await get_resource_index_message(kbid, resource_id)
+        index_message = await get_rollover_resource_index_message(kbid, resource_id)
         if index_message is None:
             logger.error(
                 "Resource index message not found while validating, skipping",
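Both rollover call sites now go through the renamed helper. A sketch of the fetch-then-check pattern the two hunks share, assuming kbid and resource_id identify a resource being copied into a rollover shard (load_for_rollover is a hypothetical name):

from typing import Optional

from nucliadb.common.cluster.utils import (
    get_resource,
    get_rollover_resource_index_message,
)
from nucliadb_protos import nodereader_pb2


async def load_for_rollover(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
    # Mirrors _index_resource_to_rollover_index: fetch the ORM resource and its
    # index message; if either is gone, the resource was deleted mid-rollover
    # and the caller cleans up its pending-index marker instead of indexing.
    resource = await get_resource(kbid, resource_id)
    index_message = await get_rollover_resource_index_message(kbid, resource_id)
    if resource is None or index_message is None:
        return None
    return index_message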
nucliadb/common/cluster/utils.py
CHANGED
@@ -28,6 +28,7 @@ from nucliadb.common.cluster.manager import (
     StandaloneKBShardManager,
 )
 from nucliadb.common.cluster.settings import settings
+from nucliadb.ingest.orm import index_message
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb_protos import nodereader_pb2, writer_pb2
 from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
@@ -80,7 +81,9 @@ async def get_resource(kbid: str, resource_id: str) -> Optional[Resource]:
 
 
 @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
-async def get_resource_index_message(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
+async def get_rollover_resource_index_message(
+    kbid: str, resource_id: str
+) -> Optional[nodereader_pb2.Resource]:
     async with datamanagers.with_ro_transaction() as txn:
         resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
         if resource is None:
@@ -89,8 +92,9 @@ async def get_resource_index_message(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
                 extra={"kbid": kbid, "resource_id": resource_id},
             )
             return None
-
-        return (await resource.generate_index_message(reindex=False)).brain
+        # We set the reindex=False because we are indexing the resource for the first time in the
+        # newly created shards.
+        return await index_message.get_resource_index_message(resource, reindex=False)
 
 
 @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
@@ -106,7 +110,7 @@ async def index_resource_to_shard(
     partitioning = app_context.partitioning
 
     if resource_index_message is None:
-        resource_index_message = await get_resource_index_message(kbid, resource_id)
+        resource_index_message = await get_rollover_resource_index_message(kbid, resource_id)
     if resource_index_message is None:
         return
 
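The rename frees the old name for the new module-level builder in nucliadb.ingest.orm.index_message, which the helper now delegates to. A sketch of calling that builder directly, assuming (as the helper does) that the Resource is loaded inside a read-only transaction; build_index_message is a hypothetical name:

from nucliadb.common import datamanagers
from nucliadb.ingest.orm import index_message


async def build_index_message(kbid: str, rid: str):
    async with datamanagers.with_ro_transaction() as txn:
        resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
        if resource is None:
            return None
        # reindex=False: the target shards are seeing this resource for the
        # first time, so nothing needs to be marked for re-indexing.
        return await index_message.get_resource_index_message(resource, reindex=False)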
nucliadb/common/external_index_providers/pinecone.py
CHANGED
@@ -36,7 +36,7 @@ from nucliadb.common.external_index_providers.base import (
     VectorsetExternalIndex,
 )
 from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
-from nucliadb.common.ids import FieldId, ParagraphId, VectorId
+from nucliadb.common.ids import ParagraphId, VectorId
 from nucliadb_models.search import SCORE_TYPE, TextPosition
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
 from nucliadb_protos import utils_pb2
@@ -418,52 +418,17 @@ class PineconeIndexManager(ExternalIndexManager):
         if len(delete_tasks) > 0:
             await asyncio.gather(*delete_tasks)
 
-    def get_vectorsets_in_resource(self, index_data: Resource) -> set[str]:
-        vectorsets: set[str] = set()
-        for _, paragraph in iter_paragraphs(index_data):
-            if not paragraph.sentences and not paragraph.vectorsets_sentences:
-                continue
-            if paragraph.sentences and self.default_vectorset:
-                vectorsets.add(self.default_vectorset)
-            for vectorset_id, vectorsets_sentences in paragraph.vectorsets_sentences.items():
-                if vectorsets_sentences.sentences:
-                    vectorsets.add(vectorset_id)
-            # Once we have found at least one paragraph with vectors, we can stop iterating
-            return vectorsets
-        return vectorsets
-
     def get_index_host(self, vectorset_id: str, rollover: bool = False) -> str:
         if rollover:
             return self.rollover_indexes[vectorset_id].index_host
         else:
             return self.indexes[vectorset_id].index_host
 
-    def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
-        prefixes_to_delete: set[str] = set()
-
-        for field_id in index_data.sentences_to_delete:
-            try:
-                delete_vid = VectorId.from_string(field_id)
-                prefixes_to_delete.add(delete_vid.field_id.full())
-            except ValueError:  # pragma: no cover
-                try:
-                    delete_field = FieldId.from_string(field_id)
-                    prefixes_to_delete.add(delete_field.full())
-                except ValueError:
-                    logger.warning(f"Invalid id to delete sentences from: {field_id}.")
-                    continue
-        for paragraph_id in index_data.paragraphs_to_delete:
-            try:
-                delete_pid = ParagraphId.from_string(paragraph_id)
-                prefixes_to_delete.add(delete_pid.field_id.full())
-            except ValueError:  # pragma: no cover
-                try:
-                    delete_field = FieldId.from_string(paragraph_id)
-                    prefixes_to_delete.add(delete_field.full())
-                except ValueError:
-                    logger.warning(f"Invalid id to delete: {paragraph_id}. ParagraphId expected.")
-                    continue
-        return prefixes_to_delete
+    def get_prefixes_to_delete(self, index_data: Resource) -> dict[str, set[str]]:
+        return {
+            vectorset_id: set(prefixes_list.items)
+            for vectorset_id, prefixes_list in index_data.vector_prefixes_to_delete.items()
+        }
 
     async def _index_resource(
         self, resource_uuid: str, index_data: Resource, to_rollover_indexes: bool = False
|
@@ -480,10 +445,8 @@ class PineconeIndexManager(ExternalIndexManager):
|
|
480
445
|
metadata with any specific sentence metadata. This is done for each vectorset.
|
481
446
|
- Finally, upsert the vectors to each vectorset index in parallel.
|
482
447
|
"""
|
483
|
-
vectorsets = self.get_vectorsets_in_resource(index_data)
|
484
|
-
prefixes_to_delete = self.get_prefixes_to_delete(index_data)
|
485
448
|
delete_tasks = []
|
486
|
-
for vectorset in
|
449
|
+
for vectorset, prefixes_to_delete in self.get_prefixes_to_delete(index_data).items():
|
487
450
|
index_host = self.get_index_host(vectorset_id=vectorset, rollover=to_rollover_indexes)
|
488
451
|
delete_tasks.append(
|
489
452
|
asyncio.create_task(
|