nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -27,10 +27,11 @@ Backfill the data into the PG catalog
|
|
|
27
27
|
import logging
|
|
28
28
|
from typing import cast
|
|
29
29
|
|
|
30
|
-
from nucliadb.common import
|
|
30
|
+
from nucliadb.common.catalog import catalog_update, get_catalog
|
|
31
|
+
from nucliadb.common.catalog.pg import PGCatalog
|
|
31
32
|
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
|
32
33
|
from nucliadb.ingest.orm.index_message import get_resource_index_message
|
|
33
|
-
from nucliadb.ingest.orm.
|
|
34
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
34
35
|
from nucliadb.migrator.context import ExecutionContext
|
|
35
36
|
|
|
36
37
|
logger = logging.getLogger(__name__)
|
|
@@ -43,6 +44,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
43
44
|
if not isinstance(context.kv_driver, PGDriver):
|
|
44
45
|
return
|
|
45
46
|
|
|
47
|
+
if not isinstance(get_catalog(), PGCatalog):
|
|
48
|
+
return
|
|
49
|
+
|
|
46
50
|
BATCH_SIZE = 100
|
|
47
51
|
async with context.kv_driver.rw_transaction() as txn:
|
|
48
52
|
txn = cast(PGTransaction, txn)
|
|
@@ -69,13 +73,13 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
69
73
|
# Index each resource
|
|
70
74
|
for rid in resources_to_index:
|
|
71
75
|
rid = str(rid).replace("-", "")
|
|
72
|
-
resource = await
|
|
76
|
+
resource = await Resource.get(txn, kbid=kbid, rid=rid)
|
|
73
77
|
if resource is None:
|
|
74
78
|
logger.warning(f"Could not load resource {rid} for kbid {kbid}")
|
|
75
79
|
continue
|
|
76
80
|
|
|
77
81
|
index_message = await get_resource_index_message(resource, reindex=False)
|
|
78
|
-
await
|
|
82
|
+
await catalog_update(txn, kbid, resource, index_message)
|
|
79
83
|
|
|
80
84
|
await txn.commit()
|
|
81
85
|
continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
|
|
@@ -39,7 +39,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
39
39
|
async with datamanagers.with_rw_transaction() as txn:
|
|
40
40
|
vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]
|
|
41
41
|
|
|
42
|
-
if len(vectorsets) == 0: # pragma:
|
|
42
|
+
if len(vectorsets) == 0: # pragma: no cover
|
|
43
43
|
# should never happen, everyone should have at least one
|
|
44
44
|
logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
|
|
45
45
|
return
|
|
@@ -24,7 +24,6 @@ Backfill field status (from error)
|
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
26
|
import logging
|
|
27
|
-
from typing import Optional
|
|
28
27
|
|
|
29
28
|
from nucliadb.migrator.context import ExecutionContext
|
|
30
29
|
from nucliadb_protos import resources_pb2, writer_pb2
|
|
@@ -33,7 +32,7 @@ logger = logging.getLogger(__name__)
|
|
|
33
32
|
|
|
34
33
|
|
|
35
34
|
async def migrate(context: ExecutionContext) -> None:
|
|
36
|
-
start:
|
|
35
|
+
start: str | None = ""
|
|
37
36
|
while True:
|
|
38
37
|
if start is None:
|
|
39
38
|
break
|
|
@@ -43,7 +42,7 @@ async def migrate(context: ExecutionContext) -> None:
|
|
|
43
42
|
async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
|
|
44
43
|
|
|
45
44
|
|
|
46
|
-
async def do_batch(context: ExecutionContext, start: str) ->
|
|
45
|
+
async def do_batch(context: ExecutionContext, start: str) -> str | None:
|
|
47
46
|
logger.info(f"Running batch from {start}")
|
|
48
47
|
async with context.kv_driver.rw_transaction() as txn:
|
|
49
48
|
async with txn.connection.cursor() as cur: # type: ignore
|
|
@@ -64,7 +63,7 @@ async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
|
|
|
64
63
|
field_keys = [r[0] for r in records]
|
|
65
64
|
|
|
66
65
|
# Retrieve resources basic (to check status)
|
|
67
|
-
resource_keys =
|
|
66
|
+
resource_keys = {"/".join(f.split("/")[:5]) for f in field_keys}
|
|
68
67
|
await cur.execute(
|
|
69
68
|
"""
|
|
70
69
|
SELECT key, value FROM resources
|
|
@@ -26,7 +26,6 @@ is stored in object storage.
|
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
28
|
import logging
|
|
29
|
-
from typing import Optional
|
|
30
29
|
|
|
31
30
|
from nucliadb.migrator.context import ExecutionContext
|
|
32
31
|
|
|
@@ -34,7 +33,7 @@ logger = logging.getLogger(__name__)
|
|
|
34
33
|
|
|
35
34
|
|
|
36
35
|
async def migrate(context: ExecutionContext) -> None:
|
|
37
|
-
start:
|
|
36
|
+
start: str | None = ""
|
|
38
37
|
while True:
|
|
39
38
|
if start is None:
|
|
40
39
|
break
|
|
@@ -45,7 +44,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
45
44
|
pass
|
|
46
45
|
|
|
47
46
|
|
|
48
|
-
async def do_batch(context: ExecutionContext, start: str) ->
|
|
47
|
+
async def do_batch(context: ExecutionContext, start: str) -> str | None:
|
|
49
48
|
logger.info(f"Running batch from {start}")
|
|
50
49
|
async with context.kv_driver.rw_transaction() as txn:
|
|
51
50
|
async with txn.connection.cursor() as cur: # type: ignore
|
|
@@ -27,10 +27,11 @@ Backfill the catalog with labels from fields metadata
|
|
|
27
27
|
import logging
|
|
28
28
|
from typing import cast
|
|
29
29
|
|
|
30
|
-
from nucliadb.common import
|
|
30
|
+
from nucliadb.common.catalog import catalog_update, get_catalog
|
|
31
|
+
from nucliadb.common.catalog.pg import PGCatalog
|
|
31
32
|
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
|
32
33
|
from nucliadb.ingest.orm.index_message import get_resource_index_message
|
|
33
|
-
from nucliadb.ingest.orm.
|
|
34
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
34
35
|
from nucliadb.migrator.context import ExecutionContext
|
|
35
36
|
from nucliadb_protos import resources_pb2
|
|
36
37
|
|
|
@@ -44,6 +45,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
44
45
|
if not isinstance(context.kv_driver, PGDriver):
|
|
45
46
|
return
|
|
46
47
|
|
|
48
|
+
if not isinstance(get_catalog(), PGCatalog):
|
|
49
|
+
return
|
|
50
|
+
|
|
47
51
|
BATCH_SIZE = 100
|
|
48
52
|
async with context.kv_driver.rw_transaction() as txn:
|
|
49
53
|
txn = cast(PGTransaction, txn)
|
|
@@ -78,13 +82,13 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
|
78
82
|
# Index each resource
|
|
79
83
|
for key in to_index:
|
|
80
84
|
rid = key.split("/")[4]
|
|
81
|
-
resource = await
|
|
85
|
+
resource = await Resource.get(txn, kbid=kbid, rid=rid)
|
|
82
86
|
if resource is None:
|
|
83
87
|
logger.warning(f"Could not load resource {rid} for kbid {kbid}")
|
|
84
88
|
continue
|
|
85
89
|
|
|
86
90
|
index_message = await get_resource_index_message(resource, reindex=False)
|
|
87
|
-
await
|
|
91
|
+
await catalog_update(txn, kbid, resource, index_message)
|
|
88
92
|
|
|
89
93
|
if to_index:
|
|
90
94
|
await txn.commit()
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
"""Migration #39
|
|
22
|
+
|
|
23
|
+
Backfill splits metadata on conversation fields
|
|
24
|
+
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import logging
|
|
28
|
+
from typing import cast
|
|
29
|
+
|
|
30
|
+
from nucliadb.common.maindb.driver import Transaction
|
|
31
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
|
32
|
+
from nucliadb.ingest.fields.conversation import (
|
|
33
|
+
CONVERSATION_SPLITS_METADATA,
|
|
34
|
+
Conversation,
|
|
35
|
+
)
|
|
36
|
+
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
|
|
37
|
+
from nucliadb.migrator.context import ExecutionContext
|
|
38
|
+
from nucliadb_protos import resources_pb2
|
|
39
|
+
from nucliadb_protos.resources_pb2 import SplitsMetadata
|
|
40
|
+
from nucliadb_utils.storages.storage import Storage
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
async def migrate(context: ExecutionContext) -> None: ...
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
49
|
+
BATCH_SIZE = 100
|
|
50
|
+
start = ""
|
|
51
|
+
while True:
|
|
52
|
+
to_fix: list[tuple[str, str]] = []
|
|
53
|
+
async with context.kv_driver.rw_transaction() as txn:
|
|
54
|
+
txn = cast(PGTransaction, txn)
|
|
55
|
+
async with txn.connection.cursor() as cur:
|
|
56
|
+
# Retrieve a bunch of conversation fields
|
|
57
|
+
await cur.execute(
|
|
58
|
+
"""
|
|
59
|
+
SELECT key FROM resources
|
|
60
|
+
WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
|
|
61
|
+
AND key > %s
|
|
62
|
+
ORDER BY key
|
|
63
|
+
LIMIT %s""",
|
|
64
|
+
(kbid, start, BATCH_SIZE),
|
|
65
|
+
)
|
|
66
|
+
rows = await cur.fetchall()
|
|
67
|
+
if len(rows) == 0:
|
|
68
|
+
return
|
|
69
|
+
for row in rows:
|
|
70
|
+
key = row[0]
|
|
71
|
+
start = key
|
|
72
|
+
rid = key.split("/")[4]
|
|
73
|
+
field_id = key.split("/")[7]
|
|
74
|
+
to_fix.append((rid, field_id))
|
|
75
|
+
|
|
76
|
+
for rid, field_id in to_fix:
|
|
77
|
+
async with context.kv_driver.rw_transaction() as txn2:
|
|
78
|
+
splits_metadata = await build_splits_metadata(
|
|
79
|
+
txn2, context.blob_storage, kbid, rid, field_id
|
|
80
|
+
)
|
|
81
|
+
splits_metadata_key = CONVERSATION_SPLITS_METADATA.format(
|
|
82
|
+
kbid=kbid, uuid=rid, type="c", field=field_id
|
|
83
|
+
)
|
|
84
|
+
await txn2.set(splits_metadata_key, splits_metadata.SerializeToString())
|
|
85
|
+
await txn2.commit()
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
async def build_splits_metadata(
|
|
89
|
+
txn: Transaction, storage: Storage, kbid: str, rid: str, field_id: str
|
|
90
|
+
) -> SplitsMetadata:
|
|
91
|
+
splits_metadata = SplitsMetadata()
|
|
92
|
+
kb_orm = KnowledgeBoxORM(txn, storage, kbid)
|
|
93
|
+
resource_obj = await kb_orm.get(rid)
|
|
94
|
+
if resource_obj is None:
|
|
95
|
+
return splits_metadata
|
|
96
|
+
field_obj: Conversation = await resource_obj.get_field(
|
|
97
|
+
field_id, resources_pb2.FieldType.CONVERSATION, load=False
|
|
98
|
+
)
|
|
99
|
+
conv_metadata = await field_obj.get_metadata()
|
|
100
|
+
for i in range(1, conv_metadata.pages + 1):
|
|
101
|
+
page = await field_obj.get_value(page=i)
|
|
102
|
+
if page is None:
|
|
103
|
+
continue
|
|
104
|
+
for message in page.messages:
|
|
105
|
+
splits_metadata.metadata.get_or_create(message.ident)
|
|
106
|
+
return splits_metadata
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
"""Migration #40
|
|
22
|
+
|
|
23
|
+
Replaces deprecated and removed generative models from search configurations
|
|
24
|
+
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import logging
|
|
28
|
+
from typing import cast
|
|
29
|
+
|
|
30
|
+
from nucliadb.common import datamanagers
|
|
31
|
+
from nucliadb.migrator.context import ExecutionContext
|
|
32
|
+
from nucliadb_models.configuration import SearchConfiguration
|
|
33
|
+
from nucliadb_models.search import AskRequest, FindRequest
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
REPLACEMENTS = {
|
|
38
|
+
"claude-3-5-small": "claude-4-5-sonnet",
|
|
39
|
+
"gcp-claude-3-5-sonnet-v2": "gcp-claude-4-5-sonnet",
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
async def migrate(context: ExecutionContext) -> None: ...
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
47
|
+
affected = await get_affected_search_configurations(kbid)
|
|
48
|
+
if not affected:
|
|
49
|
+
return
|
|
50
|
+
|
|
51
|
+
async with datamanagers.with_rw_transaction() as txn:
|
|
52
|
+
for name, config in affected.items():
|
|
53
|
+
logger.info(
|
|
54
|
+
"Migrating search config for kb",
|
|
55
|
+
extra={
|
|
56
|
+
"kbid": kbid,
|
|
57
|
+
"search_config": name,
|
|
58
|
+
"generative_model": config.config.generative_model, # type: ignore
|
|
59
|
+
},
|
|
60
|
+
)
|
|
61
|
+
config.config.generative_model = REPLACEMENTS[config.config.generative_model] # type: ignore
|
|
62
|
+
await datamanagers.search_configurations.set(txn, kbid=kbid, name=name, config=config)
|
|
63
|
+
await txn.commit()
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def get_affected_search_configurations(kbid: str) -> dict[str, SearchConfiguration]:
|
|
67
|
+
result: dict[str, SearchConfiguration] = {}
|
|
68
|
+
async with datamanagers.with_ro_transaction() as txn:
|
|
69
|
+
search_configs = await datamanagers.search_configurations.list(txn, kbid=kbid)
|
|
70
|
+
for name, config in search_configs.items():
|
|
71
|
+
if config.kind == "find":
|
|
72
|
+
find_config = cast(FindRequest, config.config)
|
|
73
|
+
if find_config.generative_model in REPLACEMENTS:
|
|
74
|
+
result[name] = config
|
|
75
|
+
elif config.kind == "ask":
|
|
76
|
+
ask_config = cast(AskRequest, config.config)
|
|
77
|
+
if ask_config.generative_model in REPLACEMENTS:
|
|
78
|
+
result[name] = config
|
|
79
|
+
return result
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
import logging
|
|
21
|
+
import uuid
|
|
22
|
+
from collections.abc import AsyncIterator
|
|
23
|
+
from typing import cast
|
|
24
|
+
|
|
25
|
+
from nucliadb.common import datamanagers
|
|
26
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
|
27
|
+
from nucliadb.ingest.orm.index_message import get_resource_index_message
|
|
28
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
29
|
+
from nucliadb.migrator.context import ExecutionContext
|
|
30
|
+
from nucliadb_protos.writer_pb2 import ShardObject, Shards
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
async def migrate(context: ExecutionContext) -> None: ...
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
|
39
|
+
"""
|
|
40
|
+
Reindex resources that have conversation fields
|
|
41
|
+
"""
|
|
42
|
+
kb_shards = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid, for_update=False)
|
|
43
|
+
if kb_shards is not None:
|
|
44
|
+
async for rid in iter_affected_resource_ids(context, kbid):
|
|
45
|
+
await reindex_resource(context, kbid, rid, kb_shards)
|
|
46
|
+
else:
|
|
47
|
+
logger.warning(
|
|
48
|
+
"Migration 41: KB shards not found, skipping reindexing",
|
|
49
|
+
extra={"kbid": kbid},
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
async def reindex_resource(
|
|
54
|
+
context: ExecutionContext,
|
|
55
|
+
kbid: str,
|
|
56
|
+
rid: str,
|
|
57
|
+
kb_shards: Shards,
|
|
58
|
+
) -> None:
|
|
59
|
+
"""
|
|
60
|
+
Reindex a single resource
|
|
61
|
+
"""
|
|
62
|
+
async with datamanagers.with_ro_transaction() as rs_txn:
|
|
63
|
+
# Fetch the resource
|
|
64
|
+
resource = await Resource.get(rs_txn, kbid=kbid, rid=rid)
|
|
65
|
+
if resource is None:
|
|
66
|
+
logger.warning(
|
|
67
|
+
"Migration 41: Resource not found, skipping reindexing",
|
|
68
|
+
extra={"kbid": kbid, "rid": rid},
|
|
69
|
+
)
|
|
70
|
+
return
|
|
71
|
+
|
|
72
|
+
# Get the shard for the resource
|
|
73
|
+
shard: ShardObject | None = None
|
|
74
|
+
shard_id = await datamanagers.resources.get_resource_shard_id(
|
|
75
|
+
rs_txn, kbid=kbid, rid=rid, for_update=False
|
|
76
|
+
)
|
|
77
|
+
if shard_id is not None:
|
|
78
|
+
shard = next((shard for shard in kb_shards.shards if shard.shard == shard_id), None)
|
|
79
|
+
if shard is None:
|
|
80
|
+
logger.warning(
|
|
81
|
+
"Migration 41: Shard not found for resource, skipping reindexing",
|
|
82
|
+
extra={"kbid": kbid, "rid": rid, "shard_id": shard_id},
|
|
83
|
+
)
|
|
84
|
+
return
|
|
85
|
+
|
|
86
|
+
# Create the index message and reindex the resource
|
|
87
|
+
index_message = await get_resource_index_message(resource, reindex=True)
|
|
88
|
+
await context.shard_manager.add_resource(
|
|
89
|
+
shard,
|
|
90
|
+
index_message,
|
|
91
|
+
0,
|
|
92
|
+
partition="0",
|
|
93
|
+
kb=kbid,
|
|
94
|
+
reindex_id=uuid.uuid4().hex,
|
|
95
|
+
)
|
|
96
|
+
logger.info(
|
|
97
|
+
"Migration 41: Resource reindexed",
|
|
98
|
+
extra={"kbid": kbid, "rid": rid},
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
async def iter_affected_resource_ids(context: ExecutionContext, kbid: str) -> AsyncIterator[str]:
|
|
103
|
+
start = ""
|
|
104
|
+
while True:
|
|
105
|
+
keys_batch = await get_batch(context, kbid, start)
|
|
106
|
+
if keys_batch is None:
|
|
107
|
+
break
|
|
108
|
+
start = keys_batch[-1]
|
|
109
|
+
for key in keys_batch:
|
|
110
|
+
# The keys have the format /kbs/{kbid}/r/{rid}/f/c/{field_id}
|
|
111
|
+
rid = key.split("/")[4]
|
|
112
|
+
yield rid
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
async def get_batch(context: ExecutionContext, kbid: str, start: str) -> list[str] | None:
|
|
116
|
+
"""
|
|
117
|
+
Get a batch of resource keys that hold conversation fields for the given KB.
|
|
118
|
+
Starting after the given start key.
|
|
119
|
+
Returns None if no more keys are found.
|
|
120
|
+
"""
|
|
121
|
+
batch_size = 100
|
|
122
|
+
async with context.kv_driver.rw_transaction() as txn:
|
|
123
|
+
txn = cast(PGTransaction, txn)
|
|
124
|
+
async with txn.connection.cursor() as cur:
|
|
125
|
+
await cur.execute(
|
|
126
|
+
"""
|
|
127
|
+
SELECT key FROM resources
|
|
128
|
+
WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
|
|
129
|
+
AND key > %s
|
|
130
|
+
ORDER BY key
|
|
131
|
+
LIMIT %s""",
|
|
132
|
+
(kbid, start, batch_size),
|
|
133
|
+
)
|
|
134
|
+
rows = await cur.fetchall()
|
|
135
|
+
if len(rows) == 0:
|
|
136
|
+
return None
|
|
137
|
+
return [row[0] for row in rows]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Create a partial index over resource shard keys.

    CREATE INDEX CONCURRENTLY cannot run inside a transaction block, but psycopg
    opens one implicitly. Commit whatever is pending and temporarily switch the
    connection to autocommit while the index is built, restoring it afterwards.
    """
    await txn.connection.commit()
    try:
        await txn.connection.set_autocommit(True)
        # Name the index and guard with IF NOT EXISTS so re-running the migration
        # does not create a second auto-named duplicate index.
        # NOTE(review): a failed CONCURRENTLY build leaves an INVALID index that
        # IF NOT EXISTS still treats as present; it must be dropped manually
        # before re-running.
        await txn.connection.execute(
            "CREATE INDEX CONCURRENTLY IF NOT EXISTS resources_shard_idx "
            "ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
        )
    finally:
        await txn.connection.set_autocommit(False)
|
|
@@ -18,11 +18,10 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
|
|
21
|
-
from
|
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
|
22
22
|
|
|
23
|
-
from nucliadb.common import datamanagers
|
|
24
23
|
|
|
25
|
-
|
|
26
|
-
async
|
|
27
|
-
|
|
28
|
-
|
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Create extended statistics correlating kbid and labels on the catalog table.

    Gives the Postgres planner better row estimates for catalog queries that
    filter on both columns. IF NOT EXISTS makes the migration idempotent if it
    is ever re-applied.
    """
    async with txn.connection.cursor() as cur:
        await cur.execute(
            "CREATE STATISTICS IF NOT EXISTS catalog_kbid_labels ON kbid, labels FROM catalog;"
        )
        # Analyze right away so the new statistics object is populated immediately.
        await cur.execute("ANALYZE catalog;")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
|
2
|
+
#
|
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
|
5
|
+
#
|
|
6
|
+
# AGPL:
|
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
|
10
|
+
# License, or (at your option) any later version.
|
|
11
|
+
#
|
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
# GNU Affero General Public License for more details.
|
|
16
|
+
#
|
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
+
#
|
|
20
|
+
|
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Undo the catalog statistics migration: drop the (kbid, labels) statistics.

    IF EXISTS keeps the undo idempotent when the statistics object was never
    created or has already been dropped.
    """
    async with txn.connection.cursor() as cur:
        await cur.execute("DROP STATISTICS IF EXISTS catalog_kbid_labels;")
|
nucliadb/backups/create.py
CHANGED
|
@@ -21,8 +21,8 @@ import asyncio
|
|
|
21
21
|
import json
|
|
22
22
|
import logging
|
|
23
23
|
import tarfile
|
|
24
|
+
from collections.abc import AsyncIterator
|
|
24
25
|
from datetime import datetime, timezone
|
|
25
|
-
from typing import AsyncIterator, Optional
|
|
26
26
|
|
|
27
27
|
from nucliadb.backups.const import (
|
|
28
28
|
BackupFinishedStream,
|
|
@@ -37,7 +37,6 @@ from nucliadb.export_import.utils import (
|
|
|
37
37
|
download_binary,
|
|
38
38
|
get_broker_message,
|
|
39
39
|
get_cloud_files,
|
|
40
|
-
get_entities,
|
|
41
40
|
get_labels,
|
|
42
41
|
get_search_configurations,
|
|
43
42
|
get_synonyms,
|
|
@@ -76,7 +75,6 @@ async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
|
|
|
76
75
|
"""
|
|
77
76
|
await backup_resources(context, kbid, backup_id)
|
|
78
77
|
await backup_labels(context, kbid, backup_id)
|
|
79
|
-
await backup_entities(context, kbid, backup_id)
|
|
80
78
|
await backup_synonyms(context, kbid, backup_id)
|
|
81
79
|
await backup_search_configurations(context, kbid, backup_id)
|
|
82
80
|
await notify_backup_completed(context, kbid, backup_id)
|
|
@@ -235,15 +233,6 @@ async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
|
|
|
235
233
|
)
|
|
236
234
|
|
|
237
235
|
|
|
238
|
-
async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str):
|
|
239
|
-
entities = await get_entities(context, kbid)
|
|
240
|
-
await context.blob_storage.upload_object(
|
|
241
|
-
bucket=settings.backups_bucket,
|
|
242
|
-
key=StorageKeys.ENTITIES.format(backup_id=backup_id),
|
|
243
|
-
data=entities.SerializeToString(),
|
|
244
|
-
)
|
|
245
|
-
|
|
246
|
-
|
|
247
236
|
async def backup_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
|
|
248
237
|
synonyms = await get_synonyms(context, kbid)
|
|
249
238
|
await context.blob_storage.upload_object(
|
|
@@ -266,9 +255,7 @@ async def backup_search_configurations(context: ApplicationContext, kbid: str, b
|
|
|
266
255
|
)
|
|
267
256
|
|
|
268
257
|
|
|
269
|
-
async def get_metadata(
|
|
270
|
-
context: ApplicationContext, kbid: str, backup_id: str
|
|
271
|
-
) -> Optional[BackupMetadata]:
|
|
258
|
+
async def get_metadata(context: ApplicationContext, kbid: str, backup_id: str) -> BackupMetadata | None:
|
|
272
259
|
async with context.kv_driver.ro_transaction() as txn:
|
|
273
260
|
metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
|
|
274
261
|
if metadata_raw is None:
|
nucliadb/backups/restore.py
CHANGED
|
@@ -24,7 +24,8 @@ import functools
|
|
|
24
24
|
import json
|
|
25
25
|
import logging
|
|
26
26
|
import tarfile
|
|
27
|
-
from
|
|
27
|
+
from collections.abc import AsyncIterator, Callable
|
|
28
|
+
from typing import Any
|
|
28
29
|
|
|
29
30
|
from pydantic import TypeAdapter
|
|
30
31
|
|
|
@@ -35,7 +36,6 @@ from nucliadb.common.context import ApplicationContext
|
|
|
35
36
|
from nucliadb.export_import.utils import (
|
|
36
37
|
import_binary,
|
|
37
38
|
restore_broker_message,
|
|
38
|
-
set_entities_groups,
|
|
39
39
|
set_labels,
|
|
40
40
|
set_search_configurations,
|
|
41
41
|
set_synonyms,
|
|
@@ -74,7 +74,6 @@ async def restore_kb(context: ApplicationContext, kbid: str, backup_id: str):
|
|
|
74
74
|
"""
|
|
75
75
|
await restore_resources(context, kbid, backup_id)
|
|
76
76
|
await restore_labels(context, kbid, backup_id)
|
|
77
|
-
await restore_entities(context, kbid, backup_id)
|
|
78
77
|
await restore_synonyms(context, kbid, backup_id)
|
|
79
78
|
await restore_search_configurations(context, kbid, backup_id)
|
|
80
79
|
await delete_last_restored(context, kbid, backup_id)
|
|
@@ -101,7 +100,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s
|
|
|
101
100
|
await set_last_restored(context, kbid, backup_id, key)
|
|
102
101
|
|
|
103
102
|
|
|
104
|
-
async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) ->
|
|
103
|
+
async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> str | None:
|
|
105
104
|
key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
|
|
106
105
|
async with context.kv_driver.ro_transaction() as txn:
|
|
107
106
|
raw = await txn.get(key)
|
|
@@ -191,7 +190,7 @@ class ResourceBackupReader:
|
|
|
191
190
|
data = await self.read(tarinfo_size + padding_bytes)
|
|
192
191
|
return data[:tarinfo_size]
|
|
193
192
|
|
|
194
|
-
async def read_item(self) ->
|
|
193
|
+
async def read_item(self) -> BrokerMessage | CloudFile | CloudFileBinary:
|
|
195
194
|
tarinfo = await self.read_tarinfo()
|
|
196
195
|
if tarinfo.name.startswith("broker-message"):
|
|
197
196
|
raw_bm = await self.read_data(tarinfo)
|
|
@@ -257,16 +256,6 @@ async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str)
|
|
|
257
256
|
await set_labels(context, kbid, labels)
|
|
258
257
|
|
|
259
258
|
|
|
260
|
-
async def restore_entities(context: ApplicationContext, kbid: str, backup_id: str):
|
|
261
|
-
raw = await context.blob_storage.downloadbytes(
|
|
262
|
-
bucket=settings.backups_bucket,
|
|
263
|
-
key=StorageKeys.ENTITIES.format(backup_id=backup_id),
|
|
264
|
-
)
|
|
265
|
-
entities = kb_pb2.EntitiesGroups()
|
|
266
|
-
entities.ParseFromString(raw.getvalue())
|
|
267
|
-
await set_entities_groups(context, kbid, entities)
|
|
268
|
-
|
|
269
|
-
|
|
270
259
|
async def restore_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
|
|
271
260
|
raw = await context.blob_storage.downloadbytes(
|
|
272
261
|
bucket=settings.backups_bucket,
|