nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nucliadb might be problematic.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -55,7 +55,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:


 # async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
-# async with context.kv_driver.
+# async with context.kv_driver.ro_transaction() as txn:
 # shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
 # if not shards_object:
 # raise ShardsObjectNotFound()
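Most of the one-line changes in the hunks above and below follow the same pattern: the truncated `context.kv_driver.` call is replaced with an explicit read-only or read-write transaction context manager. A minimal usage sketch, assuming the abstract driver class in nucliadb/common/maindb/driver.py is named Driver and that ro_transaction()/rw_transaction() are async context managers (the key names and helper name here are illustrative):

from nucliadb.common.maindb.driver import Driver


async def copy_key(driver: Driver, src_key: str, dst_key: str) -> None:
    # Reads go through the read-only transaction helper...
    async with driver.ro_transaction() as txn:
        value = await txn.get(src_key)

    # ...writes go through the read-write helper and are committed explicitly,
    # as the backup and migration hunks in this diff do.
    if value is not None:
        async with driver.rw_transaction() as txn:
            await txn.set(dst_key, value)
            await txn.commit()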
@@ -44,7 +44,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:

 # No longer relevant with nidx

-# async with context.kv_driver.
+# async with context.kv_driver.rw_transaction() as txn:
 # shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
 # if shards is None:
 # logger.error("KB without shards", extra={"kbid": kbid})

@@ -35,7 +35,7 @@ logger = logging.getLogger(__name__)


 async def migrate(context: ExecutionContext) -> None:
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         async for key in txn.keys(KB_SLUGS_BASE):
             slug = key.replace(KB_SLUGS_BASE, "")
             value = await txn.get(key, for_update=False)

@@ -55,7 +55,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:


 # async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
-# async with context.kv_driver.
+# async with context.kv_driver.ro_transaction() as txn:
 # shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
 # if not shards_object:
 # raise ShardsObjectNotFound()

@@ -38,7 +38,7 @@ async def migrate(context: ExecutionContext) -> None: ...


 async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         logger.info(f"Overwriting vectorsets key", extra={"kbid": kbid})
         await datamanagers.vectorsets.initialize(txn, kbid=kbid)
         await txn.commit()
@@ -28,9 +28,10 @@ import logging
 from typing import cast

 from nucliadb.common import datamanagers
+from nucliadb.common.catalog import catalog_update, get_catalog
+from nucliadb.common.catalog.pg import PGCatalog
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.ingest.orm.index_message import get_resource_index_message
-from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
 from nucliadb.migrator.context import ExecutionContext

 logger = logging.getLogger(__name__)

@@ -43,8 +44,11 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     if not isinstance(context.kv_driver, PGDriver):
         return

+    if not isinstance(get_catalog(), PGCatalog):
+        return
+
     BATCH_SIZE = 100
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         txn = cast(PGTransaction, txn)
         continue_sql = ""
         while True:

@@ -75,7 +79,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
                 continue

             index_message = await get_resource_index_message(resource, reindex=False)
-            await
+            await catalog_update(txn, kbid, resource, index_message)

             await txn.commit()
             continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
@@ -47,7 +47,7 @@ async def migrate(context: ExecutionContext) -> None: ...


 async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
-    async with context.kv_driver.
+    async with context.kv_driver.ro_transaction() as txn:
         vectorsets_count = len([vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)])
         if vectorsets_count > 0:
             logger.info("Skipping KB with vectorsets already populated", extra={"kbid": kbid})

@@ -65,7 +65,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     learning_matryoshka_dimensions = learning_model_metadata.matryoshka_dimensions
     learning_normalize_vectors = len(learning_matryoshka_dimensions) > 0

-    async with context.kv_driver.
+    async with context.kv_driver.ro_transaction() as txn:
         semantic_model = await datamanagers.kb.get_model_metadata(txn, kbid=kbid)

     maindb_similarity = semantic_model.similarity_function

@@ -103,7 +103,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
         matryoshka_dimensions=maindb_matryoshka_dimensions,
     )

-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         # Populate KB vectorsets with data from learning. We are skipping KBs
         # with this key already set, so we can set here safely
         await datamanagers.vectorsets.set(txn, kbid=kbid, config=default_vectorset)

@@ -49,7 +49,7 @@ async def maybe_fix_vector_dimensions(context: ExecutionContext, kbid: str) -> N
         logger.warning(f"KB has no learning config", extra={"kbid": kbid})
         return

-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         vectorsets = [vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)]
         if len(vectorsets) != 1:
             # If multiple vectorsets, they are new shards created correctly, we can safely skip it

@@ -39,7 +39,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     async with datamanagers.with_rw_transaction() as txn:
         vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]

-        if len(vectorsets) == 0:  # pragma:
+        if len(vectorsets) == 0:  # pragma: no cover
             # should never happen, everyone should have at least one
             logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
             return
@@ -45,7 +45,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...

 async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
     logger.info(f"Running batch from {start}")
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         async with txn.connection.cursor() as cur:  # type: ignore
             # Retrieve a batch of fields
             await cur.execute(

@@ -47,7 +47,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:

 async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
     logger.info(f"Running batch from {start}")
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         async with txn.connection.cursor() as cur:  # type: ignore
             # Retrieve a batch of fields
             await cur.execute(

@@ -37,7 +37,7 @@ async def migrate(context: ExecutionContext) -> None:
     driver = cast(PGDriver, context.kv_driver)

     BATCH_SIZE = 10_000
-    async with driver.
+    async with driver.rw_transaction() as txn:
         txn = cast(PGTransaction, txn)
         start_key = ""
         while True:

@@ -37,7 +37,7 @@ async def migrate(context: ExecutionContext) -> None:
     driver = cast(PGDriver, context.kv_driver)

     BATCH_SIZE = 1_000
-    async with driver.
+    async with driver.rw_transaction() as txn:
         txn = cast(PGTransaction, txn)
         start_kbid = "00000000000000000000000000000000"
         start_rid = "00000000000000000000000000000000"
@@ -28,9 +28,10 @@ import logging
 from typing import cast

 from nucliadb.common import datamanagers
+from nucliadb.common.catalog import catalog_update, get_catalog
+from nucliadb.common.catalog.pg import PGCatalog
 from nucliadb.common.maindb.pg import PGDriver, PGTransaction
 from nucliadb.ingest.orm.index_message import get_resource_index_message
-from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
 from nucliadb.migrator.context import ExecutionContext
 from nucliadb_protos import resources_pb2


@@ -44,8 +45,11 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
     if not isinstance(context.kv_driver, PGDriver):
         return

+    if not isinstance(get_catalog(), PGCatalog):
+        return
+
     BATCH_SIZE = 100
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         txn = cast(PGTransaction, txn)
         start = ""
         while True:

@@ -84,7 +88,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
                 continue

             index_message = await get_resource_index_message(resource, reindex=False)
-            await
+            await catalog_update(txn, kbid, resource, index_message)

             if to_index:
                 await txn.commit()
migrations/0039_backfill_converation_splits_metadata.py
ADDED
@@ -0,0 +1,106 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+"""Migration #39
+
+Backfill splits metadata on conversation fields
+
+"""
+
+import logging
+from typing import cast
+
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.common.maindb.pg import PGTransaction
+from nucliadb.ingest.fields.conversation import (
+    CONVERSATION_SPLITS_METADATA,
+    Conversation,
+)
+from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_protos import resources_pb2
+from nucliadb_protos.resources_pb2 import SplitMetadata, SplitsMetadata
+from nucliadb_utils.storages.storage import Storage
+
+logger = logging.getLogger(__name__)
+
+
+async def migrate(context: ExecutionContext) -> None: ...
+
+
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    BATCH_SIZE = 100
+    start = ""
+    while True:
+        to_fix: list[tuple[str, str]] = []
+        async with context.kv_driver.rw_transaction() as txn:
+            txn = cast(PGTransaction, txn)
+            async with txn.connection.cursor() as cur:
+                # Retrieve a bunch of conversation fields
+                await cur.execute(
+                    """
+                    SELECT key FROM resources
+                    WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
+                    AND key > %s
+                    ORDER BY key
+                    LIMIT %s""",
+                    (kbid, start, BATCH_SIZE),
+                )
+                rows = await cur.fetchall()
+                if len(rows) == 0:
+                    return
+                for row in rows:
+                    key = row[0]
+                    start = key
+                    rid = key.split("/")[4]
+                    field_id = key.split("/")[7]
+                    to_fix.append((rid, field_id))
+
+        for rid, field_id in to_fix:
+            async with context.kv_driver.rw_transaction() as txn2:
+                splits_metadata = await build_splits_metadata(
+                    txn2, context.blob_storage, kbid, rid, field_id
+                )
+                splits_metadata_key = CONVERSATION_SPLITS_METADATA.format(
+                    kbid=kbid, uuid=rid, type="c", field=field_id
+                )
+                await txn2.set(splits_metadata_key, splits_metadata.SerializeToString())
+                await txn2.commit()
+
+
+async def build_splits_metadata(
+    txn: Transaction, storage: Storage, kbid: str, rid: str, field_id: str
+) -> SplitsMetadata:
+    splits_metadata = SplitsMetadata()
+    kb_orm = KnowledgeBoxORM(txn, storage, kbid)
+    resource_obj = await kb_orm.get(rid)
+    if resource_obj is None:
+        return splits_metadata
+    field_obj: Conversation = await resource_obj.get_field(
+        field_id, resources_pb2.FieldType.CONVERSATION, load=False
+    )
+    conv_metadata = await field_obj.get_metadata()
+    for i in range(1, conv_metadata.pages + 1):
+        page = await field_obj.get_value(page=i)
+        if page is None:
+            continue
+        for message in page.messages:
+            splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
+    return splits_metadata
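Reading the backfilled value back is the inverse of the write in the loop above; a minimal sketch using the same key template and the transaction API shown elsewhere in this diff (the helper name read_splits_metadata is illustrative, not part of the package):

from nucliadb.common.maindb.driver import Transaction
from nucliadb.ingest.fields.conversation import CONVERSATION_SPLITS_METADATA
from nucliadb_protos.resources_pb2 import SplitsMetadata


async def read_splits_metadata(txn: Transaction, kbid: str, rid: str, field_id: str) -> SplitsMetadata:
    # Same key template the migration writes to.
    key = CONVERSATION_SPLITS_METADATA.format(kbid=kbid, uuid=rid, type="c", field=field_id)
    splits_metadata = SplitsMetadata()
    raw = await txn.get(key)
    if raw is not None:
        splits_metadata.ParseFromString(raw)
    return splits_metadata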
migrations/0040_migrate_search_configurations.py
ADDED
@@ -0,0 +1,79 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+"""Migration #40
+
+Replaces deprecated and removed generative models from search configurations
+
+"""
+
+import logging
+from typing import cast
+
+from nucliadb.common import datamanagers
+from nucliadb.migrator.context import ExecutionContext
+from nucliadb_models.configuration import SearchConfiguration
+from nucliadb_models.search import AskRequest, FindRequest
+
+logger = logging.getLogger(__name__)
+
+REPLACEMENTS = {
+    "claude-3-5-small": "claude-4-5-sonnet",
+    "gcp-claude-3-5-sonnet-v2": "gcp-claude-4-5-sonnet",
+}
+
+
+async def migrate(context: ExecutionContext) -> None: ...
+
+
+async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
+    affected = await get_affected_search_configurations(kbid)
+    if not affected:
+        return
+
+    async with datamanagers.with_rw_transaction() as txn:
+        for name, config in affected.items():
+            logger.info(
+                "Migrating search config for kb",
+                extra={
+                    "kbid": kbid,
+                    "search_config": name,
+                    "generative_model": config.config.generative_model,  # type: ignore
+                },
+            )
+            config.config.generative_model = REPLACEMENTS[config.config.generative_model]  # type: ignore
+            await datamanagers.search_configurations.set(txn, kbid=kbid, name=name, config=config)
+        await txn.commit()
+
+
+async def get_affected_search_configurations(kbid: str) -> dict[str, SearchConfiguration]:
+    result: dict[str, SearchConfiguration] = {}
+    async with datamanagers.with_ro_transaction() as txn:
+        search_configs = await datamanagers.search_configurations.list(txn, kbid=kbid)
+        for name, config in search_configs.items():
+            if config.kind == "find":
+                find_config = cast(FindRequest, config.config)
+                if find_config.generative_model in REPLACEMENTS:
+                    result[name] = config
+            elif config.kind == "ask":
+                ask_config = cast(AskRequest, config.config)
+                if ask_config.generative_model in REPLACEMENTS:
+                    result[name] = config
+    return result
migrations/pg/0010_shards_index.py
ADDED
@@ -0,0 +1,34 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+from nucliadb.common.maindb.pg import PGTransaction
+
+
+async def migrate(txn: PGTransaction) -> None:
+    # Concurrent index must be created outside of a transaction but psycopg automatically
+    # creates transactions. We temporarily disable this for building indexes.
+    await txn.connection.commit()
+    try:
+        await txn.connection.set_autocommit(True)
+        await txn.connection.execute(
+            "CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
+        )
+    finally:
+        await txn.connection.set_autocommit(False)
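The commit/autocommit dance in this migration is required because CREATE INDEX CONCURRENTLY cannot run inside a transaction block, while psycopg wraps statements in one by default. A standalone sketch of the same pattern with plain psycopg, outside the migration framework (the DSN and the index name resources_shard_idx are placeholders):

import psycopg


async def create_shard_index(dsn: str) -> None:
    async with await psycopg.AsyncConnection.connect(dsn) as conn:
        # Switch the connection to autocommit so the concurrent index build
        # is not executed inside an implicit transaction block.
        await conn.set_autocommit(True)
        try:
            await conn.execute(
                "CREATE INDEX CONCURRENTLY IF NOT EXISTS resources_shard_idx "
                "ON resources (key, value) "
                "WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$'"
            )
        finally:
            await conn.set_autocommit(False)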
nucliadb/backups/create.py
CHANGED
@@ -269,7 +269,7 @@ async def backup_search_configurations(context: ApplicationContext, kbid: str, b
 async def get_metadata(
     context: ApplicationContext, kbid: str, backup_id: str
 ) -> Optional[BackupMetadata]:
-    async with context.kv_driver.
+    async with context.kv_driver.ro_transaction() as txn:
         metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
         if metadata_raw is None:
             return None

@@ -277,7 +277,7 @@ async def get_metadata(


 async def set_metadata(context: ApplicationContext, kbid: str, backup_id: str, metadata: BackupMetadata):
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         await txn.set(
             MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id),
             metadata.model_dump_json().encode(),

@@ -286,7 +286,7 @@ async def set_metadata(context: ApplicationContext, kbid: str, backup_id: str, m


 async def delete_metadata(context: ApplicationContext, kbid: str, backup_id: str):
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         await txn.delete(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
         await txn.commit()

nucliadb/backups/restore.py
CHANGED
@@ -103,7 +103,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s

 async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> Optional[str]:
     key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
-    async with context.kv_driver.
+    async with context.kv_driver.ro_transaction() as txn:
         raw = await txn.get(key)
         if raw is None:
             return None

@@ -112,14 +112,14 @@ async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: s

 async def set_last_restored(context: ApplicationContext, kbid: str, backup_id: str, resource_id: str):
     key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         await txn.set(key, resource_id.encode())
         await txn.commit()


 async def delete_last_restored(context: ApplicationContext, kbid: str, backup_id: str):
     key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
-    async with context.kv_driver.
+    async with context.kv_driver.rw_transaction() as txn:
         await txn.delete(key)
         await txn.commit()

nucliadb/common/cache.py
CHANGED
@@ -90,7 +90,7 @@ class ResourceCache(Cache[[str, str], ResourceORM]):
         @alru_cache(maxsize=cache_size)
         async def _get_resource(kbid: str, rid: str) -> Optional[ResourceORM]:
             storage = await get_storage()
-            async with get_driver().
+            async with get_driver().ro_transaction() as txn:
                 kb = KnowledgeBoxORM(txn, storage, kbid)
                 return await kb.get(rid)

nucliadb/common/catalog/__init__.py
ADDED
@@ -0,0 +1,79 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from nidx_protos.noderesources_pb2 import Resource as IndexMessage
+
+from nucliadb.common.catalog.dummy import DummyCatalog
+from nucliadb.common.catalog.interface import Catalog, CatalogQuery
+from nucliadb.common.catalog.pg import PGCatalog
+from nucliadb.common.catalog.utils import build_catalog_resource_data
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.ingest.orm.resource import Resource
+from nucliadb.ingest.settings import CatalogConfig, settings
+from nucliadb_models.search import CatalogFacetsRequest, Resources
+from nucliadb_utils.exceptions import ConfigurationError
+
+
+def get_catalog() -> Catalog:
+    if settings.catalog == CatalogConfig.UNSET:
+        return DummyCatalog()
+    elif settings.catalog == CatalogConfig.PG:
+        return PGCatalog()
+    else:
+        raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
+
+
+async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
+    catalog = get_catalog()
+    resource_data = build_catalog_resource_data(resource, index_message)
+    await catalog.update(txn, kbid, resource.uuid, resource_data)
+
+
+async def catalog_delete(txn: Transaction, kbid: str, rid: str):
+    catalog = get_catalog()
+    await catalog.delete(txn, kbid, rid)
+
+
+async def catalog_search(query: CatalogQuery) -> Resources:
+    catalog = get_catalog()
+    return await catalog.search(query)
+
+
+async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+    catalog = get_catalog()
+    return await catalog.facets(kbid, request)
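The 0023 and 0038 backfill migrations earlier in this diff already show the intended call pattern for these helpers; a condensed caller sketch with a hypothetical wrapper name (update_catalog_if_pg) and type hints taken from the signatures above:

from nidx_protos.noderesources_pb2 import Resource as IndexMessage

from nucliadb.common.catalog import catalog_update, get_catalog
from nucliadb.common.catalog.pg import PGCatalog
from nucliadb.common.maindb.driver import Transaction
from nucliadb.ingest.orm.resource import Resource


async def update_catalog_if_pg(
    txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage
) -> None:
    # Writes through the DummyCatalog backend are no-ops, so callers can either
    # call catalog_update unconditionally or skip the work entirely when the
    # configured backend is not PG, as the backfill migrations do.
    if not isinstance(get_catalog(), PGCatalog):
        return
    await catalog_update(txn, kbid, resource, index_message)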
nucliadb/common/catalog/dummy.py
ADDED
@@ -0,0 +1,36 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb_models.search import CatalogFacetsRequest, Resources
+
+
+class DummyCatalog(Catalog):
+    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
+        return
+
+    async def delete(self, txn: Transaction, kbid: str, rid: str):
+        return
+
+    async def search(self, query: CatalogQuery) -> Resources:
+        return Resources(results=[], min_score=0.0)
+
+    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
+        return {}
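A custom backend would subclass Catalog and implement the same four methods that DummyCatalog stubs out above. A minimal skeleton under that assumption (InMemoryCatalog is hypothetical and not part of the package):

from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
from nucliadb.common.maindb.driver import Transaction
from nucliadb_models.search import CatalogFacetsRequest, Resources


class InMemoryCatalog(Catalog):
    """Hypothetical in-process catalog backend, e.g. for tests."""

    def __init__(self) -> None:
        self._data: dict[str, dict[str, CatalogResourceData]] = {}

    async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
        self._data.setdefault(kbid, {})[rid] = data

    async def delete(self, txn: Transaction, kbid: str, rid: str):
        self._data.get(kbid, {}).pop(rid, None)

    async def search(self, query: CatalogQuery) -> Resources:
        # A real backend would filter, sort and paginate; this stub returns nothing.
        return Resources(results=[], min_score=0.0)

    async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
        return {}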