nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -19,13 +19,10 @@
|
|
19
19
|
|
20
20
|
import logging
|
21
21
|
import os
|
22
|
-
import shutil
|
23
22
|
import uuid
|
24
|
-
from socket import gethostname
|
25
23
|
|
26
24
|
from nucliadb.common.cluster.settings import StandaloneNodeRole
|
27
25
|
from nucliadb.common.cluster.settings import settings as cluster_settings
|
28
|
-
from nucliadb.common.cluster.standalone.index_node import StandaloneIndexNode
|
29
26
|
|
30
27
|
logger = logging.getLogger(__name__)
|
31
28
|
|
@@ -46,46 +43,6 @@ def get_standalone_node_id() -> str:
|
|
46
43
|
return str(uuid.UUID(bytes=f.read()))
|
47
44
|
|
48
45
|
|
49
|
-
_SELF_INDEX_NODE = None
|
50
|
-
|
51
|
-
|
52
|
-
def get_self() -> StandaloneIndexNode:
|
53
|
-
"""
|
54
|
-
This returns an instance of the standalone index node
|
55
|
-
so when API requests come into this mode, we don't
|
56
|
-
make another grpc request since this node can service it directly.
|
57
|
-
"""
|
58
|
-
if not is_index_node():
|
59
|
-
raise Exception("This node is not an Index Node. You should not reach this code path.")
|
60
|
-
global _SELF_INDEX_NODE
|
61
|
-
node_id = get_standalone_node_id()
|
62
|
-
if _SELF_INDEX_NODE is None or node_id != _SELF_INDEX_NODE.id:
|
63
|
-
if "NUCLIADB_SERVICE_HOST" in os.environ:
|
64
|
-
hn = os.environ["HOSTNAME"]
|
65
|
-
ns = os.environ.get("NAMESPACE", "nucliadb")
|
66
|
-
host = f"{hn}.{ns}"
|
67
|
-
else:
|
68
|
-
host = gethostname()
|
69
|
-
_SELF_INDEX_NODE = StandaloneIndexNode(id=node_id, address=host, shard_count=0, available_disk=0)
|
70
|
-
try:
|
71
|
-
_, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
|
72
|
-
_SELF_INDEX_NODE.available_disk = available_disk
|
73
|
-
except FileNotFoundError: # pragma: no cover
|
74
|
-
...
|
75
|
-
try:
|
76
|
-
_shards_dir = os.path.join(cluster_settings.data_path, "shards")
|
77
|
-
_SELF_INDEX_NODE.shard_count = len(
|
78
|
-
[
|
79
|
-
shard_dir
|
80
|
-
for shard_dir in os.listdir(_shards_dir)
|
81
|
-
if os.path.isdir(os.path.join(_shards_dir, shard_dir))
|
82
|
-
]
|
83
|
-
)
|
84
|
-
except FileNotFoundError: # pragma: no cover
|
85
|
-
...
|
86
|
-
return _SELF_INDEX_NODE
|
87
|
-
|
88
|
-
|
89
46
|
def is_index_node() -> bool:
|
90
47
|
return cluster_settings.standalone_node_role in (
|
91
48
|
StandaloneNodeRole.ALL,
|
nucliadb/common/cluster/utils.py
CHANGED
@@ -23,20 +23,11 @@ from typing import TYPE_CHECKING, Optional, Union
|
|
23
23
|
import backoff
|
24
24
|
|
25
25
|
from nucliadb.common import datamanagers
|
26
|
-
from nucliadb.common.cluster.discovery.utils import (
|
27
|
-
setup_cluster_discovery,
|
28
|
-
teardown_cluster_discovery,
|
29
|
-
)
|
30
26
|
from nucliadb.common.cluster.manager import (
|
31
27
|
KBShardManager,
|
32
28
|
StandaloneKBShardManager,
|
33
|
-
clear_index_nodes,
|
34
29
|
)
|
35
30
|
from nucliadb.common.cluster.settings import settings
|
36
|
-
from nucliadb.common.cluster.standalone.service import (
|
37
|
-
start_grpc as start_standalone_grpc,
|
38
|
-
)
|
39
|
-
from nucliadb.common.cluster.standalone.utils import is_index_node
|
40
31
|
from nucliadb.ingest.orm.resource import Resource
|
41
32
|
from nucliadb_protos import nodereader_pb2, writer_pb2
|
42
33
|
from nucliadb_utils import const
|
@@ -62,12 +53,8 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
|
|
62
53
|
# already setup
|
63
54
|
return get_utility(Utility.SHARD_MANAGER)
|
64
55
|
|
65
|
-
await setup_cluster_discovery()
|
66
56
|
mng: Union[KBShardManager, StandaloneKBShardManager]
|
67
57
|
if settings.standalone_mode:
|
68
|
-
if is_index_node():
|
69
|
-
server = await start_standalone_grpc()
|
70
|
-
set_utility(_STANDALONE_SERVER, server)
|
71
58
|
mng = StandaloneKBShardManager()
|
72
59
|
else:
|
73
60
|
mng = KBShardManager()
|
@@ -76,7 +63,6 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
|
|
76
63
|
|
77
64
|
|
78
65
|
async def teardown_cluster():
|
79
|
-
await teardown_cluster_discovery()
|
80
66
|
if get_utility(Utility.SHARD_MANAGER):
|
81
67
|
clean_utility(Utility.SHARD_MANAGER)
|
82
68
|
|
@@ -85,8 +71,6 @@ async def teardown_cluster():
|
|
85
71
|
await std_server.stop(None)
|
86
72
|
clean_utility(_STANDALONE_SERVER)
|
87
73
|
|
88
|
-
clear_index_nodes()
|
89
|
-
|
90
74
|
|
91
75
|
def get_shard_manager() -> KBShardManager:
|
92
76
|
return get_utility(Utility.SHARD_MANAGER) # type: ignore
|
nucliadb/common/counters.py
CHANGED
@@ -23,11 +23,13 @@ from typing import Optional
|
|
23
23
|
from google.protobuf.message import Message
|
24
24
|
|
25
25
|
from nucliadb.common.datamanagers.utils import get_kv_pb
|
26
|
+
from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
|
26
27
|
from nucliadb.common.maindb.driver import Transaction
|
27
28
|
from nucliadb_protos import writer_pb2
|
28
29
|
|
29
30
|
KB_RESOURCE_FIELD = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
|
30
31
|
KB_RESOURCE_FIELD_ERROR = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/error"
|
32
|
+
KB_RESOURCE_FIELD_STATUS = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/status"
|
31
33
|
|
32
34
|
|
33
35
|
async def get_raw(
|
@@ -52,13 +54,7 @@ async def set(
|
|
52
54
|
|
53
55
|
async def delete(txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str):
|
54
56
|
base_key = KB_RESOURCE_FIELD.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
|
55
|
-
|
56
|
-
keys_to_delete = []
|
57
|
-
async for key in txn.keys(base_key):
|
58
|
-
keys_to_delete.append(key)
|
59
|
-
|
60
|
-
for key in keys_to_delete:
|
61
|
-
await txn.delete(key)
|
57
|
+
await txn.delete_by_prefix(base_key)
|
62
58
|
|
63
59
|
|
64
60
|
# Error
|
@@ -82,3 +78,48 @@ async def set_error(
|
|
82
78
|
):
|
83
79
|
key = KB_RESOURCE_FIELD_ERROR.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
|
84
80
|
await txn.set(key, error.SerializeToString())
|
81
|
+
|
82
|
+
|
83
|
+
# Status, replaces error
|
84
|
+
|
85
|
+
|
86
|
+
async def get_status(
|
87
|
+
txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str
|
88
|
+
) -> Optional[writer_pb2.FieldStatus]:
|
89
|
+
key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
|
90
|
+
return await get_kv_pb(txn, key, writer_pb2.FieldStatus)
|
91
|
+
|
92
|
+
|
93
|
+
async def get_statuses(
|
94
|
+
txn: Transaction, *, kbid: str, rid: str, fields: list[writer_pb2.FieldID]
|
95
|
+
) -> list[writer_pb2.FieldStatus]:
|
96
|
+
keys = [
|
97
|
+
KB_RESOURCE_FIELD_STATUS.format(
|
98
|
+
kbid=kbid, uuid=rid, type=FIELD_TYPE_PB_TO_STR[fid.field_type], field=fid.field
|
99
|
+
)
|
100
|
+
for fid in fields
|
101
|
+
]
|
102
|
+
serialized = await txn.batch_get(keys, for_update=False)
|
103
|
+
statuses = []
|
104
|
+
for serialized_status in serialized:
|
105
|
+
pb = writer_pb2.FieldStatus()
|
106
|
+
if serialized_status is not None:
|
107
|
+
pb.ParseFromString(serialized_status)
|
108
|
+
else:
|
109
|
+
pb = writer_pb2.FieldStatus()
|
110
|
+
statuses.append(pb)
|
111
|
+
|
112
|
+
return statuses
|
113
|
+
|
114
|
+
|
115
|
+
async def set_status(
|
116
|
+
txn: Transaction,
|
117
|
+
*,
|
118
|
+
kbid: str,
|
119
|
+
rid: str,
|
120
|
+
field_type: str,
|
121
|
+
field_id: str,
|
122
|
+
status: writer_pb2.FieldStatus,
|
123
|
+
):
|
124
|
+
key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
|
125
|
+
await txn.set(key, status.SerializeToString())
|
@@ -58,6 +58,11 @@ async def iter(
|
|
58
58
|
yield config.vectorset_id, config
|
59
59
|
|
60
60
|
|
61
|
+
async def count(txn: Transaction, *, kbid: str) -> int:
|
62
|
+
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
|
63
|
+
return len(kb_vectorsets.vectorsets)
|
64
|
+
|
65
|
+
|
61
66
|
async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
|
62
67
|
"""Create or update a vectorset configuration"""
|
63
68
|
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
|
@@ -73,16 +78,20 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
|
|
73
78
|
await txn.set(key, kb_vectorsets.SerializeToString())
|
74
79
|
|
75
80
|
|
76
|
-
async def delete(
|
81
|
+
async def delete(
|
82
|
+
txn: Transaction, *, kbid: str, vectorset_id: str
|
83
|
+
) -> Optional[knowledgebox_pb2.VectorSetConfig]:
|
77
84
|
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
|
78
85
|
index = _find_vectorset(kb_vectorsets, vectorset_id)
|
79
86
|
if index is None:
|
80
87
|
# already deleted
|
81
|
-
return
|
88
|
+
return None
|
82
89
|
|
90
|
+
deleted = kb_vectorsets.vectorsets[index]
|
83
91
|
del kb_vectorsets.vectorsets[index]
|
84
92
|
key = KB_VECTORSETS.format(kbid=kbid)
|
85
93
|
await txn.set(key, kb_vectorsets.SerializeToString())
|
94
|
+
return deleted
|
86
95
|
|
87
96
|
|
88
97
|
# XXX At some point in the vectorset epic, we should make this key mandatory and
|
@@ -28,7 +28,7 @@ from nucliadb.common.counters import IndexCounts
|
|
28
28
|
from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
|
29
29
|
from nucliadb.common.ids import ParagraphId
|
30
30
|
from nucliadb_models.external_index_providers import ExternalIndexProviderType
|
31
|
-
from nucliadb_models.search import SCORE_TYPE, TextPosition
|
31
|
+
from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
|
32
32
|
from nucliadb_protos.knowledgebox_pb2 import (
|
33
33
|
CreateExternalIndexProviderMetadata,
|
34
34
|
StoredExternalIndexProviderMetadata,
|
@@ -73,6 +73,7 @@ class TextBlockMatch(BaseModel):
|
|
73
73
|
paragraph_labels: list[str] = []
|
74
74
|
field_labels: list[str] = []
|
75
75
|
text: Optional[str] = None
|
76
|
+
relevant_relations: Optional[Relations] = None
|
76
77
|
|
77
78
|
|
78
79
|
class QueryResults(BaseModel):
|
@@ -441,6 +441,7 @@ class PineconeIndexManager(ExternalIndexManager):
|
|
441
441
|
|
442
442
|
def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
|
443
443
|
prefixes_to_delete = set()
|
444
|
+
# TODO: migrate to vector_prefixes_to_delete
|
444
445
|
for field_id in index_data.sentences_to_delete:
|
445
446
|
try:
|
446
447
|
delete_vid = VectorId.from_string(field_id)
|
@@ -706,11 +707,7 @@ class PineconeIndexManager(ExternalIndexManager):
|
|
706
707
|
if self.kbid in COUNTERS_CACHE:
|
707
708
|
# Cache hit
|
708
709
|
return COUNTERS_CACHE[self.kbid]
|
709
|
-
total = IndexCounts(
|
710
|
-
fields=0,
|
711
|
-
paragraphs=0,
|
712
|
-
sentences=0,
|
713
|
-
)
|
710
|
+
total = IndexCounts(fields=0, paragraphs=0, sentences=0, size_bytes=0)
|
714
711
|
tasks = []
|
715
712
|
vectorset_results: dict[str, IndexCounts] = {}
|
716
713
|
|
@@ -738,6 +735,7 @@ class PineconeIndexManager(ExternalIndexManager):
|
|
738
735
|
fields=0,
|
739
736
|
paragraphs=index_stats.totalVectorCount,
|
740
737
|
sentences=index_stats.totalVectorCount,
|
738
|
+
size_bytes=0,
|
741
739
|
)
|
742
740
|
except Exception:
|
743
741
|
logger.exception(
|
nucliadb/common/ids.py
CHANGED
@@ -111,13 +111,11 @@ class FieldId:
|
|
111
111
|
parts = value.split("/")
|
112
112
|
if len(parts) == 3:
|
113
113
|
rid, _type, key = parts
|
114
|
-
|
115
|
-
raise ValueError(f"Invalid FieldId: {value}")
|
114
|
+
_type = cls.parse_field_type(_type)
|
116
115
|
return cls(rid=rid, type=_type, key=key)
|
117
116
|
elif len(parts) == 4:
|
118
117
|
rid, _type, key, subfield_id = parts
|
119
|
-
|
120
|
-
raise ValueError(f"Invalid FieldId: {value}")
|
118
|
+
_type = cls.parse_field_type(_type)
|
121
119
|
return cls(
|
122
120
|
rid=rid,
|
123
121
|
type=_type,
|
@@ -127,6 +125,22 @@ class FieldId:
|
|
127
125
|
else:
|
128
126
|
raise ValueError(f"Invalid FieldId: {value}")
|
129
127
|
|
128
|
+
@classmethod
|
129
|
+
def parse_field_type(cls, _type: str) -> str:
|
130
|
+
if _type not in FIELD_TYPE_STR_TO_PB:
|
131
|
+
# Try to parse the enum value
|
132
|
+
# XXX: This is to support field types that are integer values of FieldType
|
133
|
+
# Which is how legacy processor relations reported the paragraph_id
|
134
|
+
try:
|
135
|
+
type_pb = FieldType.ValueType(int(_type))
|
136
|
+
except ValueError:
|
137
|
+
raise ValueError(f"Invalid FieldId: {_type}")
|
138
|
+
if type_pb in FIELD_TYPE_PB_TO_STR:
|
139
|
+
return FIELD_TYPE_PB_TO_STR[type_pb]
|
140
|
+
else:
|
141
|
+
raise ValueError(f"Invalid FieldId: {_type}")
|
142
|
+
return _type
|
143
|
+
|
130
144
|
|
131
145
|
@dataclass
|
132
146
|
class ParagraphId:
|