nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0

nucliadb/common/nidx.py
CHANGED

@@ -21,7 +21,7 @@
 import os
 from typing import Optional
 
-from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxSearcherStub
+from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
 
 from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.settings import settings
@@ -37,12 +37,10 @@ from nucliadb_utils.settings import FileBackendConfig, indexing_settings, storag
 from nucliadb_utils.storages.settings import settings as extended_storage_settings
 from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
 
-NIDX_ENABLED = bool(os.environ.get("NIDX_ENABLED"))
-
 
 class NidxUtility:
-    api_client
-    searcher_client
+    api_client: NidxApiStub
+    searcher_client: NidxSearcherStub
 
     async def initialize(self):
         raise NotImplementedError()
@@ -98,6 +96,9 @@ class NidxBindingUtility(NidxUtility):
 
         self.config = {
             "METADATA__DATABASE_URL": ingest_settings.driver_pg_url,
+            "SEARCHER__METADATA_REFRESH_INTERVAL": str(
+                indexing_settings.index_searcher_refresh_interval
+            ),
             **_storage_config("INDEXER", None),
             **_storage_config("STORAGE", "nidx"),
         }
@@ -123,16 +124,8 @@ class NidxBindingUtility(NidxUtility):
         self.binding.wait_for_sync()
 
 
-class NidxServiceUtility(NidxUtility):
-    """Implements Nidx utility connecting to the network service"""
-
+class NidxNatsIndexer:
     def __init__(self):
-        if indexing_settings.index_nidx_subject is None:
-            raise ValueError("INDEX_NIDX_SUBJECT needed for nidx utility")
-
-        if not settings.nidx_api_address or not settings.nidx_searcher_address:
-            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")
-
         self.nats_connection_manager = NatsConnectionManager(
             service_name="NidxIndexer",
             nats_servers=indexing_settings.index_jetstream_servers,
@@ -142,10 +135,6 @@ class NidxServiceUtility(NidxUtility):
 
     async def initialize(self):
         await self.nats_connection_manager.initialize()
-        self.api_client = NidxApiStub(get_traced_grpc_channel(settings.nidx_api_address, "nidx_api"))
-        self.searcher_client = NidxSearcherStub(
-            get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
-        )
 
     async def finalize(self):
         await self.nats_connection_manager.finalize()
@@ -158,18 +147,68 @@ class NidxServiceUtility(NidxUtility):
         return res.seq
 
 
-
-
-
+class NidxGrpcIndexer:
+    def __init__(self, address):
+        self.address = address
 
-
+    async def initialize(self):
+        self.client = NidxIndexerStub(get_traced_grpc_channel(self.address, "nidx_indexer"))
+
+    async def finalize(self):
+        pass
+
+    async def index(self, writer: IndexMessage) -> int:
+        await self.client.Index(writer)
+        return 0
+
+
+class NidxServiceUtility(NidxUtility):
+    """Implements Nidx utility connecting to the network service"""
+
+    def __init__(self):
+        if not settings.nidx_api_address or not settings.nidx_searcher_address:
+            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")
+
+        if indexing_settings.index_nidx_subject:
+            self.indexer = NidxNatsIndexer()
+        elif settings.nidx_indexer_address is not None:
+            self.indexer = NidxGrpcIndexer(settings.nidx_indexer_address)
+        else:
+            raise ValueError("NIDX_INDEXER_ADDRESS or INDEX_NIDX_SUBJECT are required")
+
+    async def initialize(self):
+        await self.indexer.initialize()
+        self.api_client = NidxApiStub(get_traced_grpc_channel(settings.nidx_api_address, "nidx_api"))
+        self.searcher_client = NidxSearcherStub(
+            get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
+        )
+
+    async def finalize(self):
+        await self.indexer.finalize()
+
+    async def index(self, writer: IndexMessage) -> int:
+        return await self.indexer.index(writer)
+
+
+async def start_nidx_utility() -> Optional[NidxUtility]:
+    nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx
 
     nidx_utility: NidxUtility
     if settings.standalone_mode:
-
+        if (
+            settings.nidx_api_address is not None
+            and settings.nidx_searcher_address is not None
+            and settings.nidx_indexer_address is not None
+        ):
+            # Standalone with nidx service (via grpc). This is used in clustered standalone mode
+            nidx_utility = NidxServiceUtility()
+        else:
+            # Normal standalone mode with binding
+            nidx_utility = NidxBindingUtility()
     else:
+        # Component deploy with nidx service via grpc & nats (cloud)
         nidx_utility = NidxServiceUtility()
 
     await nidx_utility.initialize()
@@ -178,30 +217,33 @@ async def start_nidx_utility() -> Optional[NidxUtility]:
 
 
 async def stop_nidx_utility():
-    nidx_utility =
+    nidx_utility = get_utility(Utility.NIDX)
     if nidx_utility:
         clean_utility(Utility.NIDX)
         await nidx_utility.finalize()
 
 
-def get_nidx() ->
-
+def get_nidx() -> NidxUtility:
+    nidx = get_utility(Utility.NIDX)
+    if nidx is None:
+        raise Exception("nidx not initialized")
+    return nidx
 
 
-def get_nidx_api_client() ->
+def get_nidx_api_client() -> "NidxApiStub":
     nidx = get_nidx()
-    if nidx:
+    if nidx.api_client:
         return nidx.api_client
     else:
-
+        raise Exception("nidx not initialized")
 
 
-def get_nidx_searcher_client() ->
+def get_nidx_searcher_client() -> "NidxSearcherStub":
     nidx = get_nidx()
-    if nidx:
+    if nidx.searcher_client:
         return nidx.searcher_client
     else:
-
+        raise Exception("nidx not initialized")
 
 
 # TODO: Remove the index node abstraction
@@ -252,9 +294,6 @@ class FakeNode(AbstractIndexNode):
         return "nidx"
 
 
-def get_nidx_fake_node() ->
+def get_nidx_fake_node() -> FakeNode:
     nidx = get_nidx()
-
-        return FakeNode(nidx.api_client, nidx.searcher_client)
-    else:
-        return None
+    return FakeNode(nidx.api_client, nidx.searcher_client)
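
The net effect in nucliadb/common/nidx.py is that NidxServiceUtility now delegates indexing to a pluggable backend (NATS or direct gRPC). A condensed, illustrative sketch of that selection, as shown above in NidxServiceUtility.__init__; the pick_indexer helper itself is hypothetical and exists only to make the decision explicit:

    from nucliadb.common.cluster.settings import settings
    from nucliadb.common.nidx import NidxGrpcIndexer, NidxNatsIndexer
    from nucliadb_utils.settings import indexing_settings


    def pick_indexer():
        # Hypothetical helper condensing the branch added in NidxServiceUtility.__init__
        if indexing_settings.index_nidx_subject:
            # component/cloud deploys: index messages go through NATS JetStream
            return NidxNatsIndexer()
        elif settings.nidx_indexer_address is not None:
            # clustered standalone deploys: index messages go straight to the nidx gRPC indexer
            return NidxGrpcIndexer(settings.nidx_indexer_address)
        else:
            raise ValueError("NIDX_INDEXER_ADDRESS or INDEX_NIDX_SUBJECT are required")

start_nidx_utility applies the same idea one level up: standalone processes with all three nidx addresses configured use the gRPC service, plain standalone uses the in-process binding, and everything else uses the service utility.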

nucliadb/export_import/models.py
CHANGED

@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-
+import datetime
 from enum import Enum
 from typing import Any
 
@@ -57,8 +57,8 @@ class Metadata(BaseModel):
     task: TaskMetadata = TaskMetadata(status=Status.SCHEDULED)
     total: int = 0
    processed: int = 0
-    created: datetime = datetime.
-    modified: datetime = datetime.
+    created: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
+    modified: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
 
 
 class ExportMetadata(Metadata):
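
The file now imports the datetime module rather than the datetime class, which is why both the annotations and the defaults are spelled datetime.datetime. A minimal, illustrative equivalent of the new defaults, assuming the model is a plain pydantic BaseModel as the hunk context suggests:

    import datetime

    from pydantic import BaseModel


    class Metadata(BaseModel):
        # timezone-aware (UTC) defaults, written through the module-level import
        created: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
        modified: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)

As with any plain (non-factory) pydantic default, the timestamp is evaluated once when the class body runs, not per instance.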

nucliadb/health.py
CHANGED

@@ -40,13 +40,6 @@ def nats_manager_healthy() -> bool:
     return nats_manager.healthy()
 
 
-def nodes_health_check() -> bool:
-    from nucliadb.common.cluster import manager
-    from nucliadb.ingest.settings import DriverConfig, settings
-
-    return len(manager.INDEX_NODES) > 0 or settings.driver == DriverConfig.LOCAL
-
-
 def pubsub_check() -> bool:
     driver: Optional[PubSubDriver] = get_utility(Utility.PUBSUB)
     if driver is None:

nucliadb/ingest/app.py
CHANGED

@@ -22,10 +22,6 @@ import importlib.metadata
 from typing import Awaitable, Callable
 
 from nucliadb import health
-from nucliadb.common.cluster.discovery.utils import (
-    setup_cluster_discovery,
-    teardown_cluster_discovery,
-)
 from nucliadb.common.cluster.settings import settings as cluster_settings
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.context import ApplicationContext
@@ -89,13 +85,9 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
     )
     finalizers.append(stop_nats_manager)
 
-    await setup_cluster_discovery()
-    finalizers.append(teardown_cluster_discovery)
-
     health.register_health_checks(
         [
             health.nats_manager_healthy,
-            health.nodes_health_check,
             health.pubsub_check,
         ]
     )

nucliadb/ingest/consumer/auditing.py
CHANGED

@@ -113,7 +113,7 @@ class IndexAuditHandler:
 
         for shard_obj in shard_groups:
             # TODO: Uses node for auditing, don't want to suddenly change metrics
-            node, shard_id = choose_node(shard_obj
+            node, shard_id = choose_node(shard_obj)
             shard: nodereader_pb2.Shard = await node.reader.GetShard(
                 nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
             )

nucliadb/ingest/consumer/shard_creator.py
CHANGED

@@ -103,7 +103,7 @@ class ShardCreatorHandler:
         async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
             # remember, a lock will do at least 1+ reads and 1 write.
             # with heavy writes, this adds some simple k/v pressure
-            node, shard_id = choose_node(current_shard
+            node, shard_id = choose_node(current_shard)
             shard: nodereader_pb2.Shard = await node.reader.GetShard(
                 nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
             )

nucliadb/ingest/fields/base.py
CHANGED

@@ -21,12 +21,13 @@ from __future__ import annotations
 
 import enum
 from datetime import datetime
-from typing import Any, Generic, Optional, Type, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
 
 from google.protobuf.message import DecodeError, Message
 
 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
     CloudFile,
     ExtractedTextWrapper,
@@ -41,12 +42,19 @@ from nucliadb_protos.resources_pb2 import (
     QuestionAnswers,
 )
 from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
-from nucliadb_protos.writer_pb2 import Error
+from nucliadb_protos.writer_pb2 import Error, FieldStatus
+from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
 from nucliadb_utils.storages.storage import Storage, StorageField
 
+if TYPE_CHECKING:  # pragma: no cover
+    from nucliadb.ingest.orm.resource import Resource
+
+
 SUBFIELDFIELDS = ("c",)
 
 
+# NOTE extracted vectors key is no longer a static key, it is stored in each
+# vectorset
 class FieldTypes(str, enum.Enum):
     FIELD_TEXT = "extracted_text"
     FIELD_VECTORS = "extracted_vectors"
@@ -73,7 +81,7 @@ class Field(Generic[PbType]):
     def __init__(
         self,
         id: str,
-        resource:
+        resource: Resource,
         pb: Optional[Any] = None,
         value: Optional[Any] = None,
     ):
@@ -88,7 +96,7 @@ class Field(Generic[PbType]):
         self.question_answers = None
 
         self.id: str = id
-        self.resource
+        self.resource = resource
 
         if value is not None:
             newpb = self.pbklass()
@@ -119,11 +127,20 @@ class Field(Generic[PbType]):
     def get_storage_field(self, field_type: FieldTypes) -> StorageField:
         return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)
 
-    def _get_extracted_vectors_storage_field(
-
+    def _get_extracted_vectors_storage_field(
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> StorageField:
+        if storage_key_kind == VectorSetConfig.StorageKeyKind.LEGACY:
+            key = FieldTypes.FIELD_VECTORS.value
+        elif storage_key_kind == VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX:
             key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
         else:
-
+            raise ValueError(
+                f"Can't do anything with UNSET or unknown vectorset storage key kind: {storage_key_kind}"
+            )
+
         return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
 
     async def db_get_value(self) -> Optional[PbType]:
@@ -163,7 +180,8 @@ class Field(Generic[PbType]):
             field_id=self.id,
         )
         await self.delete_extracted_text()
-
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(self.resource.txn, kbid=self.kbid):
+            await self.delete_vectors(vectorset_id, vs.storage_key_kind)
         await self.delete_metadata()
         await self.delete_question_answers()
 
@@ -181,9 +199,13 @@ class Field(Generic[PbType]):
         except KeyError:
             pass
 
-    async def delete_vectors(
+    async def delete_vectors(
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> None:
         # Try delete vectors
-        sf = self._get_extracted_vectors_storage_field(vectorset)
+        sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
         try:
             await self.storage.delete_upload(sf.key, sf.bucket)
         except KeyError:
@@ -215,6 +237,25 @@ class Field(Generic[PbType]):
             error=error,
         )
 
+    async def get_status(self) -> Optional[FieldStatus]:
+        return await datamanagers.fields.get_status(
+            self.resource.txn,
+            kbid=self.kbid,
+            rid=self.uuid,
+            field_type=self.type,
+            field_id=self.id,
+        )
+
+    async def set_status(self, status: FieldStatus) -> None:
+        await datamanagers.fields.set_status(
+            self.resource.txn,
+            kbid=self.kbid,
+            rid=self.uuid,
+            field_type=self.type,
+            field_id=self.id,
+            status=status,
+        )
+
     async def get_question_answers(self, force=False) -> Optional[FieldQuestionAnswers]:
         if self.question_answers is None or force:
             sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
@@ -309,12 +350,17 @@ class Field(Generic[PbType]):
             self.extracted_text = payload
         return self.extracted_text
 
-    async def set_vectors(
-
+    async def set_vectors(
+        self,
+        payload: ExtractedVectorsWrapper,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> Optional[VectorObject]:
         if self.type in SUBFIELDFIELDS:
             try:
                 actual_payload: Optional[VectorObject] = await self.get_vectors(
                     vectorset=vectorset,
+                    storage_key_kind=storage_key_kind,
                     force=True,
                 )
             except KeyError:
@@ -322,12 +368,29 @@ class Field(Generic[PbType]):
         else:
             actual_payload = None
 
-        sf = self._get_extracted_vectors_storage_field(vectorset)
+        sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
         vo: Optional[VectorObject] = None
         if actual_payload is None:
-            # Its first extracted
+            # Its first extracted vectors
             if payload.HasField("file"):
-
+                # When we receive vectors in a cloud file, it points to our
+                # storage but paths are different, we may want to move it. This
+                # can happen, for example, with LEGACY KBs where processing
+                # sends us the extracted vectors prefixed by vectorset but, to
+                # maintain bw/c, we move those to the original not prefixed
+                # path.
+                try:
+                    await self.storage.normalize_binary(payload.file, sf)
+                except CouldNotCopyNotFound:
+                    # A failure here could mean the payload has already been
+                    # moved and we're retrying due to a redelivery or another
+                    # retry mechanism
+                    already_moved = await sf.exists()
+                    if already_moved:
+                        # We assume is the correct one and do nothing else
+                        pass
+                    else:
+                        raise
                 vo = await self.storage.download_pb(sf, VectorObject)
             else:
                 await self.storage.upload_pb(sf, payload.vectors)
@@ -354,14 +417,13 @@ class Field(Generic[PbType]):
         return vo
 
     async def get_vectors(
-        self,
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+        force: bool = False,
     ) -> Optional[VectorObject]:
-        # compat with vectorsets coming from protobuffers where no value is
-        # empty string instead of None. This shouldn't be handled here but we
-        # have to make sure it gets the correct vectorset
-        vectorset = vectorset or None
         if self.extracted_vectors.get(vectorset, None) is None or force:
-            sf = self._get_extracted_vectors_storage_field(vectorset)
+            sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
            payload = await self.storage.download_pb(sf, VectorObject)
             if payload is not None:
                 self.extracted_vectors[vectorset] = payload
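
To summarize the new behaviour of Field._get_extracted_vectors_storage_field: each vectorset's StorageKeyKind now decides whether extracted vectors live under the legacy un-prefixed key or under a per-vectorset key. An illustrative, standalone sketch of that mapping; the legacy key string comes from FieldTypes.FIELD_VECTORS in the diff, while the prefixed template and the helper name are assumptions for illustration only:

    from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig


    def extracted_vectors_key(vectorset: str, kind: int) -> str:
        # Mirrors the branch added in _get_extracted_vectors_storage_field.
        if kind == VectorSetConfig.StorageKeyKind.LEGACY:
            return "extracted_vectors"  # single key, shared by the only vectorset
        elif kind == VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX:
            return f"{vectorset}/extracted_vectors"  # placeholder per-vectorset template
        else:
            raise ValueError(f"Unsupported storage key kind: {kind}")

This is also why delete_field and delete_vectors now iterate the KB's vectorsets: each vectorset may store its extracted vectors under a different key.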

nucliadb/ingest/orm/brain.py
CHANGED

@@ -100,6 +100,8 @@ class ResourceBrain:
         page_positions: Optional[FilePagePositions],
         extracted_text: Optional[ExtractedText],
         basic_user_field_metadata: Optional[UserFieldMetadata] = None,
+        *,
+        replace_field: bool = False,
     ):
         # To check for duplicate paragraphs
         unique_paragraphs: set[str] = set()
@@ -224,6 +226,11 @@ class ResourceBrain:
 
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
         for relations in metadata.metadata.relations:
             for relation in relations.relations:
                 self.brain.relations.append(relation)
@@ -239,9 +246,10 @@ class ResourceBrain:
         field_id: str,
         vo: utils_pb2.VectorObject,
         *,
-        vectorset:
+        vectorset: str,
         replace_field: bool = False,
-
+        # cut to specific dimension if specified
+        vector_dimension: Optional[int] = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
@@ -270,7 +278,7 @@ class ResourceBrain:
                     sentence_key,
                     vector,
                     vectorset=vectorset,
-
+                    vector_dimension=vector_dimension,
                 )
 
         _field_id = ids.FieldId(
@@ -296,13 +304,12 @@ class ResourceBrain:
                 sentence_key,
                 vector,
                 vectorset=vectorset,
-
+                vector_dimension=vector_dimension,
             )
 
         if replace_field:
             full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
-            self.brain.
-            self.brain.paragraphs_to_delete.append(full_field_id)
+            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
 
     def _apply_field_vector(
         self,
@@ -311,22 +318,15 @@ class ResourceBrain:
         sentence_key: ids.VectorId,
         vector: utils_pb2.Vector,
         *,
-        vectorset:
-
+        vectorset: str,
+        # cut vectors if a specific dimension is specified
+        vector_dimension: Optional[int] = None,
     ):
         paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
-
-            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
-        else:
-            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
+        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
 
         sentence_pb.ClearField("vector")  # clear first to prevent duplicates
-
-        # cut vectors if a specific dimension is specified
-        if matryoshka_vector_dimension is not None:
-            sentence_pb.vector.extend(vector.vector[:matryoshka_vector_dimension])
-        else:
-            sentence_pb.vector.extend(vector.vector)
+        sentence_pb.vector.extend(vector.vector[:vector_dimension])
 
         # we only care about start/stop position of the paragraph for a given sentence here
         # the key has the sentence position
@@ -490,22 +490,29 @@ class ResourceBrain:
     ):
         if metadata.mime_type != "":
             labels["mt"].add(metadata.mime_type)
+
+        base_classification_relation = Relation(
+            relation=Relation.ABOUT,
+            source=relation_node_document,
+            to=RelationNode(
+                ntype=RelationNode.NodeType.LABEL,
+            ),
+        )
         for classification in metadata.classifications:
             label = f"{classification.labelset}/{classification.label}"
             if label not in user_canceled_labels:
                 labels["l"].add(label)
-
-
-
-                )
-
-                    Relation(
-                        relation=Relation.ABOUT,
-                        source=relation_node_document,
-                        to=relation_node_label,
-                    )
-                )
+                relation = Relation()
+                relation.CopyFrom(base_classification_relation)
+                relation.to.value = label
+                self.brain.relations.append(relation)
 
         # Data Augmentation + Processor entities
+        base_entity_relation = Relation(
+            relation=Relation.ENTITY,
+            source=relation_node_document,
+            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+        )
         use_legacy_entities = True
         for data_augmentation_task_id, entities in metadata.entities.items():
             # If we recieved the entities from the processor here, we don't want to use the legacy entities
@@ -521,38 +528,30 @@ class ResourceBrain:
                 labels["e"].add(
                     f"{entity_label}/{entity_text}"
                 )  # Add data_augmentation_task_id as a prefix?
-
-
-
-
-                )
-                rel = Relation(
-                    relation=Relation.ENTITY,
-                    source=relation_node_document,
-                    to=relation_node_entity,
-                )
-                self.brain.relations.append(rel)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity_text
+                relation.to.subtype = entity_label
+                self.brain.relations.append(relation)
 
         # Legacy processor entities
         # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+        def _parse_entity(klass_entity: str) -> tuple[str, str]:
+            try:
+                klass, entity = klass_entity.split("/", 1)
+                return klass, entity
+            except ValueError:
+                raise AttributeError(f"Entity should be with type {klass_entity}")
+
         if use_legacy_entities:
-            for klass_entity
+            for klass_entity in metadata.positions.keys():
                 labels["e"].add(klass_entity)
-
-
-
-
-
-
-                relation_node_entity = RelationNode(
-                    value=entity, ntype=RelationNode.NodeType.ENTITY, subtype=klass
-                )
-                rel = Relation(
-                    relation=Relation.ENTITY,
-                    source=relation_node_document,
-                    to=relation_node_entity,
-                )
-                self.brain.relations.append(rel)
+                klass, entity = _parse_entity(klass_entity)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity
+                relation.to.subtype = klass
+                self.brain.relations.append(relation)
 
     def apply_field_labels(
         self,