nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/ingest/processing.py
CHANGED
@@ -261,6 +261,7 @@ class ProcessingEngine:
             "content_type": file.file.content_type,
             "password": file.password,
             "language": file.language,
+            "extract_strategy": file.extract_strategy,
         }
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

@@ -278,6 +279,8 @@ class ProcessingEngine:
         headers["X-LANGUAGE"] = file.language
         headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()  # type: ignore
         headers["X-MD5"] = file.file.md5
+        if file.extract_strategy is not None:
+            headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
         headers["CONTENT_TYPE"] = file.file.content_type
         headers["CONTENT-LENGTH"] = str(len(file.file.payload))  # type: ignore
         headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
@@ -317,6 +320,7 @@ class ProcessingEngine:
             "content_type": file_field.file.content_type,
             "language": file_field.language,
             "password": file_field.password,
+            "extract_strategy": file_field.extract_strategy,
         }
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

@@ -341,6 +345,8 @@ class ProcessingEngine:
         headers["CONTENT-TYPE"] = file.file.content_type
         if file.file.size:
             headers["CONTENT-LENGTH"] = str(file.file.size)
+        if file.extract_strategy != "":
+            headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
         headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"

         iterator = storage.downloadbytescf_iterator(file.file)
@@ -406,13 +412,13 @@ class ProcessingEngine:
             # Upload the payload
             item.partition = partition
             resp = await self.session.post(
-                url=self.nuclia_internal_push, data=item.
+                url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
             )
         else:
             headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
             # Upload the payload
             resp = await self.session.post(
-                url=self.nuclia_external_push_v2, data=item.
+                url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
             )
             if resp.status == 200:
                 data = await resp.json()
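The hunks above thread the new `extract_strategy` field through both the signed payload and the headers sent to the processing backend, with slightly different guards at each call site. A minimal sketch of that guard logic, assuming `extract_strategy` is an optional string (the names below are illustrative, not part of the package):

from typing import Optional


def processing_headers(extract_strategy: Optional[str]) -> dict[str, str]:
    headers: dict[str, str] = {}
    # The diff uses `is not None` for optional model fields and `!= ""` for
    # proto-backed string fields that default to an empty string; a truthiness
    # check covers both cases in this sketch.
    if extract_strategy:
        headers["X-EXTRACT-STRATEGY"] = extract_strategy
    return headers


assert processing_headers(None) == {}
assert processing_headers("my-strategy") == {"X-EXTRACT-STRATEGY": "my-strategy"}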
nucliadb/ingest/serialize.py
CHANGED
@@ -18,18 +18,20 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

-from typing import Optional
+from typing import Optional, Union

 import nucliadb_models as models
+from nucliadb.common import datamanagers
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.common.maindb.utils import get_driver
+from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.orm.resource import Resource as ORMResource
-from nucliadb_models.common import
+from nucliadb_models.common import FieldTypeName
 from nucliadb_models.resource import (
     ConversationFieldData,
     ConversationFieldExtractedData,
@@ -49,7 +51,9 @@ from nucliadb_models.resource import (
 )
 from nucliadb_models.search import ResourceProperties
 from nucliadb_models.security import ResourceSecurity
-from
+from nucliadb_protos.writer_pb2 import FieldStatus
+from nucliadb_utils import const
+from nucliadb_utils.utilities import get_storage, has_feature


 async def set_resource_field_extracted_data(
@@ -64,7 +68,7 @@ async def set_resource_field_extracted_data(
     if ExtractedDataTypeName.TEXT in wanted_extracted_data:
         data_et = await field.get_extracted_text()
         if data_et is not None:
-            field_data.text =
+            field_data.text = from_proto.extracted_text(data_et)

     metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
     shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
@@ -72,24 +76,35 @@ async def set_resource_field_extracted_data(
         data_fcm = await field.get_field_metadata()

         if data_fcm is not None:
-            field_data.metadata =
+            field_data.metadata = from_proto.field_computed_metadata(
                 data_fcm, shortened=shortened_metadata_wanted and not metadata_wanted
             )

     if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
         data_lcm = await field.get_large_field_metadata()
         if data_lcm is not None:
-            field_data.large_metadata =
+            field_data.large_metadata = from_proto.large_computed_metadata(data_lcm)

     if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
-
+        # XXX: our extracted API is not vectorset-compatible, so we'll get the
+        # first vectorset and return the values. Ideally, we should provide a
+        # way to select a vectorset
+        vectorset_id = None
+        async with datamanagers.with_ro_transaction() as txn:
+            async for vectorset_id, vs in datamanagers.vectorsets.iter(
+                txn=txn,
+                kbid=field.resource.kb.kbid,
+            ):
+                break
+        assert vectorset_id is not None, "All KBs must have at least a vectorset"
+        data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
         if data_vec is not None:
-            field_data.vectors =
+            field_data.vectors = from_proto.vector_object(data_vec)

     if ExtractedDataTypeName.QA in wanted_extracted_data:
         qa = await field.get_question_answers()
         if qa is not None:
-            field_data.question_answers =
+            field_data.question_answers = from_proto.field_question_answers(qa)

     if (
         isinstance(field, File)
@@ -98,7 +113,7 @@ async def set_resource_field_extracted_data(
     ):
         data_fed = await field.get_file_extracted_data()
         if data_fed is not None:
-            field_data.file =
+            field_data.file = from_proto.file_extracted_data(data_fed)

     if (
         isinstance(field, Link)
@@ -107,7 +122,7 @@ async def set_resource_field_extracted_data(
     ):
         data_led = await field.get_link_extracted_data()
         if data_led is not None:
-            field_data.link =
+            field_data.link = from_proto.link_extracted_data(data_led)


 async def serialize(
@@ -133,6 +148,40 @@ async def serialize(
     )


+async def serialize_field_errors(
+    field: Field,
+    serialized: Union[
+        TextFieldData, FileFieldData, LinkFieldData, ConversationFieldData, GenericFieldData
+    ],
+):
+    if has_feature(const.Features.FIELD_STATUS):
+        status = await field.get_status()
+        if status is None:
+            status = FieldStatus()
+        serialized.status = status.Status.Name(status.status)
+        if status.errors:
+            serialized.errors = []
+            for error in status.errors:
+                serialized.errors.append(
+                    Error(
+                        body=error.source_error.error,
+                        code=error.source_error.code,
+                        code_str=error.source_error.ErrorCode.Name(error.source_error.code),
+                        created=error.created.ToDatetime(),
+                    )
+                )
+            serialized.error = serialized.errors[-1]
+    else:
+        field_error = await field.get_error()
+        if field_error is not None:
+            serialized.error = Error(
+                body=field_error.error,
+                code=field_error.code,
+                code_str=field_error.ErrorCode.Name(field_error.code),
+                created=None,
+            )
+
+
 async def managed_serialize(
     txn: Transaction,
     kbid: str,
@@ -174,14 +223,12 @@ async def managed_serialize(
         else None
     )

-    resource.metadata =
-    resource.usermetadata =
+    resource.metadata = from_proto.metadata(orm_resource.basic.metadata)
+    resource.usermetadata = from_proto.user_metadata(orm_resource.basic.usermetadata)
     resource.fieldmetadata = [
-
+        from_proto.user_field_metadata(fm) for fm in orm_resource.basic.fieldmetadata
     ]
-    resource.computedmetadata =
-        orm_resource.basic.computedmetadata
-    )
+    resource.computedmetadata = from_proto.computed_metadata(orm_resource.basic.computedmetadata)

     resource.last_seqid = orm_resource.basic.last_seqid

@@ -195,18 +242,18 @@ async def managed_serialize(
         await orm_resource.get_relations()
         if orm_resource.relations is not None:
             resource.relations = [
-
+                from_proto.relation(relation) for relation in orm_resource.relations.relations
             ]

     if ResourceProperties.ORIGIN in show:
         await orm_resource.get_origin()
         if orm_resource.origin is not None:
-            resource.origin =
+            resource.origin = from_proto.origin(orm_resource.origin)

     if ResourceProperties.EXTRA in show:
         await orm_resource.get_extra()
         if orm_resource.extra is not None:
-            resource.extra =
+            resource.extra = from_proto.extra(orm_resource.extra)

     include_errors = ResourceProperties.ERRORS in show

@@ -221,7 +268,7 @@ async def managed_serialize(
         await orm_resource.get_fields()
         resource.data = ResourceData()
         for (field_type, _), field in orm_resource.fields.items():
-            field_type_name =
+            field_type_name = from_proto.field_type_name(field_type)
             if field_type_name not in field_type_filter:
                 continue

@@ -236,14 +283,10 @@ async def managed_serialize(
                 if field.id not in resource.data.texts:
                     resource.data.texts[field.id] = TextFieldData()
                 if include_value:
-                    serialized_value = (
-                        models.FieldText.from_message(value) if value is not None else None
-                    )
+                    serialized_value = from_proto.field_text(value) if value is not None else None
                     resource.data.texts[field.id].value = serialized_value
                 if include_errors:
-
-                    if error is not None:
-                        resource.data.texts[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.texts[field.id])
                 if include_extracted_data:
                     resource.data.texts[field.id].extracted = TextFieldExtractedData()
                     await set_resource_field_extracted_data(
@@ -259,14 +302,12 @@ async def managed_serialize(
                     resource.data.files[field.id] = FileFieldData()
                 if include_value:
                     if value is not None:
-                        resource.data.files[field.id].value =
+                        resource.data.files[field.id].value = from_proto.field_file(value)
                     else:
                         resource.data.files[field.id].value = None

                 if include_errors:
-
-                    if error is not None:
-                        resource.data.files[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.files[field.id])

                 if include_extracted_data:
                     resource.data.files[field.id].extracted = FileFieldExtractedData()
@@ -282,12 +323,10 @@ async def managed_serialize(
                 if field.id not in resource.data.links:
                     resource.data.links[field.id] = LinkFieldData()
                 if include_value and value is not None:
-                    resource.data.links[field.id].value =
+                    resource.data.links[field.id].value = from_proto.field_link(value)

                 if include_errors:
-
-                    if error is not None:
-                        resource.data.links[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.links[field.id])

                 if include_extracted_data:
                     resource.data.links[field.id].extracted = LinkFieldExtractedData()
@@ -303,16 +342,10 @@ async def managed_serialize(
                 if field.id not in resource.data.conversations:
                     resource.data.conversations[field.id] = ConversationFieldData()
                 if include_errors:
-
-                    if error is not None:
-                        resource.data.conversations[field.id].error = Error(
-                            body=error.error, code=error.code
-                        )
+                    await serialize_field_errors(field, resource.data.conversations[field.id])
                 if include_value and isinstance(field, Conversation):
                     value = await field.get_metadata()
-                    resource.data.conversations[field.id].value =
-                        value
-                    )
+                    resource.data.conversations[field.id].value = from_proto.field_conversation(value)
                 if include_extracted_data:
                     resource.data.conversations[field.id].extracted = ConversationFieldExtractedData()
                     await set_resource_field_extracted_data(
@@ -329,9 +362,7 @@ async def managed_serialize(
                 if include_value:
                     resource.data.generics[field.id].value = value
                 if include_errors:
-
-                    if error is not None:
-                        resource.data.generics[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.generics[field.id])
                 if include_extracted_data:
                     resource.data.generics[field.id].extracted = TextFieldExtractedData(
                         text=models.ExtractedText(text=resource.data.generics[field.id].value)
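The per-field error blocks in managed_serialize are now routed through the single serialize_field_errors helper, which switches between the new FieldStatus-based error list (behind the FIELD_STATUS feature flag) and the legacy single-error path. A simplified, self-contained sketch of that selection logic, using dataclasses in place of the real protobuf and Pydantic models (all names here are stand-ins):

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Error:
    body: str
    code: int


@dataclass
class SerializedField:
    status: Optional[str] = None
    error: Optional[Error] = None
    errors: list[Error] = field(default_factory=list)


def serialize_field_errors_sketch(
    serialized: SerializedField,
    status_errors: list[Error],
    legacy_error: Optional[Error],
    field_status_enabled: bool,
) -> None:
    if field_status_enabled:
        # New path: expose the whole error history; `error` keeps pointing at
        # the most recent one for backwards compatibility.
        serialized.status = "ERROR" if status_errors else "PROCESSED"
        if status_errors:
            serialized.errors = list(status_errors)
            serialized.error = serialized.errors[-1]
    elif legacy_error is not None:
        # Legacy path: only the single stored error, if any.
        serialized.error = legacy_error


out = SerializedField()
serialize_field_errors_sketch(out, [Error("boom", 1)], None, field_status_enabled=True)
assert out.status == "ERROR" and out.error == out.errors[-1]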
nucliadb/ingest/service/writer.py
CHANGED
@@ -31,12 +31,12 @@ from nucliadb.common.maindb.utils import setup_driver
 from nucliadb.ingest import SERVICE_NAME, logger
 from nucliadb.ingest.orm.broker_message import generate_broker_message
 from nucliadb.ingest.orm.entities import EntitiesManager
-from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
+from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
 from nucliadb.ingest.orm.processor import Processor, sequence_manager
 from nucliadb.ingest.orm.resource import Resource as ResourceORM
 from nucliadb.ingest.settings import settings
-from nucliadb_protos import
+from nucliadb_protos import writer_pb2, writer_pb2_grpc
 from nucliadb_protos.knowledgebox_pb2 import (
     DeleteKnowledgeBoxResponse,
     KnowledgeBoxID,
@@ -44,13 +44,10 @@ from nucliadb_protos.knowledgebox_pb2 import (
     KnowledgeBoxUpdate,
     SemanticModelMetadata,
     UpdateKnowledgeBoxResponse,
-    VectorSetConfig,
 )
 from nucliadb_protos.writer_pb2 import (
     BrokerMessage,
     DelEntitiesRequest,
-    DelVectorSetRequest,
-    DelVectorSetResponse,
     GetEntitiesGroupRequest,
     GetEntitiesGroupResponse,
     GetEntitiesRequest,
@@ -63,8 +60,6 @@ from nucliadb_protos.writer_pb2 import (
     ListMembersResponse,
     NewEntitiesGroupRequest,
     NewEntitiesGroupResponse,
-    NewVectorSetRequest,
-    NewVectorSetResponse,
     OpStatusWriter,
     SetEntitiesRequest,
     UpdateEntitiesGroupRequest,
@@ -472,52 +467,3 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
             errors.capture_exception(e)
             logger.error("Error in ingest gRPC servicer", exc_info=True)
             raise
-
-    async def NewVectorSet(  # type: ignore
-        self, request: NewVectorSetRequest, context=None
-    ) -> NewVectorSetResponse:
-        config = VectorSetConfig(
-            vectorset_id=request.vectorset_id,
-            vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
-                similarity=request.similarity,
-                normalize_vectors=request.normalize_vectors,
-                vector_type=request.vector_type,
-                vector_dimension=request.vector_dimension,
-            ),
-            matryoshka_dimensions=request.matryoshka_dimensions,
-        )
-        response = NewVectorSetResponse()
-        try:
-            async with self.driver.transaction() as txn:
-                kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
-                await kbobj.create_vectorset(config)
-                await txn.commit()
-        except VectorSetConflict as exc:
-            response.status = NewVectorSetResponse.Status.ERROR
-            response.details = str(exc)
-        except Exception as exc:
-            errors.capture_exception(exc)
-            logger.error("Error in ingest gRPC while creating a vectorset", exc_info=True)
-            response.status = NewVectorSetResponse.Status.ERROR
-            response.details = str(exc)
-        else:
-            response.status = NewVectorSetResponse.Status.OK
-        return response
-
-    async def DelVectorSet(  # type: ignore
-        self, request: DelVectorSetRequest, context=None
-    ) -> DelVectorSetResponse:
-        response = DelVectorSetResponse()
-        try:
-            async with self.driver.transaction() as txn:
-                kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
-                await kbobj.delete_vectorset(request.vectorset_id)
-                await txn.commit()
-        except Exception as exc:
-            errors.capture_exception(exc)
-            logger.error("Error in ingest gRPC while deleting a vectorset", exc_info=True)
-            response.status = DelVectorSetResponse.Status.ERROR
-            response.details = str(exc)
-        else:
-            response.status = DelVectorSetResponse.Status.OK
-        return response
nucliadb/ingest/settings.py
CHANGED
@@ -77,7 +77,7 @@ class Settings(DriverSettings):
     total_replicas: int = 1  # number of ingest processor replicas in the cluster
     nuclia_partitions: int = 50

-    max_receive_message_length: int =
+    max_receive_message_length: int = 500  # In MB

     # Search query timeouts
     relation_search_timeout: float = 10.0
@@ -85,8 +85,5 @@ class Settings(DriverSettings):

     max_concurrent_ingest_processing: int = 5

-    # Ingest processor settings
-    ingest_delete_resource_storage_max_parallel: int = 20
-

 settings = Settings()
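The new comment documents that max_receive_message_length is expressed in megabytes, while gRPC channel and server options take a byte count, so the value needs converting wherever it is applied. A hedged sketch of that conversion (the grpc.max_receive_message_length option name is a standard gRPC option, not something introduced by this diff):

MB = 1024 * 1024
max_receive_message_length = 500  # setting value, interpreted as MB

# Options as they might be passed to a gRPC server or channel constructor.
grpc_options = [
    ("grpc.max_receive_message_length", max_receive_message_length * MB),
    ("grpc.max_send_message_length", max_receive_message_length * MB),
]
assert grpc_options[0][1] == 524_288_000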
nucliadb/learning_proxy.py
CHANGED
@@ -155,10 +155,9 @@ class LearningConfiguration(BaseModel):


 class ProxiedLearningConfigError(Exception):
-    def __init__(self, status_code: int, content:
+    def __init__(self, status_code: int, content: Union[str, dict[str, Any]]):
         self.status_code = status_code
         self.content = content
-        self.content_type = content_type


 def raise_for_status(response: httpx.Response) -> None:
@@ -166,10 +165,13 @@ def raise_for_status(response: httpx.Response) -> None:
         response.raise_for_status()
     except httpx.HTTPStatusError as err:
         content_type = err.response.headers.get("Content-Type", "application/json")
+        if content_type == "application/json":
+            content = err.response.json()
+        else:
+            content = err.response.text
         raise ProxiedLearningConfigError(
             status_code=err.response.status_code,
-            content=
-            content_type=content_type,
+            content=content,
         )

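ProxiedLearningConfigError no longer carries a separate content_type; instead raise_for_status decodes the proxied error body up front, so content is a dict for JSON responses and a plain string otherwise. A small demonstration of that decoding against a synthetic httpx response (the URL and payload are made up for the example):

import httpx

request = httpx.Request("GET", "http://learning/config")
response = httpx.Response(422, json={"detail": "invalid model"}, request=request)

content_type = response.headers.get("Content-Type", "application/json")
# JSON bodies surface as dicts, anything else as raw text.
content = response.json() if content_type == "application/json" else response.text
assert isinstance(content, dict) and content["detail"] == "invalid model"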
nucliadb/purge/__init__.py
CHANGED
@@ -25,15 +25,19 @@ from nucliadb.common.cluster.exceptions import NodeError, ShardNotFound
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.maindb.utils import setup_driver, teardown_driver
+from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
 from nucliadb.ingest import SERVICE_NAME, logger
+from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.orm.knowledgebox import (
     KB_TO_DELETE,
     KB_TO_DELETE_BASE,
     KB_TO_DELETE_STORAGE_BASE,
     KB_VECTORSET_TO_DELETE,
     KB_VECTORSET_TO_DELETE_BASE,
+    RESOURCE_TO_DELETE_STORAGE_BASE,
     KnowledgeBox,
 )
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig, VectorSetPurge
 from nucliadb_telemetry import errors
 from nucliadb_telemetry.logs import setup_logging
 from nucliadb_utils.storages.storage import Storage
@@ -131,6 +135,67 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
     logger.info("FINISH PURGING KB STORAGE")


+async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
+    """
+    Remove from storage all resources marked as deleted.
+
+    Returns the number of resources purged.
+    """
+    logger.info("Starting purge of deleted resource storage")
+    to_purge = await _count_resources_storage_to_purge(driver)
+    logger.info(f"Found {to_purge} resources to purge")
+    while True:
+        try:
+            purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
+            if not purged:
+                logger.info("No more resources to purge found")
+                return
+            logger.info(f"Purged {purged} resources")
+
+        except asyncio.CancelledError:
+            logger.info("Purge of deleted resource storage was cancelled")
+            return
+
+
+async def _count_resources_storage_to_purge(driver: Driver) -> int:
+    """
+    Count the number of resources marked as deleted in storage.
+    """
+    async with driver.transaction(read_only=True) as txn:
+        return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
+
+
+async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
+    """
+    Remove from storage a batch of resources marked as deleted. Returns the
+    number of resources purged.
+    """
+    # Get the keys of the resources to delete in batches of 100
+    to_delete_batch = []
+    async with driver.transaction(read_only=True) as txn:
+        async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
+            to_delete_batch.append(key)
+
+    if not to_delete_batch:
+        return 0
+
+    # Delete the resources blobs from storage
+    logger.info(f"Purging {len(to_delete_batch)} deleted resources")
+    tasks = []
+    for key in to_delete_batch:
+        kbid, resource_id = key.split("/")[-2:]
+        tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
+    await asyncio.gather(*tasks)
+
+    # Delete the schedule-to-delete keys
+    async with driver.transaction() as txn:
+        for key in to_delete_batch:
+            await txn.delete(key)
+        await txn.commit()
+
+    return len(to_delete_batch)
+
+
 async def purge_kb_vectorsets(driver: Driver, storage: Storage):
     """Vectors for a vectorset are stored in a key inside each resource. Iterate
     through all resources of the KB and remove any storage object containing
@@ -139,8 +204,8 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
     """
     logger.info("START PURGING KB VECTORSETS")

-
-
+    vectorsets_to_delete = [key async for key in _iter_keys(driver, KB_VECTORSET_TO_DELETE_BASE)]
+    for key in vectorsets_to_delete:
         logger.info(f"Purging vectorsets {key}")
         try:
             _base, kbid, vectorset = key.lstrip("/").split("/")
@@ -149,13 +214,38 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             continue

         try:
+            async with driver.transaction(read_only=True) as txn:
+                value = await txn.get(key)
+                assert value is not None, "Key must exist or we wouldn't had fetch it iterating keys"
+                purge_payload = VectorSetPurge()
+                purge_payload.ParseFromString(value)
+
+            fields: list[Field] = []
             async with driver.transaction(read_only=True) as txn:
                 kb = KnowledgeBox(txn, storage, kbid)
                 async for resource in kb.iterate_resources():
-                    fields
+                    fields.extend((await resource.get_fields(force=True)).values())
+
             # we don't need the maindb transaction anymore to remove vectors from storage
-            for field in fields
-
+            for field in fields:
+                if purge_payload.storage_key_kind == VectorSetConfig.StorageKeyKind.UNSET:
+                    # Bw/c for purge before adding purge payload. We assume
+                    # there's only 2 kinds of KBs: with one or with more than
+                    # one vectorset. KBs with one vectorset are not allowed to
+                    # delete their vectorset, so we wouldn't be here. It has to
+                    # be a KB with multiple, so the storage key kind has to be
+                    # this:
+                    await field.delete_vectors(
+                        vectorset, VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX
+                    )
+                else:
+                    await field.delete_vectors(vectorset, purge_payload.storage_key_kind)
+
+            # Finally, delete the key
+            async with driver.transaction() as txn:
+                await txn.delete(key)
+                await txn.commit()
+
         except Exception as exc:
             errors.capture_exception(exc)
             logger.error(
@@ -165,13 +255,6 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             )
             continue

-        purged.append(key)
-
-    async with driver.transaction() as txn:
-        for key in purged:
-            await txn.delete(key)
-        await txn.commit()
-
     logger.info("FINISH PURGING KB VECTORSETS")


@@ -180,22 +263,29 @@ async def main():
     This script will purge all knowledge boxes marked to be deleted in maindb.
     """
     await setup_cluster()
+    await start_nidx_utility()
     driver = await setup_driver()
     storage = await get_storage(
         gcs_scopes=["https://www.googleapis.com/auth/devstorage.full_control"],
         service_name=SERVICE_NAME,
     )
     try:
+        purge_resources_storage_task = asyncio.create_task(
+            purge_deleted_resource_storage(driver, storage)
+        )
         await purge_kb(driver)
         await purge_kb_storage(driver, storage)
         await purge_kb_vectorsets(driver, storage)
+        await purge_resources_storage_task
     except Exception as ex:  # pragma: no cover
         logger.exception("Unhandled exception on purge command")
         errors.capture_exception(ex)
     finally:
         try:
+            purge_resources_storage_task.cancel()
             await storage.finalize()
             await teardown_driver()
+            await stop_nidx_utility()
             await teardown_cluster()
         except Exception:  # pragma: no cover
             logger.exception("Error tearing down utilities on purge command")
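The new purge path counts the schedule-to-delete keys, then loops over batches of 100: blobs are deleted from storage concurrently, and only afterwards are the corresponding maindb keys dropped. A self-contained sketch of that control flow with in-memory stand-ins for the driver and storage (all names below are illustrative, not the package's API):

import asyncio

pending_keys = [f"/todelete/kb{i}/res{i}" for i in range(250)]
deleted_blobs: list[tuple[str, str]] = []


async def delete_resource(kbid: str, rid: str) -> None:
    # Stand-in for storage.delete_resource(kbid, rid).
    deleted_blobs.append((kbid, rid))


async def purge_batch(batch_size: int = 100) -> int:
    batch = pending_keys[:batch_size]
    if not batch:
        return 0
    # Delete blobs concurrently, then drop the schedule-to-delete keys.
    await asyncio.gather(*(delete_resource(*key.split("/")[-2:]) for key in batch))
    del pending_keys[: len(batch)]
    return len(batch)


async def purge_all() -> None:
    while await purge_batch():
        pass


asyncio.run(purge_all())
assert not pending_keys and len(deleted_blobs) == 250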
nucliadb/purge/orphan_shards.py
CHANGED
@@ -33,6 +33,7 @@ from nucliadb.common.cluster.manager import KBShardManager
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.maindb.utils import setup_driver, teardown_driver
+from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
 from nucliadb.ingest import logger
 from nucliadb_telemetry import errors
 from nucliadb_telemetry.logs import setup_logging
@@ -135,10 +136,9 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardLocation]:
             continue
         else:
             for shard_object_pb in kb_shards:
-
-
-
-                stored_shards[shard_replica_id] = ShardLocation(kbid=kbid, node_id=node_id)
+                stored_shards[shard_object_pb.nidx_shard_id] = ShardLocation(
+                    kbid=kbid, node_id="nidx"
+                )
     return stored_shards


@@ -241,6 +241,7 @@ async def main():
     """
     args = parse_arguments()

+    await start_nidx_utility()
     await setup_cluster()
     driver = await setup_driver()

@@ -253,6 +254,7 @@ async def main():
     finally:
         await teardown_driver()
         await teardown_cluster()
+        await stop_nidx_utility()


 def run() -> int:  # pragma: no cover
|