nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/broker_message.py
CHANGED

@@ -20,11 +20,13 @@
 
 from typing import cast
 
+from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.orm.resource import Resource
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
     ExtractedTextWrapper,
     ExtractedVectorsWrapper,
@@ -90,7 +92,12 @@ class _BrokerMessageBuilder:
         self.bm.link_extracted_data.append(link_extracted_data)
 
         # Field vectors
-        await self.generate_field_vectors(type_id, field_id, field)
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(
+            resource.txn, kbid=resource.kb.kbid
+        ):
+            await self.generate_field_vectors(
+                type_id, field_id, field, vectorset_id, vs.storage_key_kind
+            )
 
         # Large metadata
         await self.generate_field_large_computed_metadata(type_id, field_id, field)
@@ -155,13 +162,16 @@ class _BrokerMessageBuilder:
         type_id: FieldType.ValueType,
         field_id: str,
         field: Field,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
     ):
-        vo = await field.get_vectors()
+        vo = await field.get_vectors(vectorset, storage_key_kind)
         if vo is None:
             return
         evw = ExtractedVectorsWrapper()
         evw.field.field = field_id
         evw.field.field_type = type_id
+        evw.vectorset_id = vectorset
         evw.vectors.CopyFrom(vo)
         self.bm.field_vectors.append(evw)
 
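The three hunks above move extracted-vector generation from "once per field" to "once per field and vectorset". A minimal sketch of the resulting flow, assembled from the hunks (the resource, field and broker message objects are assumed to be in scope, as they are inside _BrokerMessageBuilder):

from nucliadb.common import datamanagers
from nucliadb_protos.resources_pb2 import ExtractedVectorsWrapper

async def export_field_vectors(resource, type_id, field_id, field, bm):
    # One ExtractedVectorsWrapper per vectorset; each vectorset records which
    # storage key layout its vectors use (legacy flat keys vs. prefixed keys).
    async for vectorset_id, vs in datamanagers.vectorsets.iter(
        resource.txn, kbid=resource.kb.kbid
    ):
        vo = await field.get_vectors(vectorset_id, vs.storage_key_kind)
        if vo is None:
            continue
        evw = ExtractedVectorsWrapper()
        evw.field.field = field_id
        evw.field.field_type = type_id
        evw.vectorset_id = vectorset_id  # wrappers are now tagged with their vectorset
        evw.vectors.CopyFrom(vo)
        bm.field_vectors.append(evw)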
nucliadb/ingest/orm/entities.py
CHANGED
@@ -26,7 +26,6 @@ from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.exceptions import (
     AlreadyExists,
     EntitiesGroupNotFound,
-    NodeError,
 )
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.datamanagers.entities import (
@@ -37,6 +36,7 @@ from nucliadb.common.datamanagers.entities import (
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.settings import settings
+from nucliadb.search.search.shards import query_shard
 from nucliadb_protos.knowledgebox_pb2 import (
     DeletedEntitiesGroups,
     EntitiesGroup,
@@ -53,9 +53,6 @@ from nucliadb_protos.nodereader_pb2 import (
 )
 from nucliadb_protos.utils_pb2 import RelationNode
 from nucliadb_protos.writer_pb2 import GetEntitiesResponse
-from nucliadb_telemetry import errors
-from nucliadb_utils import const
-from nucliadb_utils.utilities import has_feature
 
 from .exceptions import EntityManagementException
 
@@ -218,20 +215,15 @@ class EntitiesManager:
                     ],
                 ),
             )
-            response = await node
+            response = await query_shard(node, shard_id, request)
             return response.relation
 
         results = await shard_manager.apply_for_all_shards(
             self.kbid,
             do_entities_search,
             settings.relation_search_timeout,
-            use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": self.kbid}),
             use_read_replica_nodes=self.use_read_replica_nodes,
         )
-        for result in results:
-            if isinstance(result, Exception):
-                errors.capture_exception(result)
-                raise NodeError("Error while querying relation index")
 
         entities = {}
         for result in results:
@@ -307,6 +299,7 @@ class EntitiesManager:
         shard_manager = get_shard_manager()
 
         async def query_indexed_entities_group_names(node: AbstractIndexNode, shard_id: str) -> set[str]:
+            """Search all relation types"""
             request = SearchRequest(
                 shard=shard_id,
                 result_per_page=0,
@@ -315,25 +308,21 @@ class EntitiesManager:
                 paragraph=False,
                 faceted=Faceted(labels=["/e"]),
             )
-            response: SearchResponse = await node
+            response: SearchResponse = await query_shard(node, shard_id, request)
             try:
                 facetresults = response.document.facets["/e"].facetresults
-                return {facet.tag.split("/")[-1] for facet in facetresults}
             except KeyError:
                 # No entities found
                 return set()
+            else:
+                return {facet.tag.split("/")[-1] for facet in facetresults}
 
         results = await shard_manager.apply_for_all_shards(
             self.kbid,
             query_indexed_entities_group_names,
             settings.relation_types_timeout,
-            use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": self.kbid}),
             use_read_replica_nodes=self.use_read_replica_nodes,
         )
-        for result in results:
-            if isinstance(result, Exception):
-                errors.capture_exception(result)
-                raise NodeError("Error while looking for relations types")
 
         if not results:
             return set()
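Both closures in this file now route their RPC through the query_shard helper (added in nucliadb/search/search/shards.py, +19 -0 in the file list) instead of awaiting the node inline, and the per-result exception loops are gone. A sketch of the new call pattern, assuming only what the hunks show about query_shard(node, shard_id, request):

from nucliadb.search.search.shards import query_shard
from nucliadb_protos.nodereader_pb2 import SearchRequest

async def do_entities_search(node, shard_id: str):
    # The real closure also fills in the relation subgraph entry points
    # shown in the hunk above.
    request = SearchRequest(shard=shard_id)
    response = await query_shard(node, shard_id, request)
    return response.relation

With the NodeError loops removed, per-shard failures are presumably surfaced by apply_for_all_shards itself rather than re-checked by each caller.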
nucliadb/ingest/orm/knowledgebox.py
CHANGED

@@ -27,7 +27,6 @@ from grpc.aio import AioRpcError
 
 from nucliadb.common import datamanagers
 from nucliadb.common.cluster.exceptions import ShardNotFound
-from nucliadb.common.cluster.manager import get_index_node
 from nucliadb.common.cluster.utils import get_shard_manager
 
 # XXX: this keys shouldn't be exposed outside datamanagers
@@ -49,7 +48,6 @@ from nucliadb.ingest.orm.exceptions import (
 from nucliadb.ingest.orm.metrics import processor_observer
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
-from nucliadb.ingest.settings import settings
 from nucliadb.migrator.utils import get_latest_version
 from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
 from nucliadb_protos.knowledgebox_pb2 import (
@@ -58,8 +56,10 @@ from nucliadb_protos.knowledgebox_pb2 import (
     KnowledgeBoxConfig,
     SemanticModelMetadata,
     StoredExternalIndexProviderMetadata,
+    VectorSetPurge,
 )
 from nucliadb_protos.resources_pb2 import Basic
+from nucliadb_utils.settings import is_onprem_nucliadb
 from nucliadb_utils.storages.storage import Storage
 from nucliadb_utils.utilities import (
     get_audit,
@@ -74,6 +74,9 @@ KB_KEYS = "/kbs/{kbid}/"
 KB_TO_DELETE_BASE = "/kbtodelete/"
 KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
 
+RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
+RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
+
 KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
 KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
 
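The two new maindb constants mirror the existing KB-level purge keys at resource granularity. For illustration, with hypothetical kbid/uuid values the formatted key looks like this:

RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"

key = RESOURCE_TO_DELETE_STORAGE.format(kbid="my-kb", uuid="0a1b2c3d")
assert key == "/resourcestoragetodelete/my-kb/0a1b2c3d"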
@@ -100,9 +103,9 @@ class KnowledgeBox:
         *,
         kbid: str,
         slug: str,
+        semantic_models: dict[str, SemanticModelMetadata],
         title: str = "",
         description: str = "",
-        semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
         external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
         hidden_resources_enabled: bool = False,
         hidden_resources_hide_on_creation: bool = False,
@@ -117,7 +120,7 @@ class KnowledgeBox:
             raise KnowledgeBoxCreationError(
                 "Cannot hide new resources if the hidden resources feature is disabled"
             )
-        if
+        if len(semantic_models) == 0:
             raise KnowledgeBoxCreationError("KB must define at least one semantic model")
 
         rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
@@ -143,6 +146,7 @@ class KnowledgeBox:
         kb_shards.actual = -1
 
         vs_external_indexes = []
+
         for vectorset_id, semantic_model in semantic_models.items():  # type: ignore
             # if this KB uses a matryoshka model, we can choose a different
             # dimension
@@ -169,6 +173,7 @@ class KnowledgeBox:
                     vector_dimension=dimension,
                 ),
                 matryoshka_dimensions=semantic_model.matryoshka_dimensions,
+                storage_key_kind=knowledgebox_pb2.VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX,
             )
             await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset_config)
 
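Every vectorset created with a new KB is now pinned to the VECTORSET_PREFIX storage key kind. A sketch of the stored config, using only fields visible in this diff (vectorset_id appears in the add_vectorset hunk further down; the remaining VectorSetConfig fields are elided):

from nucliadb_protos import knowledgebox_pb2

config = knowledgebox_pb2.VectorSetConfig(
    vectorset_id="my-semantic-model",  # hypothetical id
    storage_key_kind=knowledgebox_pb2.VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX,
)
# The enum only exists for backwards compatibility with KBs created before
# vectorsets, which keep their legacy (non-prefixed) storage keys.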
@@ -339,6 +344,8 @@ class KnowledgeBox:
         if exists is False:
             logger.error(f"{kbid} KB does not exists on Storage")
 
+        nidx_api = get_nidx_api_client()
+
         async with driver.transaction() as txn:
             storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
             await txn.set(storage_to_delete, b"")
@@ -351,25 +358,17 @@ class KnowledgeBox:
                 logger.warning(f"Shards not found for KB while purging it", extra={"kbid": kbid})
             else:
                 for shard in shards_obj.shards:
-
-                    for replica in shard.replicas:
-                        node = get_index_node(replica.node)
-                        if node is None:
-                            logger.error(
-                                f"No node {replica.node} found, let's continue. Some shards may stay orphaned",
-                                extra={"kbid": kbid},
-                            )
-                            continue
+                    if shard.nidx_shard_id:
                         try:
-                            await
+                            await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
                             logger.debug(
-                                f"Succeded deleting shard
-                                extra={"kbid": kbid, "
+                                f"Succeded deleting shard",
+                                extra={"kbid": kbid, "shard_id": shard.nidx_shard_id},
                             )
                         except AioRpcError as exc:
                             if exc.code() == StatusCode.NOT_FOUND:
                                 continue
-                            raise ShardNotFound(f"{exc.details()} @ {
+                            raise ShardNotFound(f"{exc.details()} @ shard {shard.nidx_shard_id}")
 
         await txn.commit()
         await cls.delete_all_kb_keys(driver, kbid)
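Shard purge no longer resolves replicas to index nodes; it issues one DeleteShard RPC per shard against the nidx API (the import of get_nidx_api_client is not visible in this excerpt). A condensed sketch of the new loop:

from grpc import StatusCode
from grpc.aio import AioRpcError
from nucliadb_protos import noderesources_pb2

async def delete_kb_shards(nidx_api, shards_obj) -> None:
    for shard in shards_obj.shards:
        if not shard.nidx_shard_id:
            continue  # shard was never registered with nidx
        try:
            await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
        except AioRpcError as exc:
            if exc.code() == StatusCode.NOT_FOUND:
                continue  # already gone, nothing to do
            raise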
@@ -415,9 +414,16 @@ class KnowledgeBox:
             logger.exception("Error deleting slug")
 
     async def storage_delete_resource(self, uuid: str):
-        await self.storage.delete_resource(
-            self.kbid, uuid
-        )
+        if is_onprem_nucliadb():
+            await self.storage.delete_resource(self.kbid, uuid)
+        else:
+            # Deleting from storage can be slow, so we schedule its deletion and the purge cronjob
+            # will take care of it
+            await self.schedule_delete_resource(self.kbid, uuid)
+
+    async def schedule_delete_resource(self, kbid: str, uuid: str):
+        key = RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid)
+        await self.txn.set(key, b"")
 
     async def delete_resource(self, uuid: str):
         with processor_observer({"type": "delete_resource_maindb"}):
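On-prem deployments still delete resource files from storage synchronously; hosted deployments only write the RESOURCE_TO_DELETE_STORAGE marker and leave the slow storage deletion to a purge job (per the comment in the hunk and the +102 -12 change to nucliadb/purge/__init__.py). What the purge side might look like, as a sketch; the key-scanning and delete calls here are assumptions, not code from this diff:

async def purge_deleted_resource_storage(driver, storage):
    async with driver.transaction() as txn:
        # Hypothetical scan over the marker keys written above.
        async for key in txn.keys(match="/resourcestoragetodelete/"):
            # key shape: /resourcestoragetodelete/{kbid}/{uuid}
            _, _, kbid, uuid = key.split("/", 3)
            await storage.delete_resource(kbid, uuid)
            await txn.delete(key)
        await txn.commit()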
@@ -479,6 +485,12 @@ class KnowledgeBox:
             self.txn, kbid=self.kbid, vectorset_id=config.vectorset_id
         ):
             raise VectorSetConflict(f"Vectorset {config.vectorset_id} already exists")
+
+        # To ensure we always set the storage key kind, we overwrite it with the
+        # correct value. This whole enum business is to maintain bw/c with KBs
+        # pre-vectorsets, so any new vectorset should use the vectorset prefix
+        # key kind
+        config.storage_key_kind = knowledgebox_pb2.VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX
         await datamanagers.vectorsets.set(self.txn, kbid=self.kbid, config=config)
 
         # Remove the async deletion mark if it exists, just in case there was a previous deletion
@@ -491,11 +503,21 @@ class KnowledgeBox:
         await shard_manager.create_vectorset(self.kbid, config)
 
     async def delete_vectorset(self, vectorset_id: str):
-        await datamanagers.vectorsets.
+        vectorset_count = await datamanagers.vectorsets.count(self.txn, kbid=self.kbid)
+        if vectorset_count == 1:
+            raise VectorSetConflict("Deletion of your last vectorset is not allowed")
+
+        deleted = await datamanagers.vectorsets.delete(
+            self.txn, kbid=self.kbid, vectorset_id=vectorset_id
+        )
+        if deleted is None:
+            # already deleted
+            return
 
         # mark vectorset for async deletion
         deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
-
+        payload = VectorSetPurge(storage_key_kind=deleted.storage_key_kind)
+        await self.txn.set(deletion_mark_key, payload.SerializeToString())
 
         shard_manager = get_shard_manager()
         await shard_manager.delete_vectorset(self.kbid, vectorset_id)
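delete_vectorset now refuses to drop a KB's last vectorset, and the async-deletion mark carries a VectorSetPurge payload recording the deleted vectorset's storage key kind, so the purge task knows which key layout to clean. Reading the mark back might look like this (ParseFromString is standard protobuf; the txn.get call is an assumption based on the surrounding maindb usage):

from nucliadb_protos.knowledgebox_pb2 import VectorSetPurge

async def read_purge_mark(txn, deletion_mark_key: str) -> VectorSetPurge:
    purge = VectorSetPurge()
    raw = await txn.get(deletion_mark_key)  # assumed maindb Transaction API
    if raw is not None:
        purge.ParseFromString(raw)
    # purge.storage_key_kind distinguishes legacy flat keys from
    # vectorset-prefixed keys when deleting stored vectors.
    return purge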
nucliadb/ingest/orm/processor/data_augmentation.py
CHANGED

@@ -20,13 +20,13 @@
 
 import logging
 from dataclasses import dataclass, field
-from typing import Optional
+from typing import Optional
 
 from nucliadb.ingest.orm.resource import Resource
 from nucliadb.ingest.processing import ProcessingEngine, PushPayload, Source
 from nucliadb_models.text import PushTextFormat, Text
 from nucliadb_protos import resources_pb2, writer_pb2
-from nucliadb_protos.resources_pb2 import
+from nucliadb_protos.resources_pb2 import FieldType
 from nucliadb_utils.utilities import Utility, get_partitioning, get_utility
 
 logger = logging.getLogger("ingest-processor")
@@ -50,7 +50,7 @@ async def get_generated_fields(bm: writer_pb2.BrokerMessage, resource: Resource)
     ingest the processed thing later).
 
     Given a broker message and a resource, this function returns the list of
-    generated fields, that can be empty.
+    generated fields, that can be empty. It skips fields with errors.
 
     """
     generated_fields = GeneratedFields()
@@ -60,34 +60,12 @@ async def get_generated_fields(bm: writer_pb2.BrokerMessage, resource: Resource)
         return generated_fields
 
     # search all fields
-
-
-
-
-        fields = []
-    else:
-        fields = all_fields.fields
-
-    for field_id in bm.texts:
-        field = FieldID(field_type=FieldType.TEXT, field=field_id)
-        if field not in fields:
+    for field_id, text in bm.texts.items():
+        errors = [e for e in bm.errors if e.field_type == FieldType.TEXT and e.field == field_id]
+        has_error = len(errors) > 0
+        if text.generated_by.WhichOneof("author") == "data_augmentation" and not has_error:
             generated_fields.texts.append(field_id)
 
-    for field_id in bm.links:
-        field = FieldID(field_type=FieldType.LINK, field=field_id)
-        if field not in fields:
-            generated_fields.links.append(field_id)
-
-    for field_id in bm.files:
-        field = FieldID(field_type=FieldType.FILE, field=field_id)
-        if field not in fields:
-            generated_fields.files.append(field_id)
-
-    for field_id in bm.conversations:
-        field = FieldID(field_type=FieldType.CONVERSATION, field=field_id)
-        if field not in fields:
-            generated_fields.conversations.append(field_id)
-
     return generated_fields
 
 
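get_generated_fields no longer infers generation by diffing the broker message against the resource's stored fields; the message itself declares authorship, and errored fields are skipped. The rule for text fields, isolated into a standalone helper (a restatement of the hunk above, not new behavior):

from nucliadb_protos.resources_pb2 import FieldType

def generated_text_fields(bm) -> list[str]:
    generated = []
    for field_id, text in bm.texts.items():
        has_error = any(
            e.field_type == FieldType.TEXT and e.field == field_id
            for e in bm.errors
        )
        # Generated == authored by a data augmentation agent, with no errors.
        if text.generated_by.WhichOneof("author") == "data_augmentation" and not has_error:
            generated.append(field_id)
    return generated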
nucliadb/ingest/orm/processor/processor.py
CHANGED

@@ -275,7 +275,6 @@ class Processor:
 
         if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
             resource = await kb.get(uuid)
-
             if resource is None:
                 # It's a new resource
                 resource = await kb.add_resource(uuid, message.slug, message.basic)
@@ -737,7 +736,11 @@ def has_vectors_operation(index_message: PBBrainResource) -> bool:
     """
     Returns True if the index message has any vectors to index or to delete.
     """
-    if
+    if (
+        len(index_message.sentences_to_delete) > 0
+        or len(index_message.paragraphs_to_delete) > 0
+        or any([len(deletions.items) for deletions in index_message.vector_prefixes_to_delete.values()])
+    ):
         return True
     for field_paragraphs in index_message.paragraphs.values():
         for paragraph in field_paragraphs.paragraphs.values():
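The rewritten guard counts deletions as vector operations, with vector deletions now grouped per vectorset prefix. The same condition, annotated (a restatement of the hunk, not new behavior):

def has_pending_vector_deletions(index_message) -> bool:
    return (
        len(index_message.sentences_to_delete) > 0
        or len(index_message.paragraphs_to_delete) > 0
        # vector_prefixes_to_delete maps vectorset prefix -> deletion list,
        # so any non-empty list counts
        or any(len(d.items) > 0 for d in index_message.vector_prefixes_to_delete.values())
    )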