nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of nucliadb might be problematic.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/export_import/datamanager.py
CHANGED

@@ -54,7 +54,7 @@ class ExportImportDataManager:
 
     async def get_metadata(self, type: str, kbid: str, id: str) -> Metadata:
         key = self._get_maindb_metadata_key(type, kbid, id)
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             data = await txn.get(key)
             if data is None or data == b"":
                 raise MetadataNotFound()

@@ -89,13 +89,13 @@ class ExportImportDataManager:
         metadata.modified = datetime.now(timezone.utc)
         key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
         data = metadata.model_dump_json().encode("utf-8")
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.set(key, data)
             await txn.commit()
 
     async def delete_metadata(self, type: str, metadata: Metadata):
         key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(key)
             await txn.commit()
 
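The hunks above move ExportImportDataManager from the driver's generic transaction call to explicit read-only and read-write variants. A minimal sketch of that context-manager pattern, assuming an in-memory driver (only the ro_transaction/rw_transaction names come from the diff; everything else here is illustrative):

```python
# Sketch only: ro_transaction()/rw_transaction() are the method names introduced by
# the diff; this in-memory driver and its key/value API are illustrative assumptions.
import asyncio
from contextlib import asynccontextmanager
from typing import AsyncIterator, Dict, Optional


class Txn:
    def __init__(self, data: Dict[str, bytes], read_only: bool) -> None:
        self._data = data
        self._read_only = read_only
        self._pending: Dict[str, bytes] = {}

    async def get(self, key: str) -> Optional[bytes]:
        return self._pending.get(key, self._data.get(key))

    async def set(self, key: str, value: bytes) -> None:
        assert not self._read_only, "writes require rw_transaction()"
        self._pending[key] = value

    async def commit(self) -> None:
        # Writes only become visible after an explicit commit, as in the hunks above.
        self._data.update(self._pending)


class InMemoryDriver:
    def __init__(self) -> None:
        self._data: Dict[str, bytes] = {}

    @asynccontextmanager
    async def ro_transaction(self) -> AsyncIterator[Txn]:
        # Read-only transactions never need a commit.
        yield Txn(self._data, read_only=True)

    @asynccontextmanager
    async def rw_transaction(self) -> AsyncIterator[Txn]:
        yield Txn(self._data, read_only=False)


async def main() -> None:
    driver = InMemoryDriver()
    async with driver.rw_transaction() as txn:
        await txn.set("/kbs/demo/export/metadata", b"{}")
        await txn.commit()
    async with driver.ro_transaction() as txn:
        assert await txn.get("/kbs/demo/export/metadata") == b"{}"


asyncio.run(main())
```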
nucliadb/ingest/consumer/pull.py
CHANGED
@@ -31,6 +31,7 @@ from opentelemetry.trace import (
     Link,
 )
 
+from nucliadb.common.http_clients.exceptions import ServiceUnavailableException
 from nucliadb.common.http_clients.processing import (
     ProcessingHTTPClient,
     ProcessingPullMessageProgressUpdater,

@@ -209,6 +210,12 @@ class PullV2Worker:
                         payload_length = len(base64.b64decode(data.payload))
                         logger.error(f"Message too big for transaction: {payload_length}")
                     raise e
+
+            except ServiceUnavailableException as ex:
+                logger.warning(f"Processing api is unavailable, will retry shortly: {ex}")
+                await processing_http_client.reset_session()
+                await asyncio.sleep(self.pull_time_error_backoff)
+
             except Exception:
                 logger.exception("Unhandled error pulling messages from processing")
                 await asyncio.sleep(self.pull_time_error_backoff)
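The new except branch above makes the pull loop retry when the processing API is temporarily unavailable instead of logging an unhandled error. A self-contained sketch of that retry-with-backoff pattern (the exception name and reset_session() mirror the hunk; the stub client and backoff value are made up):

```python
# Sketch of the retry-on-unavailable behaviour added to the pull loop.
import asyncio


class ServiceUnavailableException(Exception):
    pass


class StubProcessingClient:
    def __init__(self) -> None:
        self.attempts = 0

    async def pull(self) -> str:
        # Fail the first two attempts to simulate a transient 503 from the service.
        self.attempts += 1
        if self.attempts < 3:
            raise ServiceUnavailableException("processing api returned 503")
        return "pulled message"

    async def reset_session(self) -> None:
        # Drop the HTTP session so the next attempt starts on a fresh connection.
        pass


async def pull_with_backoff(client: StubProcessingClient, backoff: float = 0.1) -> str:
    while True:
        try:
            return await client.pull()
        except ServiceUnavailableException as ex:
            print(f"Processing api is unavailable, will retry shortly: {ex}")
            await client.reset_session()
            await asyncio.sleep(backoff)


print(asyncio.run(pull_with_backoff(StubProcessingClient())))
```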
nucliadb/ingest/consumer/service.py
CHANGED

@@ -22,21 +22,18 @@ import sys
 from functools import partial
 from typing import Awaitable, Callable, Optional
 
-from nucliadb.common.back_pressure.materializer import BackPressureMaterializer
-from nucliadb.common.back_pressure.settings import settings as back_pressure_settings
 from nucliadb.common.maindb.utils import setup_driver
 from nucliadb.ingest import SERVICE_NAME, logger
 from nucliadb.ingest.consumer.consumer import IngestConsumer
 from nucliadb.ingest.consumer.pull import PullV2Worker
 from nucliadb.ingest.settings import settings
 from nucliadb_utils.exceptions import ConfigurationError
-from nucliadb_utils.settings import
+from nucliadb_utils.settings import transaction_settings
 from nucliadb_utils.utilities import (
     get_audit,
     get_nats_manager,
     get_pubsub,
     get_storage,
-    start_nats_manager,
 )
 
 from .auditing import IndexAuditHandler, ResourceWritesAuditHandler

@@ -57,27 +54,6 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
     await asyncio.gather(*tasks, return_exceptions=True)
 
 
-async def start_back_pressure() -> BackPressureMaterializer:
-    logger.info("Starting back pressure materializer")
-    nats_manager = await start_nats_manager(
-        SERVICE_NAME,
-        indexing_settings.index_jetstream_servers,
-        indexing_settings.index_jetstream_auth,
-    )
-    back_pressure = BackPressureMaterializer(
-        nats_manager,
-        indexing_check_interval=back_pressure_settings.indexing_check_interval,
-        ingest_check_interval=back_pressure_settings.ingest_check_interval,
-    )
-    await back_pressure.start()
-    return back_pressure
-
-
-async def stop_back_pressure(materializer: BackPressureMaterializer) -> None:
-    await materializer.stop()
-    await materializer.nats_manager.finalize()
-
-
 async def start_ingest_consumers(
     service_name: Optional[str] = None,
 ) -> Callable[[], Awaitable[None]]:

@@ -164,9 +140,8 @@ async def start_shard_creator() -> Callable[[], Awaitable[None]]:
     driver = await setup_driver()
     pubsub = await get_pubsub()
     assert pubsub is not None, "Pubsub is not configured"
-    storage = await get_storage(service_name=SERVICE_NAME)
 
-    shard_creator = ShardCreatorHandler(driver=driver,
+    shard_creator = ShardCreatorHandler(driver=driver, pubsub=pubsub)
     await shard_creator.initialize()
 
     return shard_creator.finalize
nucliadb/ingest/consumer/shard_creator.py
CHANGED

@@ -25,14 +25,14 @@ from typing import Any
 
 from nidx_protos import nodereader_pb2, noderesources_pb2
 
-from nucliadb.common import locking
+from nucliadb.common import datamanagers, locking
+from nucliadb.common.cluster.settings import settings
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.nidx import get_nidx_api_client
 from nucliadb_protos import writer_pb2
 from nucliadb_utils import const
 from nucliadb_utils.cache.pubsub import PubSubDriver
-from nucliadb_utils.storages.storage import Storage
 
 from . import metrics
 from .utils import DelayedTaskHandler

@@ -52,12 +52,10 @@ class ShardCreatorHandler:
         self,
         *,
         driver: Driver,
-        storage: Storage,
         pubsub: PubSubDriver,
         check_delay: float = 10.0,
     ):
         self.driver = driver
-        self.storage = storage
         self.pubsub = pubsub
         self.shard_manager = get_shard_manager()
         self.task_handler = DelayedTaskHandler(check_delay)

@@ -91,7 +89,7 @@ class ShardCreatorHandler:
     @metrics.handler_histo.wrap({"type": "shard_creator"})
     async def process_kb(self, kbid: str) -> None:
         logger.info({"message": "Processing notification for kbid", "kbid": kbid})
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             current_shard = await self.shard_manager.get_current_active_shard(txn, kbid)
 
         if current_shard is None:

@@ -111,4 +109,17 @@ class ShardCreatorHandler:
                 shard_id=noderesources_pb2.ShardId(id=current_shard.nidx_shard_id)
             )  # type: ignore
         )
-
+
+        if not should_create_new_shard(shard.paragraphs):
+            return
+
+        logger.info({"message": "Adding shard", "kbid": kbid})
+        async with datamanagers.with_rw_transaction() as txn:
+            kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
+            prewarm = kb_config is not None and kb_config.prewarm_enabled
+            await self.shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm)
+            await txn.commit()
+
+
+def should_create_new_shard(num_paragraphs: int) -> bool:
+    return num_paragraphs > settings.max_shard_paragraphs
nucliadb/ingest/fields/base.py
CHANGED
@@ -29,6 +29,7 @@ from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
 from google.protobuf.message import DecodeError, Message
 
 from nucliadb.common import datamanagers
+from nucliadb.common.ids import FieldId
 from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (

@@ -46,10 +47,8 @@ from nucliadb_protos.resources_pb2 import (
 )
 from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
 from nucliadb_protos.writer_pb2 import Error, FieldStatus
-from nucliadb_utils import const
 from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
 from nucliadb_utils.storages.storage import Storage, StorageField
-from nucliadb_utils.utilities import has_feature
 
 logger = logging.getLogger(__name__)
 

@@ -125,6 +124,14 @@ class Field(Generic[PbType]):
     def uuid(self) -> str:
         return self.resource.uuid
 
+    @property
+    def field_id(self) -> FieldId:
+        return FieldId(
+            rid=self.resource.uuid,
+            type=self.type,
+            key=self.id,
+        )
+
     @property
     def storage(self) -> Storage:
         return self.resource.storage

@@ -215,21 +222,6 @@ class Field(Generic[PbType]):
     ) -> None:
         # Try delete vectors
         sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
-
-        if has_feature(const.Features.DEBUG_MISSING_VECTORS):
-            # This is a very chatty log. It is just a temporary hint while debugging an issue.
-            logger.info(
-                "Deleting vectors from storage",
-                extra={
-                    "kbid": self.kbid,
-                    "rid": self.resource.uuid,
-                    "field": f"{self.type}/{self.id}",
-                    "vectorset": vectorset,
-                    "storage_key_kind": storage_key_kind,
-                    "key": sf.key,
-                    "bucket": sf.bucket,
-                },
-            )
         try:
             await self.storage.delete_upload(sf.key, sf.bucket)
         except KeyError:
nucliadb/ingest/fields/conversation.py
CHANGED

@@ -21,13 +21,16 @@ import uuid
 from typing import Any, Optional
 
 from nucliadb.ingest.fields.base import Field
-from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
+from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitMetadata, SplitsMetadata
 from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField
 
+MAX_CONVERSATION_MESSAGES = 50 * 1024
+
 PAGE_SIZE = 200
 
 CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_SPLITS_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/splits_metadata"
 CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
 
 

@@ -52,9 +55,22 @@ class Conversation(Field[PBConversation]):
     ):
         super(Conversation, self).__init__(id, resource, pb, value)
         self.value = {}
+        self._splits_metadata: Optional[SplitsMetadata] = None
+        self.metadata = None
+
+    async def delete_value(self):
+        await self.resource.txn.delete_by_prefix(
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id)
+        )
+        self._split_metadata = None
         self.metadata = None
+        self.value.clear()
 
     async def set_value(self, payload: PBConversation):
+        if payload.replace_field:
+            # As we need to overwrite the value of the conversation, first delete any previous data.
+            await self.delete_value()
+
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy

@@ -70,10 +86,13 @@ class Conversation(Field[PBConversation]):
             last_page = PBConversation()
             metadata.pages += 1
 
+        self._splits_metadata = await self.get_splits_metadata()
+
         # Make sure message attachment files are on our region. This is needed
         # to support the hybrid-onprem deployment as the attachments must be stored
         # at the storage services of the client's premises.
         for message in payload.messages:
+            self._splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
             new_message_files = []
             for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):

@@ -117,6 +136,7 @@ class Conversation(Field[PBConversation]):
 
         # Finally, set the metadata
         await self.db_set_metadata(metadata)
+        await self.set_splits_metadata(self._splits_metadata)
 
     async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
         # If no page was requested, force fetch of metadata

@@ -203,3 +223,29 @@ class Conversation(Field[PBConversation]):
         self.metadata = payload
         self.resource.modified = True
         self._created = False
+
+    async def get_splits_metadata(self) -> SplitsMetadata:
+        if self._splits_metadata is None:
+            field_key = CONVERSATION_SPLITS_METADATA.format(
+                kbid=self.kbid,
+                uuid=self.uuid,
+                type=self.type,
+                field=self.id,
+            )
+            payload = await self.resource.txn.get(field_key)
+            if payload is None:
+                return SplitsMetadata()
+            self._splits_metadata = SplitsMetadata()
+            self._splits_metadata.ParseFromString(payload)
+        return self._splits_metadata
+
+    async def set_splits_metadata(self, payload: SplitsMetadata) -> None:
+        key = CONVERSATION_SPLITS_METADATA.format(
+            kbid=self.kbid,
+            uuid=self.uuid,
+            type=self.type,
+            field=self.id,
+        )
+        await self.resource.txn.set(key, payload.SerializeToString())
+        self._split_metadata = payload
+        self.resource.modified = True
nucliadb/ingest/orm/brain_v2.py
CHANGED
@@ -193,7 +193,7 @@ class ResourceBrain:
         if field_author is not None and field_author.WhichOneof("author") == "data_augmentation":
             field_type, field_id = field_key.split("/")
             da_task_id = ids.extract_data_augmentation_id(field_id)
-            if da_task_id is None:  # pragma:
+            if da_task_id is None:  # pragma: no cover
                 logger.warning(
                     "Data augmentation field id has an unexpected format! Skipping label",
                     extra={

@@ -217,6 +217,7 @@ class ResourceBrain:
         replace_field: bool,
         skip_paragraphs_index: Optional[bool],
         skip_texts_index: Optional[bool],
+        append_splits: Optional[set[str]] = None,
     ) -> None:
         # We need to add the extracted text to the texts section of the Resource so that
         # the paragraphs can be indexed

@@ -234,6 +235,7 @@ class ResourceBrain:
             user_field_metadata,
             replace_field=replace_field,
             skip_paragraphs=skip_paragraphs_index,
+            append_splits=append_splits,
         )
 
     @observer.wrap({"type": "apply_field_paragraphs"})

@@ -246,6 +248,7 @@ class ResourceBrain:
         user_field_metadata: Optional[UserFieldMetadata],
         replace_field: bool,
         skip_paragraphs: Optional[bool],
+        append_splits: Optional[set[str]] = None,
     ) -> None:
         if skip_paragraphs is not None:
             self.brain.skip_paragraphs = skip_paragraphs

@@ -254,7 +257,12 @@ class ResourceBrain:
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         # Splits of the field
         for subfield, field_metadata in field_computed_metadata.split_metadata.items():
-
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
+            if subfield not in extracted_text.split_text:
+                # No extracted text for this split
+                continue
+            extracted_text_str = extracted_text.split_text[subfield]
             for idx, paragraph in enumerate(field_metadata.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                 denied_classifications = set(user_paragraph_classifications.denied.get(key, []))

@@ -308,7 +316,7 @@ class ResourceBrain:
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
         # Main field
-        extracted_text_str = extracted_text.text
+        extracted_text_str = extracted_text.text
         for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
             denied_classifications = set(user_paragraph_classifications.denied.get(key, []))

@@ -496,9 +504,12 @@ class ResourceBrain:
         replace_field: bool = False,
         # cut to specific dimension if specified
         vector_dimension: Optional[int] = None,
+        append_splits: Optional[set[str]] = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            if should_skip_split_indexing(subfield, replace_field, append_splits):
+                continue
             _field_id = ids.FieldId(
                 rid=fid.rid,
                 type=fid.type,

@@ -792,3 +803,10 @@ class ParagraphPages:
         if len(self._materialized) > 0:
             return self._materialized[-1]
         return 0
+
+
+def should_skip_split_indexing(
+    split: str, replace_field: bool, append_splits: Optional[set[str]]
+) -> bool:
+    # When replacing the whole field, reindex all splits. Otherwise, we're only indexing the splits that are appended
+    return not replace_field and append_splits is not None and split not in append_splits