nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/common/maindb/pg.py
CHANGED
|
@@ -21,8 +21,9 @@ from __future__ import annotations
|
|
|
21
21
|
|
|
22
22
|
import asyncio
|
|
23
23
|
import logging
|
|
24
|
+
from collections.abc import AsyncGenerator
|
|
24
25
|
from contextlib import asynccontextmanager
|
|
25
|
-
from typing import Any
|
|
26
|
+
from typing import Any
|
|
26
27
|
|
|
27
28
|
import backoff
|
|
28
29
|
import psycopg
|
|
@@ -72,7 +73,7 @@ class DataLayer:
|
|
|
72
73
|
self.connection = connection
|
|
73
74
|
self.log_on_select_for_update = settings.driver_pg_log_on_select_for_update
|
|
74
75
|
|
|
75
|
-
async def get(self, key: str, select_for_update: bool = False) ->
|
|
76
|
+
async def get(self, key: str, select_for_update: bool = False) -> bytes | None:
|
|
76
77
|
with pg_observer({"type": "get"}):
|
|
77
78
|
statement = "SELECT value FROM resources WHERE key = %s"
|
|
78
79
|
if select_for_update:
|
|
@@ -116,7 +117,7 @@ class DataLayer:
|
|
|
116
117
|
async with self.connection.cursor() as cur:
|
|
117
118
|
await cur.execute("DELETE FROM resources WHERE key LIKE %s", (prefix + "%",))
|
|
118
119
|
|
|
119
|
-
async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[
|
|
120
|
+
async def batch_get(self, keys: list[str], select_for_update: bool = False) -> list[bytes | None]:
|
|
120
121
|
with pg_observer({"type": "batch_get"}):
|
|
121
122
|
async with self.connection.cursor() as cur:
|
|
122
123
|
statement = "SELECT key, value FROM resources WHERE key = ANY(%s)"
|
|
@@ -134,7 +135,7 @@ class DataLayer:
|
|
|
134
135
|
prefix: str,
|
|
135
136
|
limit: int = DEFAULT_SCAN_LIMIT,
|
|
136
137
|
include_start: bool = True,
|
|
137
|
-
) -> AsyncGenerator[str
|
|
138
|
+
) -> AsyncGenerator[str]:
|
|
138
139
|
query = "SELECT key FROM resources WHERE key LIKE %s ORDER BY key"
|
|
139
140
|
|
|
140
141
|
args: list[Any] = [prefix + "%"]
|
|
@@ -190,7 +191,7 @@ class PGTransaction(Transaction):
|
|
|
190
191
|
async def batch_get(self, keys: list[str], for_update: bool = True):
|
|
191
192
|
return await self.data_layer.batch_get(keys, select_for_update=for_update)
|
|
192
193
|
|
|
193
|
-
async def get(self, key: str, for_update: bool = True) ->
|
|
194
|
+
async def get(self, key: str, for_update: bool = True) -> bytes | None:
|
|
194
195
|
return await self.data_layer.get(key, select_for_update=for_update)
|
|
195
196
|
|
|
196
197
|
async def set(self, key: str, value: bytes):
|
|
@@ -243,7 +244,7 @@ class ReadOnlyPGTransaction(Transaction):
|
|
|
243
244
|
return await DataLayer(conn).batch_get(keys, select_for_update=False)
|
|
244
245
|
|
|
245
246
|
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
|
246
|
-
async def get(self, key: str, for_update: bool = False) ->
|
|
247
|
+
async def get(self, key: str, for_update: bool = False) -> bytes | None:
|
|
247
248
|
async with self.driver._get_connection() as conn:
|
|
248
249
|
return await DataLayer(conn).get(key, select_for_update=False)
|
|
249
250
|
|
|
@@ -330,7 +331,7 @@ class PGDriver(Driver):
|
|
|
330
331
|
metric.set(value)
|
|
331
332
|
|
|
332
333
|
@asynccontextmanager
|
|
333
|
-
async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction
|
|
334
|
+
async def _transaction(self, *, read_only: bool) -> AsyncGenerator[Transaction]:
|
|
334
335
|
if read_only:
|
|
335
336
|
yield ReadOnlyPGTransaction(self)
|
|
336
337
|
else:
|
|
@@ -343,7 +344,7 @@ class PGDriver(Driver):
|
|
|
343
344
|
await txn.abort()
|
|
344
345
|
|
|
345
346
|
@asynccontextmanager
|
|
346
|
-
async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection
|
|
347
|
+
async def _get_connection(self) -> AsyncGenerator[psycopg.AsyncConnection]:
|
|
347
348
|
timeout = self.acquire_timeout_ms / 1000
|
|
348
349
|
# Manual retry loop since backoff.on_exception does not play well with async context managers
|
|
349
350
|
retries = 0
|
nucliadb/common/nidx.py
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
|
|
21
21
|
import os
|
|
22
|
-
from typing import Optional, Union
|
|
23
22
|
|
|
24
23
|
from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub
|
|
25
24
|
from nidx_protos.nodewriter_pb2 import (
|
|
@@ -54,7 +53,7 @@ class NidxUtility:
|
|
|
54
53
|
pass
|
|
55
54
|
|
|
56
55
|
|
|
57
|
-
def _storage_config(prefix: str, bucket:
|
|
56
|
+
def _storage_config(prefix: str, bucket: str | None) -> dict[str, str]:
|
|
58
57
|
config = {}
|
|
59
58
|
if storage_settings.file_backend == FileBackendConfig.LOCAL:
|
|
60
59
|
local_bucket = bucket or storage_settings.local_indexing_bucket
|
|
@@ -161,7 +160,7 @@ class NidxNatsIndexer:
|
|
|
161
160
|
async def index(self, writer: IndexMessage) -> int:
|
|
162
161
|
res = await self.nats_connection_manager.js.publish(self.subject, writer.SerializeToString())
|
|
163
162
|
logger.info(
|
|
164
|
-
f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
|
|
163
|
+
f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"
|
|
165
164
|
)
|
|
166
165
|
return res.seq
|
|
167
166
|
|
|
@@ -185,7 +184,7 @@ class NidxGrpcIndexer:
|
|
|
185
184
|
class NidxServiceUtility(NidxUtility):
|
|
186
185
|
"""Implements Nidx utility connecting to the network service"""
|
|
187
186
|
|
|
188
|
-
indexer:
|
|
187
|
+
indexer: NidxNatsIndexer | NidxGrpcIndexer
|
|
189
188
|
|
|
190
189
|
def __init__(self, service_name: str):
|
|
191
190
|
self.service_name = service_name
|
|
@@ -18,8 +18,9 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import json
|
|
21
|
+
from collections.abc import AsyncGenerator
|
|
21
22
|
from datetime import datetime, timezone
|
|
22
|
-
from typing import
|
|
23
|
+
from typing import Type, cast
|
|
23
24
|
|
|
24
25
|
from nucliadb.common.maindb.driver import Driver
|
|
25
26
|
from nucliadb.export_import import logger
|
|
@@ -34,7 +35,7 @@ MAINDB_IMPORT_KEY = "/kbs/{kbid}/imports/{id}"
|
|
|
34
35
|
STORAGE_EXPORT_KEY = "exports/{export_id}"
|
|
35
36
|
STORAGE_IMPORT_KEY = "imports/{import_id}"
|
|
36
37
|
|
|
37
|
-
Metadata =
|
|
38
|
+
Metadata = ExportMetadata | ImportMetadata
|
|
38
39
|
|
|
39
40
|
|
|
40
41
|
class ExportImportDataManager:
|
|
@@ -59,7 +60,7 @@ class ExportImportDataManager:
|
|
|
59
60
|
if data is None or data == b"":
|
|
60
61
|
raise MetadataNotFound()
|
|
61
62
|
decoded = data.decode("utf-8")
|
|
62
|
-
model_type:
|
|
63
|
+
model_type: Type[ExportMetadata] | Type[ImportMetadata]
|
|
63
64
|
if type == "export":
|
|
64
65
|
model_type = ExportMetadata
|
|
65
66
|
elif type == "import":
|
|
@@ -18,11 +18,12 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
|
|
21
|
-
from
|
|
21
|
+
from collections.abc import AsyncGenerator
|
|
22
22
|
|
|
23
23
|
from nucliadb.common.context import ApplicationContext
|
|
24
24
|
from nucliadb.export_import import logger
|
|
25
25
|
from nucliadb.export_import.datamanager import ExportImportDataManager
|
|
26
|
+
from nucliadb.export_import.exceptions import MetadataNotFound
|
|
26
27
|
from nucliadb.export_import.models import (
|
|
27
28
|
ExportedItemType,
|
|
28
29
|
ExportMetadata,
|
|
@@ -33,7 +34,6 @@ from nucliadb.export_import.utils import (
|
|
|
33
34
|
download_binary,
|
|
34
35
|
get_broker_message,
|
|
35
36
|
get_cloud_files,
|
|
36
|
-
get_entities,
|
|
37
37
|
get_labels,
|
|
38
38
|
get_learning_config,
|
|
39
39
|
iter_kb_resource_uuids,
|
|
@@ -43,7 +43,7 @@ from nucliadb_telemetry import errors
|
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
async def export_kb(
|
|
46
|
-
context: ApplicationContext, kbid: str, metadata:
|
|
46
|
+
context: ApplicationContext, kbid: str, metadata: ExportMetadata | None = None
|
|
47
47
|
) -> AsyncGenerator[bytes, None]:
|
|
48
48
|
"""Export the data of a knowledgebox to a stream of bytes.
|
|
49
49
|
|
|
@@ -63,9 +63,6 @@ async def export_kb(
|
|
|
63
63
|
async for chunk in resources_iterator:
|
|
64
64
|
yield chunk
|
|
65
65
|
|
|
66
|
-
async for chunk in export_entities(context, kbid):
|
|
67
|
-
yield chunk
|
|
68
|
-
|
|
69
66
|
async for chunk in export_labels(context, kbid):
|
|
70
67
|
yield chunk
|
|
71
68
|
|
|
@@ -76,7 +73,14 @@ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMe
|
|
|
76
73
|
"""
|
|
77
74
|
kbid, export_id = msg.kbid, msg.id
|
|
78
75
|
dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
|
|
79
|
-
|
|
76
|
+
try:
|
|
77
|
+
metadata = await dm.get_export_metadata(kbid=kbid, id=export_id)
|
|
78
|
+
except MetadataNotFound: # pragma: no cover
|
|
79
|
+
logger.error(
|
|
80
|
+
"Export metadata not found. Skipping export.", extra={"kbid": kbid, "export_id": export_id}
|
|
81
|
+
)
|
|
82
|
+
return
|
|
83
|
+
|
|
80
84
|
iterator = export_kb(context, kbid, metadata)
|
|
81
85
|
|
|
82
86
|
retry_handler = TaskRetryHandler("export", dm, metadata)
|
|
@@ -167,18 +171,6 @@ async def export_resource_with_binaries(
|
|
|
167
171
|
yield bm_bytes
|
|
168
172
|
|
|
169
173
|
|
|
170
|
-
async def export_entities(
|
|
171
|
-
context: ApplicationContext,
|
|
172
|
-
kbid: str,
|
|
173
|
-
) -> AsyncGenerator[bytes, None]:
|
|
174
|
-
entities = await get_entities(context, kbid)
|
|
175
|
-
if len(entities.entities_groups) > 0:
|
|
176
|
-
data = entities.SerializeToString()
|
|
177
|
-
yield ExportedItemType.ENTITIES.encode("utf-8")
|
|
178
|
-
yield len(data).to_bytes(4, byteorder="big")
|
|
179
|
-
yield data
|
|
180
|
-
|
|
181
|
-
|
|
182
174
|
async def export_labels(
|
|
183
175
|
context: ApplicationContext,
|
|
184
176
|
kbid: str,
|
|
@@ -17,11 +17,13 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from
|
|
20
|
+
from collections.abc import AsyncGenerator, Callable
|
|
21
|
+
from typing import cast
|
|
21
22
|
|
|
22
23
|
from nucliadb.common.context import ApplicationContext
|
|
23
24
|
from nucliadb.export_import import logger
|
|
24
25
|
from nucliadb.export_import.datamanager import ExportImportDataManager
|
|
26
|
+
from nucliadb.export_import.exceptions import MetadataNotFound
|
|
25
27
|
from nucliadb.export_import.models import (
|
|
26
28
|
ExportedItemType,
|
|
27
29
|
ImportMetadata,
|
|
@@ -32,7 +34,6 @@ from nucliadb.export_import.utils import (
|
|
|
32
34
|
TaskRetryHandler,
|
|
33
35
|
import_binary,
|
|
34
36
|
restore_broker_message,
|
|
35
|
-
set_entities_groups,
|
|
36
37
|
set_labels,
|
|
37
38
|
)
|
|
38
39
|
from nucliadb_protos import knowledgebox_pb2 as kb_pb2
|
|
@@ -46,7 +47,7 @@ async def import_kb(
|
|
|
46
47
|
context: ApplicationContext,
|
|
47
48
|
kbid: str,
|
|
48
49
|
stream: AsyncGenerator[bytes, None],
|
|
49
|
-
metadata:
|
|
50
|
+
metadata: ImportMetadata | None = None,
|
|
50
51
|
) -> None:
|
|
51
52
|
"""
|
|
52
53
|
Imports exported data from a stream into a knowledgebox.
|
|
@@ -72,8 +73,8 @@ async def import_kb(
|
|
|
72
73
|
await import_binary(context, kbid, cf, binary_generator)
|
|
73
74
|
|
|
74
75
|
elif item_type == ExportedItemType.ENTITIES:
|
|
75
|
-
|
|
76
|
-
|
|
76
|
+
# This is not supported anymore, we ignore it if we find it in an old backup
|
|
77
|
+
pass
|
|
77
78
|
|
|
78
79
|
elif item_type == ExportedItemType.LABELS:
|
|
79
80
|
labels = cast(kb_pb2.Labels, data)
|
|
@@ -99,7 +100,13 @@ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTask
|
|
|
99
100
|
"""
|
|
100
101
|
kbid, import_id = msg.kbid, msg.id
|
|
101
102
|
dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
|
|
102
|
-
|
|
103
|
+
try:
|
|
104
|
+
metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
|
|
105
|
+
except MetadataNotFound: # pragma: no cover
|
|
106
|
+
logger.error(
|
|
107
|
+
"Import metadata not found. Skipping import.", extra={"kbid": kbid, "import_id": import_id}
|
|
108
|
+
)
|
|
109
|
+
return
|
|
103
110
|
|
|
104
111
|
retry_handler = TaskRetryHandler("import", dm, metadata)
|
|
105
112
|
|
nucliadb/export_import/tasks.py
CHANGED
|
@@ -56,6 +56,7 @@ def get_exports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
|
|
|
56
56
|
callback=export_kb_to_blob_storage,
|
|
57
57
|
msg_type=NatsTaskMessage,
|
|
58
58
|
max_concurrent_messages=10,
|
|
59
|
+
max_retries=100,
|
|
59
60
|
)
|
|
60
61
|
|
|
61
62
|
|
|
@@ -77,6 +78,7 @@ def get_imports_consumer() -> NatsTaskConsumer[NatsTaskMessage]:
|
|
|
77
78
|
callback=import_kb_from_blob_storage,
|
|
78
79
|
msg_type=NatsTaskMessage,
|
|
79
80
|
max_concurrent_messages=10,
|
|
81
|
+
max_retries=100,
|
|
80
82
|
)
|
|
81
83
|
|
|
82
84
|
|
nucliadb/export_import/utils.py
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import functools
|
|
21
|
-
from
|
|
21
|
+
from collections.abc import AsyncGenerator, AsyncIterator, Callable
|
|
22
22
|
|
|
23
23
|
import backoff
|
|
24
24
|
from google.protobuf.message import DecodeError as ProtobufDecodeError
|
|
@@ -35,6 +35,7 @@ from nucliadb.export_import.exceptions import (
|
|
|
35
35
|
)
|
|
36
36
|
from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
|
|
37
37
|
from nucliadb.ingest.orm.broker_message import generate_broker_message
|
|
38
|
+
from nucliadb.ingest.orm.resource import Resource
|
|
38
39
|
from nucliadb_models.configuration import SearchConfiguration
|
|
39
40
|
from nucliadb_models.export_import import Status
|
|
40
41
|
from nucliadb_protos import knowledgebox_pb2 as kb_pb2
|
|
@@ -171,14 +172,6 @@ async def import_binary(
|
|
|
171
172
|
)
|
|
172
173
|
|
|
173
174
|
|
|
174
|
-
async def set_entities_groups(
|
|
175
|
-
context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
|
|
176
|
-
) -> None:
|
|
177
|
-
async with datamanagers.with_transaction() as txn:
|
|
178
|
-
await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
|
|
179
|
-
await txn.commit()
|
|
180
|
-
|
|
181
|
-
|
|
182
175
|
async def set_synonyms(context: ApplicationContext, kbid: str, synonyms: kb_pb2.Synonyms) -> None:
|
|
183
176
|
async with datamanagers.with_transaction() as txn:
|
|
184
177
|
await datamanagers.synonyms.set(txn, kbid=kbid, synonyms=synonyms)
|
|
@@ -207,9 +200,9 @@ async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> Asyn
|
|
|
207
200
|
|
|
208
201
|
async def get_broker_message(
|
|
209
202
|
context: ApplicationContext, kbid: str, rid: str
|
|
210
|
-
) ->
|
|
203
|
+
) -> writer_pb2.BrokerMessage | None:
|
|
211
204
|
async with datamanagers.with_ro_transaction() as txn:
|
|
212
|
-
resource = await
|
|
205
|
+
resource = await Resource.get(txn, kbid=kbid, rid=rid)
|
|
213
206
|
if resource is None:
|
|
214
207
|
return None
|
|
215
208
|
resource.disable_vectors = False
|
|
@@ -284,11 +277,6 @@ async def download_binary(
|
|
|
284
277
|
assert downloaded_bytes == cf.size, "Downloaded bytes do not match the expected size"
|
|
285
278
|
|
|
286
279
|
|
|
287
|
-
async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
|
|
288
|
-
async with datamanagers.with_ro_transaction() as txn:
|
|
289
|
-
return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
|
|
290
|
-
|
|
291
|
-
|
|
292
280
|
async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
|
|
293
281
|
async with datamanagers.with_ro_transaction() as txn:
|
|
294
282
|
return await datamanagers.labels.get_labels(txn, kbid=kbid)
|
|
@@ -434,7 +422,7 @@ class ExportStreamReader:
|
|
|
434
422
|
|
|
435
423
|
async def maybe_read_learning_config(
|
|
436
424
|
self,
|
|
437
|
-
) -> tuple[
|
|
425
|
+
) -> tuple[learning_proxy.LearningConfiguration | None, bytes]:
|
|
438
426
|
"""
|
|
439
427
|
Tries to read a learning config from the beginning of the stream.
|
|
440
428
|
Returns the learning config if found. It also returns any leftover bytes that
|
|
@@ -533,7 +521,7 @@ class TaskRetryHandler:
|
|
|
533
521
|
|
|
534
522
|
async def get_learning_config(
|
|
535
523
|
kbid: str,
|
|
536
|
-
) ->
|
|
524
|
+
) -> learning_proxy.LearningConfiguration | None:
|
|
537
525
|
return await learning_proxy.get_configuration(kbid)
|
|
538
526
|
|
|
539
527
|
|
nucliadb/health.py
CHANGED
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
import logging
|
|
22
|
-
from
|
|
22
|
+
from collections.abc import Awaitable, Callable
|
|
23
23
|
|
|
24
24
|
from grpc import aio
|
|
25
25
|
from grpc_health.v1 import health, health_pb2, health_pb2_grpc
|
|
@@ -41,7 +41,7 @@ def nats_manager_healthy() -> bool:
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
def pubsub_check() -> bool:
|
|
44
|
-
driver:
|
|
44
|
+
driver: PubSubDriver | None = get_utility(Utility.PUBSUB)
|
|
45
45
|
if driver is None:
|
|
46
46
|
return True
|
|
47
47
|
if isinstance(driver, NatsPubsub):
|
nucliadb/ingest/app.py
CHANGED
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
import importlib.metadata
|
|
22
|
-
from
|
|
22
|
+
from collections.abc import Awaitable, Callable
|
|
23
23
|
|
|
24
24
|
from nucliadb import health
|
|
25
25
|
from nucliadb.backups.tasks import initialize_consumers as initialize_backup_consumers
|
|
@@ -96,7 +96,7 @@ async def initialize_grpc(): # pragma: no cover
|
|
|
96
96
|
finalizers = await initialize()
|
|
97
97
|
grpc_finalizer = await start_grpc(SERVICE_NAME)
|
|
98
98
|
|
|
99
|
-
return [grpc_finalizer
|
|
99
|
+
return [grpc_finalizer, *finalizers]
|
|
100
100
|
|
|
101
101
|
|
|
102
102
|
async def initialize_pull_workers() -> list[Callable[[], Awaitable[None]]]:
|
|
@@ -114,14 +114,14 @@ async def main_consumer(): # pragma: no cover
|
|
|
114
114
|
|
|
115
115
|
ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
|
|
116
116
|
|
|
117
|
-
await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown
|
|
117
|
+
await run_until_exit([grpc_health_finalizer, ingest_consumers, metrics_server.shutdown, *finalizers])
|
|
118
118
|
|
|
119
119
|
|
|
120
120
|
async def main_orm_grpc(): # pragma: no cover
|
|
121
121
|
finalizers = await initialize()
|
|
122
122
|
grpc_finalizer = await start_grpc(SERVICE_NAME)
|
|
123
123
|
metrics_server = await serve_metrics()
|
|
124
|
-
await run_until_exit([grpc_finalizer, metrics_server.shutdown
|
|
124
|
+
await run_until_exit([grpc_finalizer, metrics_server.shutdown, *finalizers])
|
|
125
125
|
|
|
126
126
|
|
|
127
127
|
async def main_ingest_processed_consumer(): # pragma: no cover
|
|
@@ -134,7 +134,7 @@ async def main_ingest_processed_consumer(): # pragma: no cover
|
|
|
134
134
|
consumer = await consumer_service.start_ingest_processed_consumer_v2(SERVICE_NAME)
|
|
135
135
|
|
|
136
136
|
await run_until_exit(
|
|
137
|
-
[grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine
|
|
137
|
+
[grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine, *finalizers]
|
|
138
138
|
)
|
|
139
139
|
|
|
140
140
|
|
|
@@ -158,8 +158,8 @@ async def main_subscriber_workers(): # pragma: no cover
|
|
|
158
158
|
backup_consumers_finalizers = await initialize_backup_consumers(context)
|
|
159
159
|
|
|
160
160
|
await run_until_exit(
|
|
161
|
-
|
|
162
|
-
|
|
161
|
+
[
|
|
162
|
+
*backup_consumers_finalizers,
|
|
163
163
|
imports_consumer.finalize,
|
|
164
164
|
exports_consumer.finalize,
|
|
165
165
|
stop_ingest_utility,
|
|
@@ -169,8 +169,8 @@ async def main_subscriber_workers(): # pragma: no cover
|
|
|
169
169
|
grpc_health_finalizer,
|
|
170
170
|
metrics_server.shutdown,
|
|
171
171
|
context.finalize,
|
|
172
|
+
*finalizers,
|
|
172
173
|
]
|
|
173
|
-
+ finalizers
|
|
174
174
|
)
|
|
175
175
|
|
|
176
176
|
|
|
@@ -20,12 +20,10 @@
|
|
|
20
20
|
import asyncio
|
|
21
21
|
import logging
|
|
22
22
|
import time
|
|
23
|
-
from typing import Optional, Union
|
|
24
23
|
|
|
25
24
|
import backoff
|
|
26
25
|
import nats
|
|
27
26
|
import nats.js.api
|
|
28
|
-
import nats.js.errors
|
|
29
27
|
from nats.aio.client import Msg
|
|
30
28
|
from nats.js import JetStreamContext
|
|
31
29
|
|
|
@@ -74,8 +72,8 @@ class IngestConsumer:
|
|
|
74
72
|
partition: str,
|
|
75
73
|
storage: Storage,
|
|
76
74
|
nats_connection_manager: NatsConnectionManager,
|
|
77
|
-
pubsub:
|
|
78
|
-
lock:
|
|
75
|
+
pubsub: PubSubDriver | None = None,
|
|
76
|
+
lock: asyncio.Lock | asyncio.Semaphore | None = None,
|
|
79
77
|
):
|
|
80
78
|
self.driver = driver
|
|
81
79
|
self.partition = partition
|
|
@@ -85,9 +83,9 @@ class IngestConsumer:
|
|
|
85
83
|
|
|
86
84
|
self.lock = lock or asyncio.Lock()
|
|
87
85
|
self.processor = Processor(driver, storage, pubsub, partition)
|
|
88
|
-
self.subscription:
|
|
86
|
+
self.subscription: JetStreamContext.PullSubscription | None = None
|
|
89
87
|
|
|
90
|
-
async def ack_message(self, msg: Msg, kbid:
|
|
88
|
+
async def ack_message(self, msg: Msg, kbid: str | None = None):
|
|
91
89
|
await msg.ack()
|
|
92
90
|
|
|
93
91
|
async def initialize(self):
|
|
@@ -162,7 +160,7 @@ class IngestConsumer:
|
|
|
162
160
|
async def subscription_worker(self, msg: Msg):
|
|
163
161
|
context.clear_context()
|
|
164
162
|
|
|
165
|
-
kbid:
|
|
163
|
+
kbid: str | None = None
|
|
166
164
|
subject = msg.subject
|
|
167
165
|
reply = msg.reply
|
|
168
166
|
seqid = int(reply.split(".")[5])
|
|
@@ -238,7 +236,7 @@ class IngestConsumer:
|
|
|
238
236
|
logger.info(
|
|
239
237
|
f"An error happend while processing a message from {message_source}. "
|
|
240
238
|
f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
|
|
241
|
-
f"Check sentry for more details: {
|
|
239
|
+
f"Check sentry for more details: {e!s}"
|
|
242
240
|
)
|
|
243
241
|
await self.ack_message(msg, kbid)
|
|
244
242
|
logger.info("Message acked because of deadletter", extra={"seqid": seqid})
|
|
@@ -250,7 +248,7 @@ class IngestConsumer:
|
|
|
250
248
|
logger.info(
|
|
251
249
|
f"An error happend while processing a message from {message_source}. "
|
|
252
250
|
f"This message has been dropped and won't be retried again"
|
|
253
|
-
f"Check sentry for more details: {
|
|
251
|
+
f"Check sentry for more details: {e!s}"
|
|
254
252
|
)
|
|
255
253
|
await self.ack_message(msg, kbid)
|
|
256
254
|
logger.info("Message acked because of drop", extra={"seqid": seqid})
|
|
@@ -260,7 +258,7 @@ class IngestConsumer:
|
|
|
260
258
|
logger.exception(
|
|
261
259
|
f"An error happend while processing a message from {message_source}. "
|
|
262
260
|
"Message has not been ACKd and will be retried. "
|
|
263
|
-
f"Check sentry for more details: {
|
|
261
|
+
f"Check sentry for more details: {e!s}"
|
|
264
262
|
)
|
|
265
263
|
await msg.nak()
|
|
266
264
|
logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
|
nucliadb/ingest/consumer/pull.py
CHANGED
|
@@ -21,7 +21,6 @@ import asyncio
|
|
|
21
21
|
import base64
|
|
22
22
|
import time
|
|
23
23
|
from contextlib import contextmanager
|
|
24
|
-
from typing import Optional
|
|
25
24
|
|
|
26
25
|
from aiohttp.client_exceptions import ClientConnectorError
|
|
27
26
|
from opentelemetry import trace
|
|
@@ -35,7 +34,6 @@ from nucliadb.common.http_clients.exceptions import ServiceUnavailableException
|
|
|
35
34
|
from nucliadb.common.http_clients.processing import (
|
|
36
35
|
ProcessingHTTPClient,
|
|
37
36
|
ProcessingPullMessageProgressUpdater,
|
|
38
|
-
get_nua_api_id,
|
|
39
37
|
)
|
|
40
38
|
from nucliadb.common.maindb.driver import Driver
|
|
41
39
|
from nucliadb.ingest import SERVICE_NAME, logger, logger_activity
|
|
@@ -96,7 +94,7 @@ class PullV2Worker:
|
|
|
96
94
|
driver: Driver,
|
|
97
95
|
storage: Storage,
|
|
98
96
|
pull_time_error_backoff: int,
|
|
99
|
-
pubsub:
|
|
97
|
+
pubsub: PubSubDriver | None = None,
|
|
100
98
|
pull_time_empty_backoff: float = 5.0,
|
|
101
99
|
pull_api_timeout: int = 60,
|
|
102
100
|
):
|
|
@@ -142,12 +140,9 @@ class PullV2Worker:
|
|
|
142
140
|
data = None
|
|
143
141
|
if nuclia_settings.nuclia_service_account is not None:
|
|
144
142
|
headers["X-STF-NUAKEY"] = f"Bearer {nuclia_settings.nuclia_service_account}"
|
|
145
|
-
|
|
146
|
-
try:
|
|
147
|
-
get_nua_api_id()
|
|
148
|
-
except Exception as exc:
|
|
143
|
+
if nuclia_settings.nuclia_service_account is None:
|
|
149
144
|
logger.exception("Could not read NUA API Key. Can not start pull worker")
|
|
150
|
-
raise ReallyStopPulling()
|
|
145
|
+
raise ReallyStopPulling()
|
|
151
146
|
|
|
152
147
|
ack_tokens = []
|
|
153
148
|
async with ProcessingHTTPClient() as processing_http_client:
|
|
@@ -19,8 +19,8 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
import sys
|
|
22
|
+
from collections.abc import Awaitable, Callable
|
|
22
23
|
from functools import partial
|
|
23
|
-
from typing import Awaitable, Callable, Optional
|
|
24
24
|
|
|
25
25
|
from nucliadb.common.maindb.utils import setup_driver
|
|
26
26
|
from nucliadb.ingest import SERVICE_NAME, logger
|
|
@@ -55,7 +55,7 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
|
|
|
55
55
|
|
|
56
56
|
|
|
57
57
|
async def start_ingest_consumers(
|
|
58
|
-
service_name:
|
|
58
|
+
service_name: str | None = None,
|
|
59
59
|
) -> Callable[[], Awaitable[None]]:
|
|
60
60
|
if transaction_settings.transaction_local:
|
|
61
61
|
raise ConfigurationError("Can not start ingest consumers in local mode")
|
|
@@ -91,7 +91,7 @@ async def start_ingest_consumers(
|
|
|
91
91
|
|
|
92
92
|
|
|
93
93
|
async def start_ingest_processed_consumer_v2(
|
|
94
|
-
service_name:
|
|
94
|
+
service_name: str | None = None,
|
|
95
95
|
) -> Callable[[], Awaitable[None]]:
|
|
96
96
|
"""
|
|
97
97
|
This is not meant to be deployed with a stateful set like the other consumers.
|