nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
import logging
|
|
22
|
-
from typing import Optional
|
|
23
22
|
|
|
24
23
|
import aiohttp.client_exceptions
|
|
25
24
|
import nats.errors
|
|
@@ -28,6 +27,7 @@ from nidx_protos import noderesources_pb2, nodewriter_pb2
|
|
|
28
27
|
from nidx_protos.noderesources_pb2 import Resource as PBBrainResource
|
|
29
28
|
|
|
30
29
|
from nucliadb.common import datamanagers, locking
|
|
30
|
+
from nucliadb.common.catalog import catalog_delete, catalog_update
|
|
31
31
|
from nucliadb.common.cluster.settings import settings as cluster_settings
|
|
32
32
|
from nucliadb.common.cluster.utils import get_shard_manager
|
|
33
33
|
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
|
@@ -61,8 +61,6 @@ from nucliadb_utils.cache.pubsub import PubSubDriver
|
|
|
61
61
|
from nucliadb_utils.storages.storage import Storage
|
|
62
62
|
from nucliadb_utils.utilities import get_storage, has_feature
|
|
63
63
|
|
|
64
|
-
from .pgcatalog import pgcatalog_delete, pgcatalog_update
|
|
65
|
-
|
|
66
64
|
logger = logging.getLogger("ingest-processor")
|
|
67
65
|
|
|
68
66
|
MESSAGE_TO_NOTIFICATION_SOURCE = {
|
|
@@ -142,16 +140,13 @@ class Processor:
|
|
|
142
140
|
and can not use the txn id
|
|
143
141
|
"""
|
|
144
142
|
|
|
145
|
-
messages: dict[str, list[writer_pb2.BrokerMessage]]
|
|
146
|
-
|
|
147
143
|
def __init__(
|
|
148
144
|
self,
|
|
149
145
|
driver: Driver,
|
|
150
146
|
storage: Storage,
|
|
151
|
-
pubsub:
|
|
152
|
-
partition:
|
|
147
|
+
pubsub: PubSubDriver | None = None,
|
|
148
|
+
partition: str | None = None,
|
|
153
149
|
):
|
|
154
|
-
self.messages = {}
|
|
155
150
|
self.driver = driver
|
|
156
151
|
self.storage = storage
|
|
157
152
|
self.partition = partition
|
|
@@ -162,7 +157,7 @@ class Processor:
|
|
|
162
157
|
self,
|
|
163
158
|
message: writer_pb2.BrokerMessage,
|
|
164
159
|
seqid: int,
|
|
165
|
-
partition:
|
|
160
|
+
partition: str | None = None,
|
|
166
161
|
transaction_check: bool = True,
|
|
167
162
|
) -> None:
|
|
168
163
|
partition = partition if self.partition is None else self.partition
|
|
@@ -180,18 +175,12 @@ class Processor:
|
|
|
180
175
|
if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
|
|
181
176
|
await self.delete_resource(message, seqid, partition, transaction_check)
|
|
182
177
|
elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
|
|
183
|
-
await self.txn(
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
# XXX Should this be removed?
|
|
190
|
-
await self.multi(message, seqid)
|
|
191
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
|
|
192
|
-
await self.commit(message, seqid, partition)
|
|
193
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
|
|
194
|
-
await self.rollback(message, seqid, partition)
|
|
178
|
+
await self.txn(message, seqid, partition, transaction_check)
|
|
179
|
+
else: # pragma: no cover
|
|
180
|
+
logger.error(
|
|
181
|
+
f"Unsupported message type: {message.type}",
|
|
182
|
+
extra={"seqid": seqid, "partition": partition},
|
|
183
|
+
)
|
|
195
184
|
|
|
196
185
|
async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
|
|
197
186
|
if message.uuid is None:
|
|
@@ -227,7 +216,8 @@ class Processor:
|
|
|
227
216
|
shard = await kb.get_resource_shard(shard_id)
|
|
228
217
|
if shard is None:
|
|
229
218
|
raise AttributeError("Shard not available")
|
|
230
|
-
|
|
219
|
+
|
|
220
|
+
await catalog_delete(txn, message.kbid, uuid)
|
|
231
221
|
external_index_manager = await get_external_index_manager(kbid=message.kbid)
|
|
232
222
|
if external_index_manager is not None:
|
|
233
223
|
await self.external_index_delete_resource(external_index_manager, uuid)
|
|
@@ -242,7 +232,6 @@ class Processor:
|
|
|
242
232
|
await self.notify_abort(
|
|
243
233
|
partition=partition,
|
|
244
234
|
seqid=seqid,
|
|
245
|
-
multi=message.multiid,
|
|
246
235
|
kbid=message.kbid,
|
|
247
236
|
rid=message.uuid,
|
|
248
237
|
source=message.source,
|
|
@@ -256,7 +245,6 @@ class Processor:
|
|
|
256
245
|
await self.notify_commit(
|
|
257
246
|
partition=partition,
|
|
258
247
|
seqid=seqid,
|
|
259
|
-
multi=message.multiid,
|
|
260
248
|
message=message,
|
|
261
249
|
write_type=writer_pb2.Notification.WriteType.DELETED,
|
|
262
250
|
)
|
|
@@ -277,15 +265,12 @@ class Processor:
|
|
|
277
265
|
@processor_observer.wrap({"type": "txn"})
|
|
278
266
|
async def txn(
|
|
279
267
|
self,
|
|
280
|
-
|
|
268
|
+
message: writer_pb2.BrokerMessage,
|
|
281
269
|
seqid: int,
|
|
282
270
|
partition: str,
|
|
283
271
|
transaction_check: bool = True,
|
|
284
272
|
) -> None:
|
|
285
|
-
|
|
286
|
-
return None
|
|
287
|
-
|
|
288
|
-
kbid = messages[0].kbid
|
|
273
|
+
kbid = message.kbid
|
|
289
274
|
if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
|
|
290
275
|
logger.info(f"KB {kbid} is deleted: skiping txn")
|
|
291
276
|
if transaction_check:
|
|
@@ -296,58 +281,55 @@ class Processor:
|
|
|
296
281
|
|
|
297
282
|
async with self.driver.rw_transaction() as txn:
|
|
298
283
|
try:
|
|
299
|
-
multi = messages[0].multiid
|
|
300
284
|
kb = KnowledgeBox(txn, self.storage, kbid)
|
|
301
|
-
uuid = await self.get_resource_uuid(kb,
|
|
302
|
-
|
|
285
|
+
uuid = await self.get_resource_uuid(kb, message)
|
|
286
|
+
|
|
287
|
+
resource: Resource | None = None
|
|
303
288
|
handled_exception = None
|
|
304
289
|
created = False
|
|
305
290
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
if resource is None:
|
|
313
|
-
# It's a new resource
|
|
314
|
-
resource = await kb.add_resource(uuid, message.slug, message.basic)
|
|
315
|
-
created = True
|
|
316
|
-
else:
|
|
317
|
-
# It's an update from writer for an existing resource
|
|
318
|
-
...
|
|
319
|
-
|
|
320
|
-
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
321
|
-
resource = await kb.get(uuid)
|
|
322
|
-
if resource is None:
|
|
323
|
-
logger.info(
|
|
324
|
-
f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
|
|
325
|
-
)
|
|
326
|
-
continue
|
|
327
|
-
else:
|
|
328
|
-
# It's an update from processor for an existing resource
|
|
329
|
-
...
|
|
330
|
-
|
|
331
|
-
generated_fields = await get_generated_fields(message, resource)
|
|
332
|
-
if generated_fields.is_not_empty():
|
|
333
|
-
await send_generated_fields_to_process(
|
|
334
|
-
kbid, resource, generated_fields, message
|
|
335
|
-
)
|
|
336
|
-
# TODO: remove this when processor sends the field set
|
|
337
|
-
for generated_text in generated_fields.texts:
|
|
338
|
-
message.texts[
|
|
339
|
-
generated_text
|
|
340
|
-
].generated_by.data_augmentation.SetInParent()
|
|
341
|
-
|
|
291
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
|
292
|
+
resource = await kb.get(uuid)
|
|
293
|
+
if resource is None:
|
|
294
|
+
# It's a new resource
|
|
295
|
+
resource = await kb.add_resource(uuid, message.slug, message.basic)
|
|
296
|
+
created = True
|
|
342
297
|
else:
|
|
343
|
-
|
|
298
|
+
# It's an update from writer for an existing resource
|
|
299
|
+
...
|
|
300
|
+
|
|
301
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
302
|
+
resource = await kb.get(uuid)
|
|
303
|
+
if resource is None:
|
|
304
|
+
logger.info(
|
|
305
|
+
f"Processor message for resource received but the resource does not exist, ignoring.",
|
|
306
|
+
extra={
|
|
307
|
+
"kbid": kbid,
|
|
308
|
+
"rid": uuid,
|
|
309
|
+
"seqid": seqid,
|
|
310
|
+
},
|
|
311
|
+
)
|
|
312
|
+
return None
|
|
313
|
+
else:
|
|
314
|
+
# It's an update from processor for an existing resource
|
|
315
|
+
...
|
|
316
|
+
|
|
317
|
+
generated_fields = await get_generated_fields(message, resource)
|
|
318
|
+
if generated_fields.is_not_empty():
|
|
319
|
+
await send_generated_fields_to_process(kbid, resource, generated_fields, message)
|
|
320
|
+
# TODO: remove this when processor sends the field set
|
|
321
|
+
for generated_text in generated_fields.texts:
|
|
322
|
+
message.texts[generated_text].generated_by.data_augmentation.SetInParent()
|
|
323
|
+
|
|
324
|
+
else: # pragma: no cover
|
|
325
|
+
raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
|
|
344
326
|
|
|
345
|
-
|
|
346
|
-
|
|
327
|
+
# apply changes from the broker message to the resource
|
|
328
|
+
await self.apply_resource(message, resource, update=(not created))
|
|
347
329
|
|
|
348
330
|
# index message
|
|
349
331
|
if resource and resource.modified:
|
|
350
|
-
index_message = await self.generate_index_message(resource,
|
|
332
|
+
index_message = await self.generate_index_message(resource, message, created)
|
|
351
333
|
try:
|
|
352
334
|
warnings = await self.index_resource(
|
|
353
335
|
index_message=index_message,
|
|
@@ -357,7 +339,7 @@ class Processor:
|
|
|
357
339
|
seqid=seqid,
|
|
358
340
|
partition=partition,
|
|
359
341
|
kb=kb,
|
|
360
|
-
source=
|
|
342
|
+
source=to_index_message_source(message),
|
|
361
343
|
)
|
|
362
344
|
# Save indexing warnings
|
|
363
345
|
for field_id, warning in warnings:
|
|
@@ -374,8 +356,7 @@ class Processor:
|
|
|
374
356
|
index_message.labels.remove(current_status[0])
|
|
375
357
|
index_message.labels.append("/n/s/ERROR")
|
|
376
358
|
|
|
377
|
-
await
|
|
378
|
-
|
|
359
|
+
await catalog_update(txn, kbid, resource, index_message)
|
|
379
360
|
if transaction_check:
|
|
380
361
|
await sequence_manager.set_last_seqid(txn, partition, seqid)
|
|
381
362
|
await txn.commit()
|
|
@@ -386,7 +367,6 @@ class Processor:
|
|
|
386
367
|
await self.notify_commit(
|
|
387
368
|
partition=partition,
|
|
388
369
|
seqid=seqid,
|
|
389
|
-
multi=multi,
|
|
390
370
|
message=message,
|
|
391
371
|
write_type=(
|
|
392
372
|
writer_pb2.Notification.WriteType.CREATED
|
|
@@ -399,7 +379,6 @@ class Processor:
|
|
|
399
379
|
await self.notify_abort(
|
|
400
380
|
partition=partition,
|
|
401
381
|
seqid=seqid,
|
|
402
|
-
multi=multi,
|
|
403
382
|
kbid=kbid,
|
|
404
383
|
rid=uuid,
|
|
405
384
|
source=message.source,
|
|
@@ -419,7 +398,6 @@ class Processor:
|
|
|
419
398
|
await self.notify_abort(
|
|
420
399
|
partition=partition,
|
|
421
400
|
seqid=seqid,
|
|
422
|
-
multi=multi,
|
|
423
401
|
kbid=kbid,
|
|
424
402
|
rid=uuid,
|
|
425
403
|
source=message.source,
|
|
@@ -429,11 +407,10 @@ class Processor:
|
|
|
429
407
|
# As we are in the middle of a transaction, we cannot let the exception raise directly
|
|
430
408
|
# as we need to do some cleanup. The exception will be reraised at the end of the function
|
|
431
409
|
# and then handled by the top caller, so errors can be handled in the same place.
|
|
432
|
-
await self.deadletter(
|
|
410
|
+
await self.deadletter(message, partition, seqid)
|
|
433
411
|
await self.notify_abort(
|
|
434
412
|
partition=partition,
|
|
435
413
|
seqid=seqid,
|
|
436
|
-
multi=multi,
|
|
437
414
|
kbid=kbid,
|
|
438
415
|
rid=uuid,
|
|
439
416
|
source=message.source,
|
|
@@ -468,22 +445,27 @@ class Processor:
|
|
|
468
445
|
# a resource was move to another shard while it was being indexed
|
|
469
446
|
shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)
|
|
470
447
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
448
|
+
shard = None
|
|
449
|
+
if shard_id is not None:
|
|
450
|
+
# Resource already has a shard assigned
|
|
451
|
+
shard = await kb.get_resource_shard(shard_id)
|
|
452
|
+
if shard is None:
|
|
453
|
+
raise AttributeError("Shard not available")
|
|
454
|
+
else:
|
|
455
|
+
# It's a new resource, get KB's current active shard to place new resource on
|
|
456
|
+
shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
|
|
457
|
+
if shard is None:
|
|
458
|
+
# No current shard available, create a new one
|
|
459
|
+
async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
|
|
460
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
|
461
|
+
prewarm = kb_config is not None and kb_config.prewarm_enabled
|
|
462
|
+
shard = await self.index_node_shard_manager.create_shard_by_kbid(
|
|
463
|
+
txn, kbid, prewarm_enabled=prewarm
|
|
464
|
+
)
|
|
465
|
+
await datamanagers.resources.set_resource_shard_id(
|
|
466
|
+
txn, kbid=kbid, rid=uuid, shard=shard.shard
|
|
467
|
+
)
|
|
468
|
+
return shard
|
|
487
469
|
|
|
488
470
|
@processor_observer.wrap({"type": "index_resource"})
|
|
489
471
|
async def index_resource(
|
|
@@ -519,17 +501,16 @@ class Processor:
|
|
|
519
501
|
async def generate_index_message(
|
|
520
502
|
self,
|
|
521
503
|
resource: Resource,
|
|
522
|
-
|
|
504
|
+
message: writer_pb2.BrokerMessage,
|
|
523
505
|
resource_created: bool,
|
|
524
506
|
) -> PBBrainResource:
|
|
525
507
|
builder = IndexMessageBuilder(resource)
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
return await builder.for_processor_bm(messages)
|
|
508
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
|
509
|
+
return await builder.for_writer_bm(message, resource_created)
|
|
510
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
511
|
+
return await builder.for_processor_bm(message)
|
|
531
512
|
else: # pragma: no cover
|
|
532
|
-
raise InvalidBrokerMessage(f"Unknown broker message source: {
|
|
513
|
+
raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
|
|
533
514
|
|
|
534
515
|
async def external_index_delete_resource(
|
|
535
516
|
self, external_index_manager: ExternalIndexManager, resource_uuid: str
|
|
@@ -582,35 +563,8 @@ class Processor:
|
|
|
582
563
|
resource_uuid=resource_uuid, resource_data=index_message
|
|
583
564
|
)
|
|
584
565
|
|
|
585
|
-
async def
|
|
586
|
-
self.
|
|
587
|
-
|
|
588
|
-
async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
|
|
589
|
-
if message.multiid not in self.messages:
|
|
590
|
-
# Error
|
|
591
|
-
logger.error(f"Closed multi {message.multiid}")
|
|
592
|
-
await self.deadletter([message], partition, seqid)
|
|
593
|
-
else:
|
|
594
|
-
await self.txn(self.messages[message.multiid], seqid, partition)
|
|
595
|
-
|
|
596
|
-
async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
|
|
597
|
-
# Error
|
|
598
|
-
logger.error(f"Closed multi {message.multiid}")
|
|
599
|
-
del self.messages[message.multiid]
|
|
600
|
-
await self.notify_abort(
|
|
601
|
-
partition=partition,
|
|
602
|
-
seqid=seqid,
|
|
603
|
-
multi=message.multiid,
|
|
604
|
-
kbid=message.kbid,
|
|
605
|
-
rid=message.uuid,
|
|
606
|
-
source=message.source,
|
|
607
|
-
)
|
|
608
|
-
|
|
609
|
-
async def deadletter(
|
|
610
|
-
self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
|
|
611
|
-
) -> None:
|
|
612
|
-
for seq, message in enumerate(messages):
|
|
613
|
-
await self.storage.deadletter(message, seq, seqid, partition)
|
|
566
|
+
async def deadletter(self, message: writer_pb2.BrokerMessage, partition: str, seqid: int) -> None:
|
|
567
|
+
await self.storage.deadletter(message, 0, seqid, partition)
|
|
614
568
|
|
|
615
569
|
@processor_observer.wrap({"type": "apply_resource"})
|
|
616
570
|
async def apply_resource(
|
|
@@ -670,7 +624,6 @@ class Processor:
|
|
|
670
624
|
*,
|
|
671
625
|
partition: str,
|
|
672
626
|
seqid: int,
|
|
673
|
-
multi: str,
|
|
674
627
|
message: writer_pb2.BrokerMessage,
|
|
675
628
|
write_type: writer_pb2.Notification.WriteType.ValueType,
|
|
676
629
|
):
|
|
@@ -678,7 +631,7 @@ class Processor:
|
|
|
678
631
|
notification = writer_pb2.Notification(
|
|
679
632
|
partition=int(partition),
|
|
680
633
|
seqid=seqid,
|
|
681
|
-
multi=
|
|
634
|
+
multi="",
|
|
682
635
|
uuid=message.uuid,
|
|
683
636
|
kbid=message.kbid,
|
|
684
637
|
action=writer_pb2.Notification.Action.COMMIT,
|
|
@@ -698,7 +651,6 @@ class Processor:
|
|
|
698
651
|
*,
|
|
699
652
|
partition: str,
|
|
700
653
|
seqid: int,
|
|
701
|
-
multi: str,
|
|
702
654
|
kbid: str,
|
|
703
655
|
rid: str,
|
|
704
656
|
source: writer_pb2.BrokerMessage.MessageSource.ValueType,
|
|
@@ -706,7 +658,7 @@ class Processor:
|
|
|
706
658
|
message = writer_pb2.Notification(
|
|
707
659
|
partition=int(partition),
|
|
708
660
|
seqid=seqid,
|
|
709
|
-
multi=
|
|
661
|
+
multi="",
|
|
710
662
|
uuid=rid,
|
|
711
663
|
kbid=kbid,
|
|
712
664
|
action=writer_pb2.Notification.ABORT,
|
|
@@ -722,7 +674,7 @@ class Processor:
|
|
|
722
674
|
await self.pubsub.publish(channel, payload)
|
|
723
675
|
|
|
724
676
|
async def _mark_resource_error(
|
|
725
|
-
self, kb: KnowledgeBox, resource:
|
|
677
|
+
self, kb: KnowledgeBox, resource: Resource | None, partition: str, seqid: int
|
|
726
678
|
) -> None:
|
|
727
679
|
"""
|
|
728
680
|
Unhandled error processing, try to mark resource as error
|
|
@@ -743,8 +695,8 @@ class Processor:
|
|
|
743
695
|
# XXX: Why are these utility functions here?
|
|
744
696
|
async def get_kb_obj(
|
|
745
697
|
self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
|
|
746
|
-
) ->
|
|
747
|
-
uuid:
|
|
698
|
+
) -> KnowledgeBox | None:
|
|
699
|
+
uuid: str | None = kbid.uuid
|
|
748
700
|
if uuid == "":
|
|
749
701
|
uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)
|
|
750
702
|
|
|
@@ -759,23 +711,16 @@ class Processor:
|
|
|
759
711
|
return kbobj
|
|
760
712
|
|
|
761
713
|
|
|
762
|
-
def
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
(message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR for message in messages)
|
|
768
|
-
)
|
|
769
|
-
if from_writer:
|
|
770
|
-
source = nodewriter_pb2.IndexMessageSource.WRITER
|
|
771
|
-
elif from_processor:
|
|
772
|
-
source = nodewriter_pb2.IndexMessageSource.PROCESSOR
|
|
714
|
+
def to_index_message_source(message: writer_pb2.BrokerMessage):
|
|
715
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
|
716
|
+
return nodewriter_pb2.IndexMessageSource.WRITER
|
|
717
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
718
|
+
return nodewriter_pb2.IndexMessageSource.PROCESSOR
|
|
773
719
|
else: # pragma: no cover
|
|
774
|
-
msg = "Processor received
|
|
720
|
+
msg = f"Processor received a broker message with unexpected source! {message.source}"
|
|
775
721
|
logger.error(msg)
|
|
776
722
|
errors.capture_exception(Exception(msg))
|
|
777
|
-
|
|
778
|
-
return source
|
|
723
|
+
return nodewriter_pb2.IndexMessageSource.PROCESSOR
|
|
779
724
|
|
|
780
725
|
|
|
781
726
|
def has_vectors_operation(index_message: PBBrainResource) -> bool:
|
|
@@ -17,14 +17,13 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Optional
|
|
21
20
|
|
|
22
21
|
from nucliadb.common.maindb.driver import Driver, Transaction
|
|
23
22
|
|
|
24
23
|
TXNID = "/internal/worker/{worker}"
|
|
25
24
|
|
|
26
25
|
|
|
27
|
-
async def get_last_seqid(driver: Driver, worker: str) ->
|
|
26
|
+
async def get_last_seqid(driver: Driver, worker: str) -> int | None:
|
|
28
27
|
"""
|
|
29
28
|
Get last stored sequence id for a worker.
|
|
30
29
|
|