nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
|
@@ -28,6 +28,7 @@ from nidx_protos import noderesources_pb2, nodewriter_pb2
|
|
|
28
28
|
from nidx_protos.noderesources_pb2 import Resource as PBBrainResource
|
|
29
29
|
|
|
30
30
|
from nucliadb.common import datamanagers, locking
|
|
31
|
+
from nucliadb.common.catalog import catalog_delete, catalog_update
|
|
31
32
|
from nucliadb.common.cluster.settings import settings as cluster_settings
|
|
32
33
|
from nucliadb.common.cluster.utils import get_shard_manager
|
|
33
34
|
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
|
@@ -61,8 +62,6 @@ from nucliadb_utils.cache.pubsub import PubSubDriver
|
|
|
61
62
|
from nucliadb_utils.storages.storage import Storage
|
|
62
63
|
from nucliadb_utils.utilities import get_storage, has_feature
|
|
63
64
|
|
|
64
|
-
from .pgcatalog import pgcatalog_delete, pgcatalog_update
|
|
65
|
-
|
|
66
65
|
logger = logging.getLogger("ingest-processor")
|
|
67
66
|
|
|
68
67
|
MESSAGE_TO_NOTIFICATION_SOURCE = {
|
|
@@ -142,8 +141,6 @@ class Processor:
|
|
|
142
141
|
and can not use the txn id
|
|
143
142
|
"""
|
|
144
143
|
|
|
145
|
-
messages: dict[str, list[writer_pb2.BrokerMessage]]
|
|
146
|
-
|
|
147
144
|
def __init__(
|
|
148
145
|
self,
|
|
149
146
|
driver: Driver,
|
|
@@ -151,7 +148,6 @@ class Processor:
|
|
|
151
148
|
pubsub: Optional[PubSubDriver] = None,
|
|
152
149
|
partition: Optional[str] = None,
|
|
153
150
|
):
|
|
154
|
-
self.messages = {}
|
|
155
151
|
self.driver = driver
|
|
156
152
|
self.storage = storage
|
|
157
153
|
self.partition = partition
|
|
@@ -180,18 +176,12 @@ class Processor:
|
|
|
180
176
|
if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
|
|
181
177
|
await self.delete_resource(message, seqid, partition, transaction_check)
|
|
182
178
|
elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
|
|
183
|
-
await self.txn(
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
# XXX Should this be removed?
|
|
190
|
-
await self.multi(message, seqid)
|
|
191
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
|
|
192
|
-
await self.commit(message, seqid, partition)
|
|
193
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
|
|
194
|
-
await self.rollback(message, seqid, partition)
|
|
179
|
+
await self.txn(message, seqid, partition, transaction_check)
|
|
180
|
+
else: # pragma: no cover
|
|
181
|
+
logger.error(
|
|
182
|
+
f"Unsupported message type: {message.type}",
|
|
183
|
+
extra={"seqid": seqid, "partition": partition},
|
|
184
|
+
)
|
|
195
185
|
|
|
196
186
|
async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
|
|
197
187
|
if message.uuid is None:
|
|
@@ -208,7 +198,7 @@ class Processor:
|
|
|
208
198
|
partition: str,
|
|
209
199
|
transaction_check: bool = True,
|
|
210
200
|
) -> None:
|
|
211
|
-
async with self.driver.
|
|
201
|
+
async with self.driver.rw_transaction() as txn:
|
|
212
202
|
try:
|
|
213
203
|
kb = KnowledgeBox(txn, self.storage, message.kbid)
|
|
214
204
|
|
|
@@ -227,7 +217,8 @@ class Processor:
|
|
|
227
217
|
shard = await kb.get_resource_shard(shard_id)
|
|
228
218
|
if shard is None:
|
|
229
219
|
raise AttributeError("Shard not available")
|
|
230
|
-
|
|
220
|
+
|
|
221
|
+
await catalog_delete(txn, message.kbid, uuid)
|
|
231
222
|
external_index_manager = await get_external_index_manager(kbid=message.kbid)
|
|
232
223
|
if external_index_manager is not None:
|
|
233
224
|
await self.external_index_delete_resource(external_index_manager, uuid)
|
|
@@ -242,7 +233,6 @@ class Processor:
|
|
|
242
233
|
await self.notify_abort(
|
|
243
234
|
partition=partition,
|
|
244
235
|
seqid=seqid,
|
|
245
|
-
multi=message.multiid,
|
|
246
236
|
kbid=message.kbid,
|
|
247
237
|
rid=message.uuid,
|
|
248
238
|
source=message.source,
|
|
@@ -256,7 +246,6 @@ class Processor:
|
|
|
256
246
|
await self.notify_commit(
|
|
257
247
|
partition=partition,
|
|
258
248
|
seqid=seqid,
|
|
259
|
-
multi=message.multiid,
|
|
260
249
|
message=message,
|
|
261
250
|
write_type=writer_pb2.Notification.WriteType.DELETED,
|
|
262
251
|
)
|
|
@@ -267,7 +256,7 @@ class Processor:
|
|
|
267
256
|
# so we commit it in a different transaction to make it as short as possible
|
|
268
257
|
prev_txn = resource.txn
|
|
269
258
|
try:
|
|
270
|
-
async with self.driver.
|
|
259
|
+
async with self.driver.rw_transaction() as txn:
|
|
271
260
|
resource.txn = txn
|
|
272
261
|
await resource.set_slug()
|
|
273
262
|
await txn.commit()
|
|
@@ -277,15 +266,12 @@ class Processor:
|
|
|
277
266
|
@processor_observer.wrap({"type": "txn"})
|
|
278
267
|
async def txn(
|
|
279
268
|
self,
|
|
280
|
-
|
|
269
|
+
message: writer_pb2.BrokerMessage,
|
|
281
270
|
seqid: int,
|
|
282
271
|
partition: str,
|
|
283
272
|
transaction_check: bool = True,
|
|
284
273
|
) -> None:
|
|
285
|
-
|
|
286
|
-
return None
|
|
287
|
-
|
|
288
|
-
kbid = messages[0].kbid
|
|
274
|
+
kbid = message.kbid
|
|
289
275
|
if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
|
|
290
276
|
logger.info(f"KB {kbid} is deleted: skiping txn")
|
|
291
277
|
if transaction_check:
|
|
@@ -294,60 +280,57 @@ class Processor:
|
|
|
294
280
|
await txn.commit()
|
|
295
281
|
return None
|
|
296
282
|
|
|
297
|
-
async with self.driver.
|
|
283
|
+
async with self.driver.rw_transaction() as txn:
|
|
298
284
|
try:
|
|
299
|
-
multi = messages[0].multiid
|
|
300
285
|
kb = KnowledgeBox(txn, self.storage, kbid)
|
|
301
|
-
uuid = await self.get_resource_uuid(kb,
|
|
286
|
+
uuid = await self.get_resource_uuid(kb, message)
|
|
287
|
+
|
|
302
288
|
resource: Optional[Resource] = None
|
|
303
289
|
handled_exception = None
|
|
304
290
|
created = False
|
|
305
291
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
if resource is None:
|
|
313
|
-
# It's a new resource
|
|
314
|
-
resource = await kb.add_resource(uuid, message.slug, message.basic)
|
|
315
|
-
created = True
|
|
316
|
-
else:
|
|
317
|
-
# It's an update from writer for an existing resource
|
|
318
|
-
...
|
|
319
|
-
|
|
320
|
-
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
321
|
-
resource = await kb.get(uuid)
|
|
322
|
-
if resource is None:
|
|
323
|
-
logger.info(
|
|
324
|
-
f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
|
|
325
|
-
)
|
|
326
|
-
continue
|
|
327
|
-
else:
|
|
328
|
-
# It's an update from processor for an existing resource
|
|
329
|
-
...
|
|
330
|
-
|
|
331
|
-
generated_fields = await get_generated_fields(message, resource)
|
|
332
|
-
if generated_fields.is_not_empty():
|
|
333
|
-
await send_generated_fields_to_process(
|
|
334
|
-
kbid, resource, generated_fields, message
|
|
335
|
-
)
|
|
336
|
-
# TODO: remove this when processor sends the field set
|
|
337
|
-
for generated_text in generated_fields.texts:
|
|
338
|
-
message.texts[
|
|
339
|
-
generated_text
|
|
340
|
-
].generated_by.data_augmentation.SetInParent()
|
|
341
|
-
|
|
292
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
|
293
|
+
resource = await kb.get(uuid)
|
|
294
|
+
if resource is None:
|
|
295
|
+
# It's a new resource
|
|
296
|
+
resource = await kb.add_resource(uuid, message.slug, message.basic)
|
|
297
|
+
created = True
|
|
342
298
|
else:
|
|
343
|
-
|
|
299
|
+
# It's an update from writer for an existing resource
|
|
300
|
+
...
|
|
301
|
+
|
|
302
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
303
|
+
resource = await kb.get(uuid)
|
|
304
|
+
if resource is None:
|
|
305
|
+
logger.info(
|
|
306
|
+
f"Processor message for resource received but the resource does not exist, ignoring.",
|
|
307
|
+
extra={
|
|
308
|
+
"kbid": kbid,
|
|
309
|
+
"rid": uuid,
|
|
310
|
+
"seqid": seqid,
|
|
311
|
+
},
|
|
312
|
+
)
|
|
313
|
+
return None
|
|
314
|
+
else:
|
|
315
|
+
# It's an update from processor for an existing resource
|
|
316
|
+
...
|
|
317
|
+
|
|
318
|
+
generated_fields = await get_generated_fields(message, resource)
|
|
319
|
+
if generated_fields.is_not_empty():
|
|
320
|
+
await send_generated_fields_to_process(kbid, resource, generated_fields, message)
|
|
321
|
+
# TODO: remove this when processor sends the field set
|
|
322
|
+
for generated_text in generated_fields.texts:
|
|
323
|
+
message.texts[generated_text].generated_by.data_augmentation.SetInParent()
|
|
324
|
+
|
|
325
|
+
else: # pragma: no cover
|
|
326
|
+
raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
|
|
344
327
|
|
|
345
|
-
|
|
346
|
-
|
|
328
|
+
# apply changes from the broker message to the resource
|
|
329
|
+
await self.apply_resource(message, resource, update=(not created))
|
|
347
330
|
|
|
348
331
|
# index message
|
|
349
332
|
if resource and resource.modified:
|
|
350
|
-
index_message = await self.generate_index_message(resource,
|
|
333
|
+
index_message = await self.generate_index_message(resource, message, created)
|
|
351
334
|
try:
|
|
352
335
|
warnings = await self.index_resource(
|
|
353
336
|
index_message=index_message,
|
|
@@ -357,7 +340,7 @@ class Processor:
|
|
|
357
340
|
seqid=seqid,
|
|
358
341
|
partition=partition,
|
|
359
342
|
kb=kb,
|
|
360
|
-
source=
|
|
343
|
+
source=to_index_message_source(message),
|
|
361
344
|
)
|
|
362
345
|
# Save indexing warnings
|
|
363
346
|
for field_id, warning in warnings:
|
|
@@ -374,8 +357,7 @@ class Processor:
|
|
|
374
357
|
index_message.labels.remove(current_status[0])
|
|
375
358
|
index_message.labels.append("/n/s/ERROR")
|
|
376
359
|
|
|
377
|
-
await
|
|
378
|
-
|
|
360
|
+
await catalog_update(txn, kbid, resource, index_message)
|
|
379
361
|
if transaction_check:
|
|
380
362
|
await sequence_manager.set_last_seqid(txn, partition, seqid)
|
|
381
363
|
await txn.commit()
|
|
@@ -386,7 +368,6 @@ class Processor:
|
|
|
386
368
|
await self.notify_commit(
|
|
387
369
|
partition=partition,
|
|
388
370
|
seqid=seqid,
|
|
389
|
-
multi=multi,
|
|
390
371
|
message=message,
|
|
391
372
|
write_type=(
|
|
392
373
|
writer_pb2.Notification.WriteType.CREATED
|
|
@@ -399,7 +380,6 @@ class Processor:
|
|
|
399
380
|
await self.notify_abort(
|
|
400
381
|
partition=partition,
|
|
401
382
|
seqid=seqid,
|
|
402
|
-
multi=multi,
|
|
403
383
|
kbid=kbid,
|
|
404
384
|
rid=uuid,
|
|
405
385
|
source=message.source,
|
|
@@ -419,7 +399,6 @@ class Processor:
|
|
|
419
399
|
await self.notify_abort(
|
|
420
400
|
partition=partition,
|
|
421
401
|
seqid=seqid,
|
|
422
|
-
multi=multi,
|
|
423
402
|
kbid=kbid,
|
|
424
403
|
rid=uuid,
|
|
425
404
|
source=message.source,
|
|
@@ -429,11 +408,10 @@ class Processor:
|
|
|
429
408
|
# As we are in the middle of a transaction, we cannot let the exception raise directly
|
|
430
409
|
# as we need to do some cleanup. The exception will be reraised at the end of the function
|
|
431
410
|
# and then handled by the top caller, so errors can be handled in the same place.
|
|
432
|
-
await self.deadletter(
|
|
411
|
+
await self.deadletter(message, partition, seqid)
|
|
433
412
|
await self.notify_abort(
|
|
434
413
|
partition=partition,
|
|
435
414
|
seqid=seqid,
|
|
436
|
-
multi=multi,
|
|
437
415
|
kbid=kbid,
|
|
438
416
|
rid=uuid,
|
|
439
417
|
source=message.source,
|
|
@@ -468,22 +446,27 @@ class Processor:
|
|
|
468
446
|
# a resource was move to another shard while it was being indexed
|
|
469
447
|
shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)
|
|
470
448
|
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
449
|
+
shard = None
|
|
450
|
+
if shard_id is not None:
|
|
451
|
+
# Resource already has a shard assigned
|
|
452
|
+
shard = await kb.get_resource_shard(shard_id)
|
|
453
|
+
if shard is None:
|
|
454
|
+
raise AttributeError("Shard not available")
|
|
455
|
+
else:
|
|
456
|
+
# It's a new resource, get KB's current active shard to place new resource on
|
|
457
|
+
shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
|
|
458
|
+
if shard is None:
|
|
459
|
+
# No current shard available, create a new one
|
|
460
|
+
async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
|
|
461
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
|
462
|
+
prewarm = kb_config is not None and kb_config.prewarm_enabled
|
|
463
|
+
shard = await self.index_node_shard_manager.create_shard_by_kbid(
|
|
464
|
+
txn, kbid, prewarm_enabled=prewarm
|
|
465
|
+
)
|
|
466
|
+
await datamanagers.resources.set_resource_shard_id(
|
|
467
|
+
txn, kbid=kbid, rid=uuid, shard=shard.shard
|
|
468
|
+
)
|
|
469
|
+
return shard
|
|
487
470
|
|
|
488
471
|
@processor_observer.wrap({"type": "index_resource"})
|
|
489
472
|
async def index_resource(
|
|
@@ -519,17 +502,16 @@ class Processor:
|
|
|
519
502
|
async def generate_index_message(
|
|
520
503
|
self,
|
|
521
504
|
resource: Resource,
|
|
522
|
-
|
|
505
|
+
message: writer_pb2.BrokerMessage,
|
|
523
506
|
resource_created: bool,
|
|
524
507
|
) -> PBBrainResource:
|
|
525
508
|
builder = IndexMessageBuilder(resource)
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
return await builder.for_processor_bm(messages)
|
|
509
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
|
510
|
+
return await builder.for_writer_bm(message, resource_created)
|
|
511
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
512
|
+
return await builder.for_processor_bm(message)
|
|
531
513
|
else: # pragma: no cover
|
|
532
|
-
raise InvalidBrokerMessage(f"Unknown broker message source: {
|
|
514
|
+
raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
|
|
533
515
|
|
|
534
516
|
async def external_index_delete_resource(
|
|
535
517
|
self, external_index_manager: ExternalIndexManager, resource_uuid: str
|
|
@@ -582,35 +564,8 @@ class Processor:
|
|
|
582
564
|
resource_uuid=resource_uuid, resource_data=index_message
|
|
583
565
|
)
|
|
584
566
|
|
|
585
|
-
async def
|
|
586
|
-
self.
|
|
587
|
-
|
|
588
|
-
async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
|
|
589
|
-
if message.multiid not in self.messages:
|
|
590
|
-
# Error
|
|
591
|
-
logger.error(f"Closed multi {message.multiid}")
|
|
592
|
-
await self.deadletter([message], partition, seqid)
|
|
593
|
-
else:
|
|
594
|
-
await self.txn(self.messages[message.multiid], seqid, partition)
|
|
595
|
-
|
|
596
|
-
async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
|
|
597
|
-
# Error
|
|
598
|
-
logger.error(f"Closed multi {message.multiid}")
|
|
599
|
-
del self.messages[message.multiid]
|
|
600
|
-
await self.notify_abort(
|
|
601
|
-
partition=partition,
|
|
602
|
-
seqid=seqid,
|
|
603
|
-
multi=message.multiid,
|
|
604
|
-
kbid=message.kbid,
|
|
605
|
-
rid=message.uuid,
|
|
606
|
-
source=message.source,
|
|
607
|
-
)
|
|
608
|
-
|
|
609
|
-
async def deadletter(
|
|
610
|
-
self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
|
|
611
|
-
) -> None:
|
|
612
|
-
for seq, message in enumerate(messages):
|
|
613
|
-
await self.storage.deadletter(message, seq, seqid, partition)
|
|
567
|
+
async def deadletter(self, message: writer_pb2.BrokerMessage, partition: str, seqid: int) -> None:
|
|
568
|
+
await self.storage.deadletter(message, 0, seqid, partition)
|
|
614
569
|
|
|
615
570
|
@processor_observer.wrap({"type": "apply_resource"})
|
|
616
571
|
async def apply_resource(
|
|
@@ -670,7 +625,6 @@ class Processor:
|
|
|
670
625
|
*,
|
|
671
626
|
partition: str,
|
|
672
627
|
seqid: int,
|
|
673
|
-
multi: str,
|
|
674
628
|
message: writer_pb2.BrokerMessage,
|
|
675
629
|
write_type: writer_pb2.Notification.WriteType.ValueType,
|
|
676
630
|
):
|
|
@@ -678,7 +632,7 @@ class Processor:
|
|
|
678
632
|
notification = writer_pb2.Notification(
|
|
679
633
|
partition=int(partition),
|
|
680
634
|
seqid=seqid,
|
|
681
|
-
multi=
|
|
635
|
+
multi="",
|
|
682
636
|
uuid=message.uuid,
|
|
683
637
|
kbid=message.kbid,
|
|
684
638
|
action=writer_pb2.Notification.Action.COMMIT,
|
|
@@ -698,7 +652,6 @@ class Processor:
|
|
|
698
652
|
*,
|
|
699
653
|
partition: str,
|
|
700
654
|
seqid: int,
|
|
701
|
-
multi: str,
|
|
702
655
|
kbid: str,
|
|
703
656
|
rid: str,
|
|
704
657
|
source: writer_pb2.BrokerMessage.MessageSource.ValueType,
|
|
@@ -706,7 +659,7 @@ class Processor:
|
|
|
706
659
|
message = writer_pb2.Notification(
|
|
707
660
|
partition=int(partition),
|
|
708
661
|
seqid=seqid,
|
|
709
|
-
multi=
|
|
662
|
+
multi="",
|
|
710
663
|
uuid=rid,
|
|
711
664
|
kbid=kbid,
|
|
712
665
|
action=writer_pb2.Notification.ABORT,
|
|
@@ -731,7 +684,7 @@ class Processor:
|
|
|
731
684
|
logger.info(f"Skip when resource does not even have basic metadata: {resource}")
|
|
732
685
|
return
|
|
733
686
|
try:
|
|
734
|
-
async with self.driver.
|
|
687
|
+
async with self.driver.rw_transaction() as txn:
|
|
735
688
|
kb.txn = resource.txn = txn
|
|
736
689
|
resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
|
|
737
690
|
await resource.set_basic(resource.basic)
|
|
@@ -759,23 +712,16 @@ class Processor:
|
|
|
759
712
|
return kbobj
|
|
760
713
|
|
|
761
714
|
|
|
762
|
-
def
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
(message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR for message in messages)
|
|
768
|
-
)
|
|
769
|
-
if from_writer:
|
|
770
|
-
source = nodewriter_pb2.IndexMessageSource.WRITER
|
|
771
|
-
elif from_processor:
|
|
772
|
-
source = nodewriter_pb2.IndexMessageSource.PROCESSOR
|
|
715
|
+
def to_index_message_source(message: writer_pb2.BrokerMessage):
|
|
716
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
|
717
|
+
return nodewriter_pb2.IndexMessageSource.WRITER
|
|
718
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
|
719
|
+
return nodewriter_pb2.IndexMessageSource.PROCESSOR
|
|
773
720
|
else: # pragma: no cover
|
|
774
|
-
msg = "Processor received
|
|
721
|
+
msg = f"Processor received a broker message with unexpected source! {message.source}"
|
|
775
722
|
logger.error(msg)
|
|
776
723
|
errors.capture_exception(Exception(msg))
|
|
777
|
-
|
|
778
|
-
return source
|
|
724
|
+
return nodewriter_pb2.IndexMessageSource.PROCESSOR
|
|
779
725
|
|
|
780
726
|
|
|
781
727
|
def has_vectors_operation(index_message: PBBrainResource) -> bool:
|
|
@@ -31,7 +31,7 @@ async def get_last_seqid(driver: Driver, worker: str) -> Optional[int]:
|
|
|
31
31
|
This is oriented towards the ingest consumer and processor,
|
|
32
32
|
which is the only one that should be writing to this key.
|
|
33
33
|
"""
|
|
34
|
-
async with driver.
|
|
34
|
+
async with driver.ro_transaction() as txn:
|
|
35
35
|
key = TXNID.format(worker=worker)
|
|
36
36
|
last_seq = await txn.get(key)
|
|
37
37
|
if not last_seq:
|
nucliadb/ingest/orm/resource.py
CHANGED
|
@@ -135,7 +135,7 @@ class Resource:
|
|
|
135
135
|
await self.txn.set(new_key, self.uuid.encode())
|
|
136
136
|
|
|
137
137
|
# Basic
|
|
138
|
-
async def get_basic(self) ->
|
|
138
|
+
async def get_basic(self) -> PBBasic:
|
|
139
139
|
if self.basic is None:
|
|
140
140
|
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
|
141
141
|
self.basic = basic if basic is not None else PBBasic()
|
|
@@ -354,7 +354,16 @@ class Resource:
|
|
|
354
354
|
|
|
355
355
|
await field_obj.delete()
|
|
356
356
|
|
|
357
|
+
async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
|
|
358
|
+
"""Return whether this resource has this field or not."""
|
|
359
|
+
all_fields_ids = await self.get_fields_ids()
|
|
360
|
+
for field_type, field_id in all_fields_ids:
|
|
361
|
+
if field_type == type and field_id == field:
|
|
362
|
+
return True
|
|
363
|
+
return False
|
|
364
|
+
|
|
357
365
|
def has_field(self, type: FieldType.ValueType, field: str) -> bool:
|
|
366
|
+
# REVIEW: are we sure we don't want to actually check this?
|
|
358
367
|
return (type, field) in self.fields
|
|
359
368
|
|
|
360
369
|
async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
|
nucliadb/ingest/partitions.py
CHANGED
|
@@ -25,12 +25,17 @@ from nucliadb.ingest.settings import Settings
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def assign_partitions(settings: Settings):
|
|
28
|
+
"""
|
|
29
|
+
This function dynamically assigns the partitions to the current ingest sts
|
|
30
|
+
replica based on its hostname, typically (ingest-0, ingest-1, etc).
|
|
31
|
+
"""
|
|
28
32
|
# partitions start from 1, instead of 0
|
|
29
33
|
all_partitions = [str(part + 1) for part in range(settings.nuclia_partitions)]
|
|
30
34
|
|
|
31
35
|
# get replica number and total replicas from environment
|
|
32
36
|
logger.info(f"PARTITIONS: Total Replicas = {settings.total_replicas}")
|
|
33
37
|
if settings.replica_number == -1:
|
|
38
|
+
# Get replica number from hostname
|
|
34
39
|
hostname = os.environ.get("HOSTNAME")
|
|
35
40
|
if hostname is not None:
|
|
36
41
|
sts_values = hostname.split("-")
|
|
@@ -39,10 +44,16 @@ def assign_partitions(settings: Settings):
|
|
|
39
44
|
settings.replica_number = int(sts_values[-1])
|
|
40
45
|
except Exception:
|
|
41
46
|
logger.error(f"Could not extract replica number from hostname: {hostname}")
|
|
42
|
-
|
|
47
|
+
else:
|
|
48
|
+
logger.warning(f"Could not determine replica number from hostname: {hostname}")
|
|
49
|
+
else:
|
|
50
|
+
logger.warning(f"Could not determine replica number from hostname.")
|
|
43
51
|
|
|
44
52
|
if settings.replica_number == -1:
|
|
45
53
|
settings.replica_number = 0
|
|
54
|
+
else:
|
|
55
|
+
# We assume that replica numbers are set manually via env variables
|
|
56
|
+
pass
|
|
46
57
|
logger.info(f"PARTITIONS: Replica Number = {settings.replica_number}")
|
|
47
58
|
|
|
48
59
|
# calculate assigned partitions based on total replicas and own replica number
|
nucliadb/ingest/serialize.py
CHANGED
|
@@ -135,7 +135,7 @@ async def serialize(
|
|
|
135
135
|
slug: Optional[str] = None,
|
|
136
136
|
) -> Optional[Resource]:
|
|
137
137
|
driver = get_driver()
|
|
138
|
-
async with driver.
|
|
138
|
+
async with driver.ro_transaction() as txn:
|
|
139
139
|
return await managed_serialize(
|
|
140
140
|
txn,
|
|
141
141
|
kbid,
|
|
@@ -392,6 +392,6 @@ async def get_resource_uuid_by_slug(
|
|
|
392
392
|
) -> Optional[str]:
|
|
393
393
|
storage = await get_storage(service_name=service_name)
|
|
394
394
|
driver = get_driver()
|
|
395
|
-
async with driver.
|
|
395
|
+
async with driver.ro_transaction() as txn:
|
|
396
396
|
kb = KnowledgeBox(txn, storage, kbid)
|
|
397
397
|
return await kb.get_resource_uuid_by_slug(slug)
|