nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic; consult the package registry's advisory page for more details.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,7 @@ from nidx_protos import noderesources_pb2, nodewriter_pb2
28
28
  from nidx_protos.noderesources_pb2 import Resource as PBBrainResource
29
29
 
30
30
  from nucliadb.common import datamanagers, locking
31
+ from nucliadb.common.catalog import catalog_delete, catalog_update
31
32
  from nucliadb.common.cluster.settings import settings as cluster_settings
32
33
  from nucliadb.common.cluster.utils import get_shard_manager
33
34
  from nucliadb.common.external_index_providers.base import ExternalIndexManager
@@ -61,8 +62,6 @@ from nucliadb_utils.cache.pubsub import PubSubDriver
61
62
  from nucliadb_utils.storages.storage import Storage
62
63
  from nucliadb_utils.utilities import get_storage, has_feature
63
64
 
64
- from .pgcatalog import pgcatalog_delete, pgcatalog_update
65
-
66
65
  logger = logging.getLogger("ingest-processor")
67
66
 
68
67
  MESSAGE_TO_NOTIFICATION_SOURCE = {
@@ -142,8 +141,6 @@ class Processor:
142
141
  and can not use the txn id
143
142
  """
144
143
 
145
- messages: dict[str, list[writer_pb2.BrokerMessage]]
146
-
147
144
  def __init__(
148
145
  self,
149
146
  driver: Driver,
@@ -151,7 +148,6 @@ class Processor:
151
148
  pubsub: Optional[PubSubDriver] = None,
152
149
  partition: Optional[str] = None,
153
150
  ):
154
- self.messages = {}
155
151
  self.driver = driver
156
152
  self.storage = storage
157
153
  self.partition = partition
@@ -180,18 +176,12 @@ class Processor:
180
176
  if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
181
177
  await self.delete_resource(message, seqid, partition, transaction_check)
182
178
  elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
183
- await self.txn([message], seqid, partition, transaction_check)
184
- elif message.type == writer_pb2.BrokerMessage.MessageType.MULTI:
185
- # XXX Not supported right now
186
- # MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
187
- # This concept is probably not tenable with current architecture because
188
- # of how nats works and how we would need to manage rollbacks.
189
- # XXX Should this be removed?
190
- await self.multi(message, seqid)
191
- elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
192
- await self.commit(message, seqid, partition)
193
- elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
194
- await self.rollback(message, seqid, partition)
179
+ await self.txn(message, seqid, partition, transaction_check)
180
+ else: # pragma: no cover
181
+ logger.error(
182
+ f"Unsupported message type: {message.type}",
183
+ extra={"seqid": seqid, "partition": partition},
184
+ )
195
185
 
196
186
  async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
197
187
  if message.uuid is None:
@@ -208,7 +198,7 @@ class Processor:
208
198
  partition: str,
209
199
  transaction_check: bool = True,
210
200
  ) -> None:
211
- async with self.driver.transaction() as txn:
201
+ async with self.driver.rw_transaction() as txn:
212
202
  try:
213
203
  kb = KnowledgeBox(txn, self.storage, message.kbid)
214
204
 
@@ -227,7 +217,8 @@ class Processor:
227
217
  shard = await kb.get_resource_shard(shard_id)
228
218
  if shard is None:
229
219
  raise AttributeError("Shard not available")
230
- await pgcatalog_delete(txn, message.kbid, uuid)
220
+
221
+ await catalog_delete(txn, message.kbid, uuid)
231
222
  external_index_manager = await get_external_index_manager(kbid=message.kbid)
232
223
  if external_index_manager is not None:
233
224
  await self.external_index_delete_resource(external_index_manager, uuid)
@@ -242,7 +233,6 @@ class Processor:
242
233
  await self.notify_abort(
243
234
  partition=partition,
244
235
  seqid=seqid,
245
- multi=message.multiid,
246
236
  kbid=message.kbid,
247
237
  rid=message.uuid,
248
238
  source=message.source,
@@ -256,7 +246,6 @@ class Processor:
256
246
  await self.notify_commit(
257
247
  partition=partition,
258
248
  seqid=seqid,
259
- multi=message.multiid,
260
249
  message=message,
261
250
  write_type=writer_pb2.Notification.WriteType.DELETED,
262
251
  )
@@ -267,7 +256,7 @@ class Processor:
267
256
  # so we commit it in a different transaction to make it as short as possible
268
257
  prev_txn = resource.txn
269
258
  try:
270
- async with self.driver.transaction() as txn:
259
+ async with self.driver.rw_transaction() as txn:
271
260
  resource.txn = txn
272
261
  await resource.set_slug()
273
262
  await txn.commit()
@@ -277,15 +266,12 @@ class Processor:
277
266
  @processor_observer.wrap({"type": "txn"})
278
267
  async def txn(
279
268
  self,
280
- messages: list[writer_pb2.BrokerMessage],
269
+ message: writer_pb2.BrokerMessage,
281
270
  seqid: int,
282
271
  partition: str,
283
272
  transaction_check: bool = True,
284
273
  ) -> None:
285
- if len(messages) == 0:
286
- return None
287
-
288
- kbid = messages[0].kbid
274
+ kbid = message.kbid
289
275
  if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
290
276
  logger.info(f"KB {kbid} is deleted: skiping txn")
291
277
  if transaction_check:
@@ -294,60 +280,57 @@ class Processor:
294
280
  await txn.commit()
295
281
  return None
296
282
 
297
- async with self.driver.transaction() as txn:
283
+ async with self.driver.rw_transaction() as txn:
298
284
  try:
299
- multi = messages[0].multiid
300
285
  kb = KnowledgeBox(txn, self.storage, kbid)
301
- uuid = await self.get_resource_uuid(kb, messages[0])
286
+ uuid = await self.get_resource_uuid(kb, message)
287
+
302
288
  resource: Optional[Resource] = None
303
289
  handled_exception = None
304
290
  created = False
305
291
 
306
- for message in messages:
307
- if resource is not None:
308
- assert resource.uuid == message.uuid
309
-
310
- if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
311
- resource = await kb.get(uuid)
312
- if resource is None:
313
- # It's a new resource
314
- resource = await kb.add_resource(uuid, message.slug, message.basic)
315
- created = True
316
- else:
317
- # It's an update from writer for an existing resource
318
- ...
319
-
320
- elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
321
- resource = await kb.get(uuid)
322
- if resource is None:
323
- logger.info(
324
- f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
325
- )
326
- continue
327
- else:
328
- # It's an update from processor for an existing resource
329
- ...
330
-
331
- generated_fields = await get_generated_fields(message, resource)
332
- if generated_fields.is_not_empty():
333
- await send_generated_fields_to_process(
334
- kbid, resource, generated_fields, message
335
- )
336
- # TODO: remove this when processor sends the field set
337
- for generated_text in generated_fields.texts:
338
- message.texts[
339
- generated_text
340
- ].generated_by.data_augmentation.SetInParent()
341
-
292
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
293
+ resource = await kb.get(uuid)
294
+ if resource is None:
295
+ # It's a new resource
296
+ resource = await kb.add_resource(uuid, message.slug, message.basic)
297
+ created = True
342
298
  else:
343
- raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
299
+ # It's an update from writer for an existing resource
300
+ ...
301
+
302
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
303
+ resource = await kb.get(uuid)
304
+ if resource is None:
305
+ logger.info(
306
+ f"Processor message for resource received but the resource does not exist, ignoring.",
307
+ extra={
308
+ "kbid": kbid,
309
+ "rid": uuid,
310
+ "seqid": seqid,
311
+ },
312
+ )
313
+ return None
314
+ else:
315
+ # It's an update from processor for an existing resource
316
+ ...
317
+
318
+ generated_fields = await get_generated_fields(message, resource)
319
+ if generated_fields.is_not_empty():
320
+ await send_generated_fields_to_process(kbid, resource, generated_fields, message)
321
+ # TODO: remove this when processor sends the field set
322
+ for generated_text in generated_fields.texts:
323
+ message.texts[generated_text].generated_by.data_augmentation.SetInParent()
324
+
325
+ else: # pragma: no cover
326
+ raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
344
327
 
345
- # apply changes from the broker message to the resource
346
- await self.apply_resource(message, resource, update=(not created))
328
+ # apply changes from the broker message to the resource
329
+ await self.apply_resource(message, resource, update=(not created))
347
330
 
348
331
  # index message
349
332
  if resource and resource.modified:
350
- index_message = await self.generate_index_message(resource, messages, created)
333
+ index_message = await self.generate_index_message(resource, message, created)
351
334
  try:
352
335
  warnings = await self.index_resource(
353
336
  index_message=index_message,
@@ -357,7 +340,7 @@ class Processor:
357
340
  seqid=seqid,
358
341
  partition=partition,
359
342
  kb=kb,
360
- source=messages_source(messages),
343
+ source=to_index_message_source(message),
361
344
  )
362
345
  # Save indexing warnings
363
346
  for field_id, warning in warnings:
@@ -374,8 +357,7 @@ class Processor:
374
357
  index_message.labels.remove(current_status[0])
375
358
  index_message.labels.append("/n/s/ERROR")
376
359
 
377
- await pgcatalog_update(txn, kbid, resource, index_message)
378
-
360
+ await catalog_update(txn, kbid, resource, index_message)
379
361
  if transaction_check:
380
362
  await sequence_manager.set_last_seqid(txn, partition, seqid)
381
363
  await txn.commit()
@@ -386,7 +368,6 @@ class Processor:
386
368
  await self.notify_commit(
387
369
  partition=partition,
388
370
  seqid=seqid,
389
- multi=multi,
390
371
  message=message,
391
372
  write_type=(
392
373
  writer_pb2.Notification.WriteType.CREATED
@@ -399,7 +380,6 @@ class Processor:
399
380
  await self.notify_abort(
400
381
  partition=partition,
401
382
  seqid=seqid,
402
- multi=multi,
403
383
  kbid=kbid,
404
384
  rid=uuid,
405
385
  source=message.source,
@@ -419,7 +399,6 @@ class Processor:
419
399
  await self.notify_abort(
420
400
  partition=partition,
421
401
  seqid=seqid,
422
- multi=multi,
423
402
  kbid=kbid,
424
403
  rid=uuid,
425
404
  source=message.source,
@@ -429,11 +408,10 @@ class Processor:
429
408
  # As we are in the middle of a transaction, we cannot let the exception raise directly
430
409
  # as we need to do some cleanup. The exception will be reraised at the end of the function
431
410
  # and then handled by the top caller, so errors can be handled in the same place.
432
- await self.deadletter(messages, partition, seqid)
411
+ await self.deadletter(message, partition, seqid)
433
412
  await self.notify_abort(
434
413
  partition=partition,
435
414
  seqid=seqid,
436
- multi=multi,
437
415
  kbid=kbid,
438
416
  rid=uuid,
439
417
  source=message.source,
@@ -468,22 +446,27 @@ class Processor:
468
446
  # a resource was move to another shard while it was being indexed
469
447
  shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)
470
448
 
471
- shard = None
472
- if shard_id is not None:
473
- # Resource already has a shard assigned
474
- shard = await kb.get_resource_shard(shard_id)
475
- if shard is None:
476
- raise AttributeError("Shard not available")
477
- else:
478
- # It's a new resource, get KB's current active shard to place new resource on
479
- shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
480
- if shard is None:
481
- # No current shard available, create a new one
482
- shard = await self.index_node_shard_manager.create_shard_by_kbid(txn, kbid)
483
- await datamanagers.resources.set_resource_shard_id(
484
- txn, kbid=kbid, rid=uuid, shard=shard.shard
485
- )
486
- return shard
449
+ shard = None
450
+ if shard_id is not None:
451
+ # Resource already has a shard assigned
452
+ shard = await kb.get_resource_shard(shard_id)
453
+ if shard is None:
454
+ raise AttributeError("Shard not available")
455
+ else:
456
+ # It's a new resource, get KB's current active shard to place new resource on
457
+ shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
458
+ if shard is None:
459
+ # No current shard available, create a new one
460
+ async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
461
+ kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
462
+ prewarm = kb_config is not None and kb_config.prewarm_enabled
463
+ shard = await self.index_node_shard_manager.create_shard_by_kbid(
464
+ txn, kbid, prewarm_enabled=prewarm
465
+ )
466
+ await datamanagers.resources.set_resource_shard_id(
467
+ txn, kbid=kbid, rid=uuid, shard=shard.shard
468
+ )
469
+ return shard
487
470
 
488
471
  @processor_observer.wrap({"type": "index_resource"})
489
472
  async def index_resource(
@@ -519,17 +502,16 @@ class Processor:
519
502
  async def generate_index_message(
520
503
  self,
521
504
  resource: Resource,
522
- messages: list[writer_pb2.BrokerMessage],
505
+ message: writer_pb2.BrokerMessage,
523
506
  resource_created: bool,
524
507
  ) -> PBBrainResource:
525
508
  builder = IndexMessageBuilder(resource)
526
- message_source = messages_source(messages)
527
- if message_source == nodewriter_pb2.IndexMessageSource.WRITER:
528
- return await builder.for_writer_bm(messages, resource_created)
529
- elif message_source == nodewriter_pb2.IndexMessageSource.PROCESSOR:
530
- return await builder.for_processor_bm(messages)
509
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
510
+ return await builder.for_writer_bm(message, resource_created)
511
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
512
+ return await builder.for_processor_bm(message)
531
513
  else: # pragma: no cover
532
- raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
514
+ raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
533
515
 
534
516
  async def external_index_delete_resource(
535
517
  self, external_index_manager: ExternalIndexManager, resource_uuid: str
@@ -582,35 +564,8 @@ class Processor:
582
564
  resource_uuid=resource_uuid, resource_data=index_message
583
565
  )
584
566
 
585
- async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
586
- self.messages.setdefault(message.multiid, []).append(message)
587
-
588
- async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
589
- if message.multiid not in self.messages:
590
- # Error
591
- logger.error(f"Closed multi {message.multiid}")
592
- await self.deadletter([message], partition, seqid)
593
- else:
594
- await self.txn(self.messages[message.multiid], seqid, partition)
595
-
596
- async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
597
- # Error
598
- logger.error(f"Closed multi {message.multiid}")
599
- del self.messages[message.multiid]
600
- await self.notify_abort(
601
- partition=partition,
602
- seqid=seqid,
603
- multi=message.multiid,
604
- kbid=message.kbid,
605
- rid=message.uuid,
606
- source=message.source,
607
- )
608
-
609
- async def deadletter(
610
- self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
611
- ) -> None:
612
- for seq, message in enumerate(messages):
613
- await self.storage.deadletter(message, seq, seqid, partition)
567
+ async def deadletter(self, message: writer_pb2.BrokerMessage, partition: str, seqid: int) -> None:
568
+ await self.storage.deadletter(message, 0, seqid, partition)
614
569
 
615
570
  @processor_observer.wrap({"type": "apply_resource"})
616
571
  async def apply_resource(
@@ -670,7 +625,6 @@ class Processor:
670
625
  *,
671
626
  partition: str,
672
627
  seqid: int,
673
- multi: str,
674
628
  message: writer_pb2.BrokerMessage,
675
629
  write_type: writer_pb2.Notification.WriteType.ValueType,
676
630
  ):
@@ -678,7 +632,7 @@ class Processor:
678
632
  notification = writer_pb2.Notification(
679
633
  partition=int(partition),
680
634
  seqid=seqid,
681
- multi=multi,
635
+ multi="",
682
636
  uuid=message.uuid,
683
637
  kbid=message.kbid,
684
638
  action=writer_pb2.Notification.Action.COMMIT,
@@ -698,7 +652,6 @@ class Processor:
698
652
  *,
699
653
  partition: str,
700
654
  seqid: int,
701
- multi: str,
702
655
  kbid: str,
703
656
  rid: str,
704
657
  source: writer_pb2.BrokerMessage.MessageSource.ValueType,
@@ -706,7 +659,7 @@ class Processor:
706
659
  message = writer_pb2.Notification(
707
660
  partition=int(partition),
708
661
  seqid=seqid,
709
- multi=multi,
662
+ multi="",
710
663
  uuid=rid,
711
664
  kbid=kbid,
712
665
  action=writer_pb2.Notification.ABORT,
@@ -731,7 +684,7 @@ class Processor:
731
684
  logger.info(f"Skip when resource does not even have basic metadata: {resource}")
732
685
  return
733
686
  try:
734
- async with self.driver.transaction() as txn:
687
+ async with self.driver.rw_transaction() as txn:
735
688
  kb.txn = resource.txn = txn
736
689
  resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
737
690
  await resource.set_basic(resource.basic)
@@ -759,23 +712,16 @@ class Processor:
759
712
  return kbobj
760
713
 
761
714
 
762
- def messages_source(messages: list[writer_pb2.BrokerMessage]):
763
- from_writer = all(
764
- (message.source == writer_pb2.BrokerMessage.MessageSource.WRITER for message in messages)
765
- )
766
- from_processor = all(
767
- (message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR for message in messages)
768
- )
769
- if from_writer:
770
- source = nodewriter_pb2.IndexMessageSource.WRITER
771
- elif from_processor:
772
- source = nodewriter_pb2.IndexMessageSource.PROCESSOR
715
+ def to_index_message_source(message: writer_pb2.BrokerMessage):
716
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
717
+ return nodewriter_pb2.IndexMessageSource.WRITER
718
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
719
+ return nodewriter_pb2.IndexMessageSource.PROCESSOR
773
720
  else: # pragma: no cover
774
- msg = "Processor received multiple broker messages with different sources in the same txn!"
721
+ msg = f"Processor received a broker message with unexpected source! {message.source}"
775
722
  logger.error(msg)
776
723
  errors.capture_exception(Exception(msg))
777
- source = nodewriter_pb2.IndexMessageSource.PROCESSOR
778
- return source
724
+ return nodewriter_pb2.IndexMessageSource.PROCESSOR
779
725
 
780
726
 
781
727
  def has_vectors_operation(index_message: PBBrainResource) -> bool:
@@ -31,7 +31,7 @@ async def get_last_seqid(driver: Driver, worker: str) -> Optional[int]:
31
31
  This is oriented towards the ingest consumer and processor,
32
32
  which is the only one that should be writing to this key.
33
33
  """
34
- async with driver.transaction(read_only=True) as txn:
34
+ async with driver.ro_transaction() as txn:
35
35
  key = TXNID.format(worker=worker)
36
36
  last_seq = await txn.get(key)
37
37
  if not last_seq:
@@ -135,7 +135,7 @@ class Resource:
135
135
  await self.txn.set(new_key, self.uuid.encode())
136
136
 
137
137
  # Basic
138
- async def get_basic(self) -> Optional[PBBasic]:
138
+ async def get_basic(self) -> PBBasic:
139
139
  if self.basic is None:
140
140
  basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
141
141
  self.basic = basic if basic is not None else PBBasic()
@@ -354,7 +354,16 @@ class Resource:
354
354
 
355
355
  await field_obj.delete()
356
356
 
357
+ async def field_exists(self, type: FieldType.ValueType, field: str) -> bool:
358
+ """Return whether this resource has this field or not."""
359
+ all_fields_ids = await self.get_fields_ids()
360
+ for field_type, field_id in all_fields_ids:
361
+ if field_type == type and field_id == field:
362
+ return True
363
+ return False
364
+
357
365
  def has_field(self, type: FieldType.ValueType, field: str) -> bool:
366
+ # REVIEW: are we sure we don't want to actually check this?
358
367
  return (type, field) in self.fields
359
368
 
360
369
  async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
@@ -25,12 +25,17 @@ from nucliadb.ingest.settings import Settings
25
25
 
26
26
 
27
27
  def assign_partitions(settings: Settings):
28
+ """
29
+ This function dynamically assigns the partitions to the current ingest sts
30
+ replica based on its hostname, typically (ingest-0, ingest-1, etc).
31
+ """
28
32
  # partitions start from 1, instead of 0
29
33
  all_partitions = [str(part + 1) for part in range(settings.nuclia_partitions)]
30
34
 
31
35
  # get replica number and total replicas from environment
32
36
  logger.info(f"PARTITIONS: Total Replicas = {settings.total_replicas}")
33
37
  if settings.replica_number == -1:
38
+ # Get replica number from hostname
34
39
  hostname = os.environ.get("HOSTNAME")
35
40
  if hostname is not None:
36
41
  sts_values = hostname.split("-")
@@ -39,10 +44,16 @@ def assign_partitions(settings: Settings):
39
44
  settings.replica_number = int(sts_values[-1])
40
45
  except Exception:
41
46
  logger.error(f"Could not extract replica number from hostname: {hostname}")
42
- pass
47
+ else:
48
+ logger.warning(f"Could not determine replica number from hostname: {hostname}")
49
+ else:
50
+ logger.warning(f"Could not determine replica number from hostname.")
43
51
 
44
52
  if settings.replica_number == -1:
45
53
  settings.replica_number = 0
54
+ else:
55
+ # We assume that replica numbers are set manually via env variables
56
+ pass
46
57
  logger.info(f"PARTITIONS: Replica Number = {settings.replica_number}")
47
58
 
48
59
  # calculate assigned partitions based on total replicas and own replica number
@@ -135,7 +135,7 @@ async def serialize(
135
135
  slug: Optional[str] = None,
136
136
  ) -> Optional[Resource]:
137
137
  driver = get_driver()
138
- async with driver.transaction(read_only=True) as txn:
138
+ async with driver.ro_transaction() as txn:
139
139
  return await managed_serialize(
140
140
  txn,
141
141
  kbid,
@@ -392,6 +392,6 @@ async def get_resource_uuid_by_slug(
392
392
  ) -> Optional[str]:
393
393
  storage = await get_storage(service_name=service_name)
394
394
  driver = get_driver()
395
- async with driver.transaction(read_only=True) as txn:
395
+ async with driver.ro_transaction() as txn:
396
396
  kb = KnowledgeBox(txn, storage, kbid)
397
397
  return await kb.get_resource_uuid_by_slug(slug)