nucliadb 6.9.0.post5008__py3-none-any.whl → 6.9.0.post5016__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nucliadb might be problematic. Click here for more details.

@@ -20,7 +20,7 @@
20
20
 
21
21
 
22
22
  import asyncio
23
- from typing import Optional
23
+ from typing import Optional, Sequence
24
24
 
25
25
  from nidx_protos.noderesources_pb2 import Resource as IndexMessage
26
26
 
@@ -150,7 +150,7 @@ class IndexMessageBuilder:
150
150
  def _apply_field_deletions(
151
151
  self,
152
152
  brain: ResourceBrain,
153
- field_ids: list[FieldID],
153
+ field_ids: Sequence[FieldID],
154
154
  ) -> None:
155
155
  for field_id in field_ids:
156
156
  brain.delete_field(self.resource.generate_field_id(field_id))
@@ -158,20 +158,19 @@ class IndexMessageBuilder:
158
158
  @observer.wrap({"type": "writer_bm"})
159
159
  async def for_writer_bm(
160
160
  self,
161
- messages: list[BrokerMessage],
161
+ message: BrokerMessage,
162
162
  resource_created: bool,
163
163
  ) -> IndexMessage:
164
164
  """
165
- Builds the index message for the broker messages coming from the writer.
165
+ Builds the index message for the broker message coming from the writer.
166
166
  The writer messages are not adding new vectors to the index.
167
167
  """
168
- assert all(message.source == BrokerMessage.MessageSource.WRITER for message in messages)
168
+ assert message.source == BrokerMessage.MessageSource.WRITER
169
169
 
170
- deleted_fields = get_bm_deleted_fields(messages)
171
- self._apply_field_deletions(self.brain, deleted_fields)
170
+ self._apply_field_deletions(self.brain, message.delete_fields)
172
171
  await self._apply_resource_index_data(self.brain)
173
172
  basic = await self.get_basic()
174
- prefilter_update = needs_prefilter_update(messages)
173
+ prefilter_update = needs_prefilter_update(message)
175
174
  if prefilter_update:
176
175
  # Changes on some metadata at the resource level that is used for filtering require that we reindex all the fields
177
176
  # in the texts index (as it is the one used for prefiltering).
@@ -181,16 +180,16 @@ class IndexMessageBuilder:
181
180
  ]
182
181
  else:
183
182
  # Simply process the fields that are in the message
184
- fields_to_index = get_bm_modified_fields(messages)
183
+ fields_to_index = get_bm_modified_fields(message)
185
184
  for fieldid in fields_to_index:
186
- if fieldid in deleted_fields:
185
+ if fieldid in message.delete_fields:
187
186
  continue
188
187
  await self._apply_field_index_data(
189
188
  self.brain,
190
189
  fieldid,
191
190
  basic,
192
- texts=prefilter_update or needs_texts_update(fieldid, messages),
193
- paragraphs=needs_paragraphs_update(fieldid, messages),
191
+ texts=prefilter_update or needs_texts_update(fieldid, message),
192
+ paragraphs=needs_paragraphs_update(fieldid, message),
194
193
  relations=False, # Relations at the field level are not modified by the writer
195
194
  vectors=False, # Vectors are never added by the writer
196
195
  replace=not resource_created,
@@ -200,30 +199,29 @@ class IndexMessageBuilder:
200
199
  @observer.wrap({"type": "processor_bm"})
201
200
  async def for_processor_bm(
202
201
  self,
203
- messages: list[BrokerMessage],
202
+ message: BrokerMessage,
204
203
  ) -> IndexMessage:
205
204
  """
206
205
  Builds the index message for the broker messages coming from the processor.
207
206
  The processor can index new data to any index.
208
207
  """
209
- assert all(message.source == BrokerMessage.MessageSource.PROCESSOR for message in messages)
210
- deleted_fields = get_bm_deleted_fields(messages)
211
- self._apply_field_deletions(self.brain, deleted_fields)
208
+ assert message.source == BrokerMessage.MessageSource.PROCESSOR
209
+ self._apply_field_deletions(self.brain, message.delete_fields)
212
210
  await self._apply_resource_index_data(self.brain)
213
211
  basic = await self.get_basic()
214
- fields_to_index = get_bm_modified_fields(messages)
212
+ fields_to_index = get_bm_modified_fields(message)
215
213
  vectorsets_configs = await self.get_vectorsets_configs()
216
214
  for fieldid in fields_to_index:
217
- if fieldid in deleted_fields:
215
+ if fieldid in message.delete_fields:
218
216
  continue
219
217
  await self._apply_field_index_data(
220
218
  self.brain,
221
219
  fieldid,
222
220
  basic,
223
- texts=needs_texts_update(fieldid, messages),
224
- paragraphs=needs_paragraphs_update(fieldid, messages),
225
- relations=needs_relations_update(fieldid, messages),
226
- vectors=needs_vectors_update(fieldid, messages),
221
+ texts=needs_texts_update(fieldid, message),
222
+ paragraphs=needs_paragraphs_update(fieldid, message),
223
+ relations=needs_relations_update(fieldid, message),
224
+ vectors=needs_vectors_update(fieldid, message),
227
225
  replace=True,
228
226
  vectorset_configs=vectorsets_configs,
229
227
  )
@@ -270,130 +268,97 @@ class IndexMessageBuilder:
270
268
  return vectorset_configs
271
269
 
272
270
 
273
- def get_bm_deleted_fields(
274
- messages: list[BrokerMessage],
275
- ) -> list[FieldID]:
276
- deleted = []
277
- for message in messages:
278
- for field in message.delete_fields:
279
- if field not in deleted:
280
- deleted.append(field)
281
- return deleted
282
-
283
-
284
- def get_bm_modified_fields(messages: list[BrokerMessage]) -> list[FieldID]:
285
- message_source = get_messages_source(messages)
271
+ def get_bm_modified_fields(message: BrokerMessage) -> list[FieldID]:
286
272
  modified = set()
287
- for message in messages:
288
- # Added or modified fields need indexing
289
- for link in message.links:
290
- modified.add((link, FieldType.LINK))
291
- for file in message.files:
292
- modified.add((file, FieldType.FILE))
293
- for conv in message.conversations:
294
- modified.add((conv, FieldType.CONVERSATION))
295
- for text in message.texts:
296
- modified.add((text, FieldType.TEXT))
273
+ # Added or modified fields need indexing
274
+ for link in message.links:
275
+ modified.add((link, FieldType.LINK))
276
+ for file in message.files:
277
+ modified.add((file, FieldType.FILE))
278
+ for conv in message.conversations:
279
+ modified.add((conv, FieldType.CONVERSATION))
280
+ for text in message.texts:
281
+ modified.add((text, FieldType.TEXT))
282
+ if message.HasField("basic"):
283
+ # Add title and summary only if they have changed
284
+ if message.basic.title != "":
285
+ modified.add(("title", FieldType.GENERIC))
286
+ if message.basic.summary != "":
287
+ modified.add(("summary", FieldType.GENERIC))
288
+
289
+ if message.source == BrokerMessage.MessageSource.PROCESSOR:
290
+ # Messages with field metadata, extracted text or field vectors need indexing
291
+ for fm in message.field_metadata:
292
+ modified.add((fm.field.field, fm.field.field_type))
293
+ for et in message.extracted_text:
294
+ modified.add((et.field.field, et.field.field_type))
295
+ for fv in message.field_vectors:
296
+ modified.add((fv.field.field, fv.field.field_type))
297
+
298
+ if message.source == BrokerMessage.MessageSource.WRITER:
299
+ # Any field that has fieldmetadata annotations should be considered as modified
300
+ # and needs to be reindexed
297
301
  if message.HasField("basic"):
298
- # Add title and summary only if they have changed
299
- if message.basic.title != "":
300
- modified.add(("title", FieldType.GENERIC))
301
- if message.basic.summary != "":
302
- modified.add(("summary", FieldType.GENERIC))
303
-
304
- if message_source == BrokerMessage.MessageSource.PROCESSOR:
305
- # Messages with field metadata, extracted text or field vectors need indexing
306
- for fm in message.field_metadata:
307
- modified.add((fm.field.field, fm.field.field_type))
308
- for et in message.extracted_text:
309
- modified.add((et.field.field, et.field.field_type))
310
- for fv in message.field_vectors:
311
- modified.add((fv.field.field, fv.field.field_type))
312
-
313
- if message_source == BrokerMessage.MessageSource.WRITER:
314
- # Any field that has fieldmetadata annotations should be considered as modified
315
- # and needs to be reindexed
316
- if message.HasField("basic"):
317
- for ufm in message.basic.fieldmetadata:
318
- modified.add((ufm.field.field, ufm.field.field_type))
302
+ for ufm in message.basic.fieldmetadata:
303
+ modified.add((ufm.field.field, ufm.field.field_type))
319
304
  return [FieldID(field=field, field_type=field_type) for field, field_type in modified]
320
305
 
321
306
 
322
- def get_messages_source(messages: list[BrokerMessage]) -> BrokerMessage.MessageSource.ValueType:
323
- assert len(set(message.source for message in messages)) == 1
324
- return messages[0].source
325
-
326
-
327
- def needs_prefilter_update(messages: list[BrokerMessage]) -> bool:
328
- return any(message.reindex for message in messages)
307
+ def needs_prefilter_update(message: BrokerMessage) -> bool:
308
+ return message.reindex
329
309
 
330
310
 
331
- def needs_paragraphs_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
311
+ def needs_paragraphs_update(field_id: FieldID, message: BrokerMessage) -> bool:
332
312
  return (
333
- has_paragraph_annotations(field_id, messages)
334
- or has_new_extracted_text(field_id, messages)
335
- or has_new_field_metadata(field_id, messages)
313
+ has_paragraph_annotations(field_id, message)
314
+ or has_new_extracted_text(field_id, message)
315
+ or has_new_field_metadata(field_id, message)
336
316
  )
337
317
 
338
318
 
339
- def has_paragraph_annotations(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
340
- for message in messages:
341
- ufm = next(
342
- (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
343
- None,
344
- )
345
- if ufm is None:
346
- continue
347
- if len(ufm.paragraphs) > 0:
348
- return True
349
- return False
319
+ def has_paragraph_annotations(field_id: FieldID, message: BrokerMessage) -> bool:
320
+ ufm = next(
321
+ (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
322
+ None,
323
+ )
324
+ if ufm is None:
325
+ return False
326
+ return len(ufm.paragraphs) > 0
350
327
 
351
328
 
352
329
  def has_new_field_metadata(
353
330
  field_id: FieldID,
354
- messages: list[BrokerMessage],
331
+ message: BrokerMessage,
355
332
  ) -> bool:
356
- for message in messages:
357
- for field_metadata in message.field_metadata:
358
- if field_metadata.field == field_id:
359
- return True
360
- return False
333
+ return any(field_metadata.field == field_id for field_metadata in message.field_metadata)
361
334
 
362
335
 
363
336
  def has_new_extracted_text(
364
337
  field_id: FieldID,
365
- messages: list[BrokerMessage],
338
+ message: BrokerMessage,
366
339
  ) -> bool:
367
- for message in messages:
368
- for extracted_text in message.extracted_text:
369
- if extracted_text.field == field_id:
370
- return True
371
- return False
340
+ return any(extracted_text.field == field_id for extracted_text in message.extracted_text)
372
341
 
373
342
 
374
343
  def needs_texts_update(
375
344
  field_id: FieldID,
376
- messages: list[BrokerMessage],
345
+ message: BrokerMessage,
377
346
  ) -> bool:
378
- return has_new_extracted_text(field_id, messages) or has_new_field_metadata(field_id, messages)
347
+ return has_new_extracted_text(field_id, message) or has_new_field_metadata(field_id, message)
379
348
 
380
349
 
381
350
  def needs_vectors_update(
382
351
  field_id: FieldID,
383
- messages: list[BrokerMessage],
352
+ message: BrokerMessage,
384
353
  ) -> bool:
385
- for message in messages:
386
- for field_vectors in message.field_vectors:
387
- if field_vectors.field == field_id:
388
- return True
389
- return False
354
+ return any(field_vectors.field == field_id for field_vectors in message.field_vectors)
390
355
 
391
356
 
392
357
  def needs_relations_update(
393
358
  field_id: FieldID,
394
- messages: list[BrokerMessage],
359
+ message: BrokerMessage,
395
360
  ) -> bool:
396
- return has_new_field_metadata(field_id, messages) or has_new_extracted_text(field_id, messages)
361
+ return has_new_field_metadata(field_id, message) or has_new_extracted_text(field_id, message)
397
362
 
398
363
 
399
364
  async def get_resource_index_message(
@@ -141,8 +141,6 @@ class Processor:
141
141
  and can not use the txn id
142
142
  """
143
143
 
144
- messages: dict[str, list[writer_pb2.BrokerMessage]]
145
-
146
144
  def __init__(
147
145
  self,
148
146
  driver: Driver,
@@ -150,7 +148,6 @@ class Processor:
150
148
  pubsub: Optional[PubSubDriver] = None,
151
149
  partition: Optional[str] = None,
152
150
  ):
153
- self.messages = {}
154
151
  self.driver = driver
155
152
  self.storage = storage
156
153
  self.partition = partition
@@ -179,18 +176,12 @@ class Processor:
179
176
  if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
180
177
  await self.delete_resource(message, seqid, partition, transaction_check)
181
178
  elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
182
- await self.txn([message], seqid, partition, transaction_check)
183
- elif message.type == writer_pb2.BrokerMessage.MessageType.MULTI:
184
- # XXX Not supported right now
185
- # MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
186
- # This concept is probably not tenable with current architecture because
187
- # of how nats works and how we would need to manage rollbacks.
188
- # XXX Should this be removed?
189
- await self.multi(message, seqid)
190
- elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
191
- await self.commit(message, seqid, partition)
192
- elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
193
- await self.rollback(message, seqid, partition)
179
+ await self.txn(message, seqid, partition, transaction_check)
180
+ else: # pragma: no cover
181
+ logger.error(
182
+ f"Unsupported message type: {message.type}",
183
+ extra={"seqid": seqid, "partition": partition},
184
+ )
194
185
 
195
186
  async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
196
187
  if message.uuid is None:
@@ -242,7 +233,6 @@ class Processor:
242
233
  await self.notify_abort(
243
234
  partition=partition,
244
235
  seqid=seqid,
245
- multi=message.multiid,
246
236
  kbid=message.kbid,
247
237
  rid=message.uuid,
248
238
  source=message.source,
@@ -256,7 +246,6 @@ class Processor:
256
246
  await self.notify_commit(
257
247
  partition=partition,
258
248
  seqid=seqid,
259
- multi=message.multiid,
260
249
  message=message,
261
250
  write_type=writer_pb2.Notification.WriteType.DELETED,
262
251
  )
@@ -277,15 +266,12 @@ class Processor:
277
266
  @processor_observer.wrap({"type": "txn"})
278
267
  async def txn(
279
268
  self,
280
- messages: list[writer_pb2.BrokerMessage],
269
+ message: writer_pb2.BrokerMessage,
281
270
  seqid: int,
282
271
  partition: str,
283
272
  transaction_check: bool = True,
284
273
  ) -> None:
285
- if len(messages) == 0:
286
- return None
287
-
288
- kbid = messages[0].kbid
274
+ kbid = message.kbid
289
275
  if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
290
276
  logger.info(f"KB {kbid} is deleted: skiping txn")
291
277
  if transaction_check:
@@ -296,58 +282,55 @@ class Processor:
296
282
 
297
283
  async with self.driver.rw_transaction() as txn:
298
284
  try:
299
- multi = messages[0].multiid
300
285
  kb = KnowledgeBox(txn, self.storage, kbid)
301
- uuid = await self.get_resource_uuid(kb, messages[0])
286
+ uuid = await self.get_resource_uuid(kb, message)
287
+
302
288
  resource: Optional[Resource] = None
303
289
  handled_exception = None
304
290
  created = False
305
291
 
306
- for message in messages:
307
- if resource is not None:
308
- assert resource.uuid == message.uuid
309
-
310
- if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
311
- resource = await kb.get(uuid)
312
- if resource is None:
313
- # It's a new resource
314
- resource = await kb.add_resource(uuid, message.slug, message.basic)
315
- created = True
316
- else:
317
- # It's an update from writer for an existing resource
318
- ...
319
-
320
- elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
321
- resource = await kb.get(uuid)
322
- if resource is None:
323
- logger.info(
324
- f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
325
- )
326
- continue
327
- else:
328
- # It's an update from processor for an existing resource
329
- ...
330
-
331
- generated_fields = await get_generated_fields(message, resource)
332
- if generated_fields.is_not_empty():
333
- await send_generated_fields_to_process(
334
- kbid, resource, generated_fields, message
335
- )
336
- # TODO: remove this when processor sends the field set
337
- for generated_text in generated_fields.texts:
338
- message.texts[
339
- generated_text
340
- ].generated_by.data_augmentation.SetInParent()
341
-
292
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
293
+ resource = await kb.get(uuid)
294
+ if resource is None:
295
+ # It's a new resource
296
+ resource = await kb.add_resource(uuid, message.slug, message.basic)
297
+ created = True
298
+ else:
299
+ # It's an update from writer for an existing resource
300
+ ...
301
+
302
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
303
+ resource = await kb.get(uuid)
304
+ if resource is None:
305
+ logger.info(
306
+ f"Processor message for resource received but the resource does not exist, ignoring.",
307
+ extra={
308
+ "kbid": kbid,
309
+ "rid": uuid,
310
+ "seqid": seqid,
311
+ },
312
+ )
313
+ return None
342
314
  else:
343
- raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
315
+ # It's an update from processor for an existing resource
316
+ ...
344
317
 
345
- # apply changes from the broker message to the resource
346
- await self.apply_resource(message, resource, update=(not created))
318
+ generated_fields = await get_generated_fields(message, resource)
319
+ if generated_fields.is_not_empty():
320
+ await send_generated_fields_to_process(kbid, resource, generated_fields, message)
321
+ # TODO: remove this when processor sends the field set
322
+ for generated_text in generated_fields.texts:
323
+ message.texts[generated_text].generated_by.data_augmentation.SetInParent()
324
+
325
+ else: # pragma: no cover
326
+ raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
327
+
328
+ # apply changes from the broker message to the resource
329
+ await self.apply_resource(message, resource, update=(not created))
347
330
 
348
331
  # index message
349
332
  if resource and resource.modified:
350
- index_message = await self.generate_index_message(resource, messages, created)
333
+ index_message = await self.generate_index_message(resource, message, created)
351
334
  try:
352
335
  warnings = await self.index_resource(
353
336
  index_message=index_message,
@@ -357,7 +340,7 @@ class Processor:
357
340
  seqid=seqid,
358
341
  partition=partition,
359
342
  kb=kb,
360
- source=messages_source(messages),
343
+ source=to_index_message_source(message),
361
344
  )
362
345
  # Save indexing warnings
363
346
  for field_id, warning in warnings:
@@ -385,7 +368,6 @@ class Processor:
385
368
  await self.notify_commit(
386
369
  partition=partition,
387
370
  seqid=seqid,
388
- multi=multi,
389
371
  message=message,
390
372
  write_type=(
391
373
  writer_pb2.Notification.WriteType.CREATED
@@ -398,7 +380,6 @@ class Processor:
398
380
  await self.notify_abort(
399
381
  partition=partition,
400
382
  seqid=seqid,
401
- multi=multi,
402
383
  kbid=kbid,
403
384
  rid=uuid,
404
385
  source=message.source,
@@ -418,7 +399,6 @@ class Processor:
418
399
  await self.notify_abort(
419
400
  partition=partition,
420
401
  seqid=seqid,
421
- multi=multi,
422
402
  kbid=kbid,
423
403
  rid=uuid,
424
404
  source=message.source,
@@ -428,11 +408,10 @@ class Processor:
428
408
  # As we are in the middle of a transaction, we cannot let the exception raise directly
429
409
  # as we need to do some cleanup. The exception will be reraised at the end of the function
430
410
  # and then handled by the top caller, so errors can be handled in the same place.
431
- await self.deadletter(messages, partition, seqid)
411
+ await self.deadletter(message, partition, seqid)
432
412
  await self.notify_abort(
433
413
  partition=partition,
434
414
  seqid=seqid,
435
- multi=multi,
436
415
  kbid=kbid,
437
416
  rid=uuid,
438
417
  source=message.source,
@@ -518,17 +497,16 @@ class Processor:
518
497
  async def generate_index_message(
519
498
  self,
520
499
  resource: Resource,
521
- messages: list[writer_pb2.BrokerMessage],
500
+ message: writer_pb2.BrokerMessage,
522
501
  resource_created: bool,
523
502
  ) -> PBBrainResource:
524
503
  builder = IndexMessageBuilder(resource)
525
- message_source = messages_source(messages)
526
- if message_source == nodewriter_pb2.IndexMessageSource.WRITER:
527
- return await builder.for_writer_bm(messages, resource_created)
528
- elif message_source == nodewriter_pb2.IndexMessageSource.PROCESSOR:
529
- return await builder.for_processor_bm(messages)
504
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
505
+ return await builder.for_writer_bm(message, resource_created)
506
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
507
+ return await builder.for_processor_bm(message)
530
508
  else: # pragma: no cover
531
- raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
509
+ raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
532
510
 
533
511
  async def external_index_delete_resource(
534
512
  self, external_index_manager: ExternalIndexManager, resource_uuid: str
@@ -581,35 +559,8 @@ class Processor:
581
559
  resource_uuid=resource_uuid, resource_data=index_message
582
560
  )
583
561
 
584
- async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
585
- self.messages.setdefault(message.multiid, []).append(message)
586
-
587
- async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
588
- if message.multiid not in self.messages:
589
- # Error
590
- logger.error(f"Closed multi {message.multiid}")
591
- await self.deadletter([message], partition, seqid)
592
- else:
593
- await self.txn(self.messages[message.multiid], seqid, partition)
594
-
595
- async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
596
- # Error
597
- logger.error(f"Closed multi {message.multiid}")
598
- del self.messages[message.multiid]
599
- await self.notify_abort(
600
- partition=partition,
601
- seqid=seqid,
602
- multi=message.multiid,
603
- kbid=message.kbid,
604
- rid=message.uuid,
605
- source=message.source,
606
- )
607
-
608
- async def deadletter(
609
- self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
610
- ) -> None:
611
- for seq, message in enumerate(messages):
612
- await self.storage.deadletter(message, seq, seqid, partition)
562
+ async def deadletter(self, message: writer_pb2.BrokerMessage, partition: str, seqid: int) -> None:
563
+ await self.storage.deadletter(message, 0, seqid, partition)
613
564
 
614
565
  @processor_observer.wrap({"type": "apply_resource"})
615
566
  async def apply_resource(
@@ -669,7 +620,6 @@ class Processor:
669
620
  *,
670
621
  partition: str,
671
622
  seqid: int,
672
- multi: str,
673
623
  message: writer_pb2.BrokerMessage,
674
624
  write_type: writer_pb2.Notification.WriteType.ValueType,
675
625
  ):
@@ -677,7 +627,7 @@ class Processor:
677
627
  notification = writer_pb2.Notification(
678
628
  partition=int(partition),
679
629
  seqid=seqid,
680
- multi=multi,
630
+ multi="",
681
631
  uuid=message.uuid,
682
632
  kbid=message.kbid,
683
633
  action=writer_pb2.Notification.Action.COMMIT,
@@ -697,7 +647,6 @@ class Processor:
697
647
  *,
698
648
  partition: str,
699
649
  seqid: int,
700
- multi: str,
701
650
  kbid: str,
702
651
  rid: str,
703
652
  source: writer_pb2.BrokerMessage.MessageSource.ValueType,
@@ -705,7 +654,7 @@ class Processor:
705
654
  message = writer_pb2.Notification(
706
655
  partition=int(partition),
707
656
  seqid=seqid,
708
- multi=multi,
657
+ multi="",
709
658
  uuid=rid,
710
659
  kbid=kbid,
711
660
  action=writer_pb2.Notification.ABORT,
@@ -758,23 +707,16 @@ class Processor:
758
707
  return kbobj
759
708
 
760
709
 
761
- def messages_source(messages: list[writer_pb2.BrokerMessage]):
762
- from_writer = all(
763
- (message.source == writer_pb2.BrokerMessage.MessageSource.WRITER for message in messages)
764
- )
765
- from_processor = all(
766
- (message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR for message in messages)
767
- )
768
- if from_writer:
769
- source = nodewriter_pb2.IndexMessageSource.WRITER
770
- elif from_processor:
771
- source = nodewriter_pb2.IndexMessageSource.PROCESSOR
710
+ def to_index_message_source(message: writer_pb2.BrokerMessage):
711
+ if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
712
+ return nodewriter_pb2.IndexMessageSource.WRITER
713
+ elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
714
+ return nodewriter_pb2.IndexMessageSource.PROCESSOR
772
715
  else: # pragma: no cover
773
- msg = "Processor received multiple broker messages with different sources in the same txn!"
716
+ msg = f"Processor received a broker message with unexpected source! {message.source}"
774
717
  logger.error(msg)
775
718
  errors.capture_exception(Exception(msg))
776
- source = nodewriter_pb2.IndexMessageSource.PROCESSOR
777
- return source
719
+ return nodewriter_pb2.IndexMessageSource.PROCESSOR
778
720
 
779
721
 
780
722
  def has_vectors_operation(index_message: PBBrainResource) -> bool:
@@ -30,12 +30,16 @@ from nucliadb_protos import resources_pb2
30
30
  from nucliadb_utils.utilities import get_storage
31
31
 
32
32
 
33
- async def paragraph_source_image(kbid: str, paragraph: resources_pb2.Paragraph) -> Optional[Image]:
33
+ async def paragraph_source_image(
34
+ kbid: str, paragraph_id: ParagraphId, paragraph: resources_pb2.Paragraph
35
+ ) -> Optional[Image]:
34
36
  """Certain paragraphs are extracted from images using techniques like OCR or
35
37
  inception. If that's the case, return the original image for this paragraph.
36
38
 
37
39
  """
38
40
  source_image = paragraph.representation.reference_file
41
+ if not source_image:
42
+ return None
39
43
 
40
44
  if paragraph.kind not in (
41
45
  resources_pb2.Paragraph.TypeParagraph.OCR,
@@ -43,7 +47,7 @@ async def paragraph_source_image(kbid: str, paragraph: resources_pb2.Paragraph)
43
47
  ):
44
48
  return None
45
49
 
46
- field_id = ParagraphId.from_string(paragraph.key).field_id
50
+ field_id = paragraph_id.field_id
47
51
 
48
52
  # Paragraphs extracted from an image store its original image representation
49
53
  # in the reference file. The path is incomplete though, as it's stored in
@@ -219,7 +219,9 @@ async def hydrate_paragraph(
219
219
  hydrated.image = hydration_models.HydratedParagraphImage()
220
220
 
221
221
  if config.image.source_image:
222
- hydrated.image.source_image = await paragraph_source_image(kbid, paragraph)
222
+ hydrated.image.source_image = await paragraph_source_image(
223
+ kbid, paragraph_id, paragraph
224
+ )
223
225
 
224
226
  if config.page:
225
227
  if hydrated.page is None:
@@ -116,10 +116,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
116
116
  if split is not None:
117
117
  text = extracted_text.split_text[split]
118
118
  for paragraph in field_metadata.split_metadata[split].paragraphs:
119
- if paragraph.key == "":
120
- key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
121
- else:
122
- key = paragraph.key
119
+ key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
123
120
  if key == result:
124
121
  for sentence in paragraph.sentences:
125
122
  splitted_text = text[sentence.start : sentence.end]
@@ -127,10 +124,7 @@ async def get_sentences(kbid: str, result: str) -> list[str]:
127
124
  else:
128
125
  text = extracted_text.text
129
126
  for paragraph in field_metadata.metadata.paragraphs:
130
- if paragraph.key == "":
131
- key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
132
- else:
133
- key = paragraph.key
127
+ key = f"{rid}/{field_type}/{field}/{paragraph.start}-{paragraph.end}"
134
128
  if key == result:
135
129
  for sentence in paragraph.sentences:
136
130
  splitted_text = text[sentence.start : sentence.end]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.9.0.post5008
3
+ Version: 6.9.0.post5016
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: <4,>=3.9
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: nucliadb-telemetry[all]>=6.9.0.post5008
23
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.0.post5008
24
- Requires-Dist: nucliadb-protos>=6.9.0.post5008
25
- Requires-Dist: nucliadb-models>=6.9.0.post5008
26
- Requires-Dist: nidx-protos>=6.9.0.post5008
22
+ Requires-Dist: nucliadb-telemetry[all]>=6.9.0.post5016
23
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.9.0.post5016
24
+ Requires-Dist: nucliadb-protos>=6.9.0.post5016
25
+ Requires-Dist: nucliadb-models>=6.9.0.post5016
26
+ Requires-Dist: nidx-protos>=6.9.0.post5016
27
27
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
28
28
  Requires-Dist: nuclia-models>=0.50.0
29
29
  Requires-Dist: uvicorn[standard]
@@ -165,7 +165,7 @@ nucliadb/ingest/orm/brain_v2.py,sha256=8MAo1N_nhoGy73TvKBuaw-NaMxIanRPCNttw6dFY4
165
165
  nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
166
166
  nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
167
167
  nucliadb/ingest/orm/exceptions.py,sha256=gsp7TtVNQPiIEh-zf_UEJClwuFU0iu-5vzj0OrKMScg,1550
168
- nucliadb/ingest/orm/index_message.py,sha256=DWMTHJoVamUbK8opKl5csDvxfgz7c2j7phG1Ut4yIxk,15724
168
+ nucliadb/ingest/orm/index_message.py,sha256=DlGLuuuCsXR_rqxd6CEZMYuOx1TIiq1mR5ue114rGUk,14473
169
169
  nucliadb/ingest/orm/knowledgebox.py,sha256=OG9dmfklYf1PgTHwQd_iFZOociLEvUSMMv1ZKeUgecE,23910
170
170
  nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
171
171
  nucliadb/ingest/orm/resource.py,sha256=zQeZyZ-tCxr-DhonLobfZRkz_iEew0Y-cGfXeNNIHG0,40432
@@ -173,7 +173,7 @@ nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,
173
173
  nucliadb/ingest/orm/processor/__init__.py,sha256=xhDNKCxY0XNOlIVKEtM8QT75vDUkJIt7K-_VgGbbOQU,904
174
174
  nucliadb/ingest/orm/processor/auditing.py,sha256=gxn5v30KVaH0TnIjo715mWjzKGJ-DMviElEXJG9BNN4,4612
175
175
  nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
176
- nucliadb/ingest/orm/processor/processor.py,sha256=CPOf5KZgTcKLO6LxkFia2LueskcKBlv5bTifnTOZ1KE,33845
176
+ nucliadb/ingest/orm/processor/processor.py,sha256=3wlGiCXXaVpqLyCJ3AHWHfpKTwtywU78_LkNtpqOiuQ,31391
177
177
  nucliadb/ingest/orm/processor/sequence_manager.py,sha256=kUH0bCuM6NqpA0xSwfyb9igig3Btu57pc8VYnKggqx4,1693
178
178
  nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
179
179
  nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -274,8 +274,8 @@ nucliadb/search/search/chat/prompt.py,sha256=Bk69WVki5XIzXFYO6o1uQw1feHtuMwfEx_A
274
274
  nucliadb/search/search/chat/query.py,sha256=qWrwVEX_GrDV7LFRyC21BURtl-WsRt8BkIocY8njkKM,17147
275
275
  nucliadb/search/search/hydrator/__init__.py,sha256=3Pc-rcax4TI174qcrllnReE728DoJTaA8tpvBUFf98g,7005
276
276
  nucliadb/search/search/hydrator/fields.py,sha256=LhKw-aNU5eJqfZADtq3iB7AGXm0l_QabAAoSHJTk8Is,5962
277
- nucliadb/search/search/hydrator/images.py,sha256=qqjQbn82DgMUrNG21r8CNnpnOK4RnqHtnJ0jdQsvfyU,4550
278
- nucliadb/search/search/hydrator/paragraphs.py,sha256=yZXsTFAzH_c9Yf8sClLHsr2OwjUxoc2KQtUcV1RT7ug,12616
277
+ nucliadb/search/search/hydrator/images.py,sha256=gS7-dr1e_DpRQ6XaGxwMW1AMEV9a-u73h_jTVy7XY38,4602
278
+ nucliadb/search/search/hydrator/paragraphs.py,sha256=7JjifwhvY7V2TPSnXoIM6aBmh-i6WwTT29D5CvF2Nu0,12676
279
279
  nucliadb/search/search/hydrator/resources.py,sha256=1pNyUac8xWRnQVXU6FkDwsXHzmOZwqKRgKhGPCUdEhE,2004
280
280
  nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
281
281
  nucliadb/search/search/query_parser/exceptions.py,sha256=sVl9gRNzhE-s480LBBVkiXzNRbKhYRQN5F3it5tNNp8,939
@@ -343,7 +343,7 @@ nucliadb/train/generators/image_classifier.py,sha256=46YShcl7nf1_iLXZklWTAFvUNII
343
343
  nucliadb/train/generators/paragraph_classifier.py,sha256=Jk3B8a2zkAf-2-59RECiNmNP1Nz7f0-hfnS4rIE69xA,2817
344
344
  nucliadb/train/generators/paragraph_streaming.py,sha256=axGNYjOTgxGsOcAAvCz_rTYzZCdZf0f1s-Hzn-VvIX0,3646
345
345
  nucliadb/train/generators/question_answer_streaming.py,sha256=YXjWQc_SJ_TyXXJRg8tGkC9IeIzbRDcn0spQw9AlRNY,5740
346
- nucliadb/train/generators/sentence_classifier.py,sha256=4JBGtInnWUQrM9wBd-P7Z2lTK1Dka6U3PPDqnLbGnFM,5177
346
+ nucliadb/train/generators/sentence_classifier.py,sha256=izCAbjMgfH6isn48lj6z09gwCunouClqct8MuaYPwVc,4989
347
347
  nucliadb/train/generators/token_classifier.py,sha256=T8JOVR1vv5g7rn7HtcQcIZ3O5TFqh15uv5rOselLBVo,9617
348
348
  nucliadb/train/generators/utils.py,sha256=88cCuHUlOkn9UDP2NR4ru7A-5hbcLGd73sv9j0fMhmY,3590
349
349
  nucliadb/writer/__init__.py,sha256=S298mrZL3vr62OrBqi97mdLxgR5cReMlRJgnaQHZV7s,1304
@@ -384,8 +384,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
384
384
  nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
385
385
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
386
386
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
387
- nucliadb-6.9.0.post5008.dist-info/METADATA,sha256=lWKVwRZU7H-IiUnwX8V_02JAfoHhQ0Namxoqzqw0MDU,4158
388
- nucliadb-6.9.0.post5008.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
389
- nucliadb-6.9.0.post5008.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
390
- nucliadb-6.9.0.post5008.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
391
- nucliadb-6.9.0.post5008.dist-info/RECORD,,
387
+ nucliadb-6.9.0.post5016.dist-info/METADATA,sha256=rGI3PsztyvQi51WADCYUtBem7OTxjTVRTtoPRShwc2s,4158
388
+ nucliadb-6.9.0.post5016.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
389
+ nucliadb-6.9.0.post5016.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
390
+ nucliadb-6.9.0.post5016.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
391
+ nucliadb-6.9.0.post5016.dist-info/RECORD,,