nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -21,13 +21,14 @@ from __future__ import annotations
 
  import asyncio
  import logging
+ from collections import defaultdict
  from concurrent.futures import ThreadPoolExecutor
  from functools import partial
- from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
+ from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
 
  from nucliadb.common import datamanagers
  from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
- from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
+ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
  from nucliadb.common.maindb.driver import Transaction
  from nucliadb.ingest.fields.base import Field
  from nucliadb.ingest.fields.conversation import Conversation
@@ -49,8 +50,8 @@ from nucliadb_protos.resources_pb2 import (
  ExtractedVectorsWrapper,
  FieldClassifications,
  FieldComputedMetadataWrapper,
+ FieldFile,
  FieldID,
- FieldMetadata,
  FieldQuestionAnswerWrapper,
  FieldText,
  FieldType,
@@ -59,7 +60,6 @@ from nucliadb_protos.resources_pb2 import (
  LinkExtractedData,
  Metadata,
  Paragraph,
- ParagraphAnnotation,
  )
  from nucliadb_protos.resources_pb2 import Basic as PBBasic
  from nucliadb_protos.resources_pb2 import Conversation as PBConversation
@@ -67,18 +67,11 @@ from nucliadb_protos.resources_pb2 import Extra as PBExtra
  from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
  from nucliadb_protos.resources_pb2 import Origin as PBOrigin
  from nucliadb_protos.resources_pb2 import Relations as PBRelations
- from nucliadb_protos.train_pb2 import (
- EnabledMetadata,
- TrainField,
- TrainMetadata,
- TrainParagraph,
- TrainResource,
- TrainSentence,
- )
- from nucliadb_protos.train_pb2 import Position as TrainPosition
  from nucliadb_protos.utils_pb2 import Relation as PBRelation
  from nucliadb_protos.writer_pb2 import BrokerMessage
+ from nucliadb_utils import const
  from nucliadb_utils.storages.storage import Storage
+ from nucliadb_utils.utilities import has_feature
 
  if TYPE_CHECKING: # pragma: no cover
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
@@ -226,6 +219,7 @@ class Resource:
  page_positions=page_positions,
  extracted_text=await field_obj.get_extracted_text(),
  basic_user_field_metadata=user_field_metadata,
+ replace_field=True,
  )
 
  # Some basic fields are computed off field metadata.
@@ -336,39 +330,28 @@ class Resource:
  page_positions=page_positions,
  extracted_text=await field.get_extracted_text(),
  basic_user_field_metadata=user_field_metadata,
+ replace_field=reindex,
  )
 
  if self.disable_vectors is False:
- # XXX: while we don't remove the "default" vectorset concept, we
- # need to do use None as the default one
- vo = await field.get_vectors()
- if vo is not None:
- async with datamanagers.with_ro_transaction() as ro_txn:
- dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
- ro_txn, kbid=self.kb.kbid
- )
- brain.apply_field_vectors(
- field_key,
- vo,
- matryoshka_vector_dimension=dimension,
- replace_field=reindex,
- )
-
  vectorset_configs = []
- async with datamanagers.with_ro_transaction() as ro_txn:
- async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
- ro_txn, kbid=self.kb.kbid
- ):
- vectorset_configs.append(vectorset_config)
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
+ self.txn, kbid=self.kb.kbid
+ ):
+ vectorset_configs.append(vectorset_config)
+
  for vectorset_config in vectorset_configs:
- vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
+ vo = await field.get_vectors(
+ vectorset=vectorset_config.vectorset_id,
+ storage_key_kind=vectorset_config.storage_key_kind,
+ )
  if vo is not None:
  dimension = vectorset_config.vectorset_index_config.vector_dimension
  brain.apply_field_vectors(
  field_key,
  vo,
  vectorset=vectorset_config.vectorset_id,
- matryoshka_vector_dimension=dimension,
+ vector_dimension=dimension,
  replace_field=reindex,
  )
  return brain
@@ -501,7 +484,6 @@ class Resource:
  @processor_observer.wrap({"type": "apply_fields"})
  async def apply_fields(self, message: BrokerMessage):
  message_updated_fields = []
-
  for field, text in message.texts.items():
  fid = FieldID(field_type=FieldType.TEXT, field=field)
  await self.set_field(fid.field_type, fid.field, text)
@@ -532,14 +514,99 @@ class Resource:
  errors=message.errors, # type: ignore
  )
 
+ @processor_observer.wrap({"type": "apply_fields_status"})
+ async def apply_fields_status(self, message: BrokerMessage, updated_fields: list[FieldID]):
+ # Dictionary of all errors per field (we may have several due to DA tasks)
+ errors_by_field: dict[tuple[FieldType.ValueType, str], list[writer_pb2.Error]] = defaultdict(
+ list
+ )
+
+ # Make sure if a file is updated without errors, it ends up in errors_by_field
+ for field_id in updated_fields:
+ errors_by_field[(field_id.field_type, field_id.field)] = []
+ for fs in message.field_statuses:
+ errors_by_field[(fs.id.field_type, fs.id.field)] = []
+
+ for error in message.errors:
+ errors_by_field[(error.field_type, error.field)].append(error)
+
+ # If this message comes from the processor (not a DA worker), we clear all previous errors
+ # TODO: When generated_by is populated with DA tasks by processor, remove only related errors
+ from_processor = any((x.WhichOneof("generator") == "processor" for x in message.generated_by))
+
+ for (field_type, field), errors in errors_by_field.items():
+ field_obj = await self.get_field(field, field_type, load=False)
+ if from_processor:
+ # Create a new field status to clear all errors
+ status = writer_pb2.FieldStatus()
+ else:
+ status = await field_obj.get_status() or writer_pb2.FieldStatus()
+
+ for error in errors:
+ field_error = writer_pb2.FieldError(
+ source_error=error,
+ )
+ field_error.created.GetCurrentTime()
+ status.errors.append(field_error)
+
+ # We infer the status for processor messages
+ if message.source == BrokerMessage.MessageSource.PROCESSOR:
+ if len(status.errors) > 0:
+ status.status = writer_pb2.FieldStatus.Status.ERROR
+ else:
+ status.status = writer_pb2.FieldStatus.Status.PROCESSED
+ else:
+ field_status = next(
+ (
+ fs.status
+ for fs in message.field_statuses
+ if fs.id.field_type == field_type and fs.id.field == field
+ ),
+ None,
+ )
+ if field_status is not None:
+ status.status = field_status
+ # If the field was not found and the message comes from the writer, this implicitly sets the
+ # status to the default value, which is PROCESSING. This covers the case of new field creation.
+
+ await field_obj.set_status(status)
+
+ async def update_status(self):
+ field_ids = await self.get_all_field_ids(for_update=False)
+ if field_ids is None:
+ return
+ field_statuses = await datamanagers.fields.get_statuses(
+ self.txn, kbid=self.kb.kbid, rid=self.uuid, fields=field_ids.fields
+ )
+
+ # If any field is processing -> PENDING
+ if any((f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses)):
+ self.basic.metadata.status = PBMetadata.Status.PENDING
+ # If we have any non-DA error -> ERROR
+ elif any(
+ (
+ f.status == writer_pb2.FieldStatus.Status.ERROR
+ and any(
+ (
+ e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
+ for e in f.errors
+ )
+ )
+ for f in field_statuses
+ )
+ ):
+ self.basic.metadata.status = PBMetadata.Status.ERROR
+ # Otherwise (everything processed or we only have DA errors) -> PROCESSED
+ else:
+ self.basic.metadata.status = PBMetadata.Status.PROCESSED
+
  @processor_observer.wrap({"type": "apply_extracted"})
  async def apply_extracted(self, message: BrokerMessage):
- errors = False
- field_obj: Field
- for error in message.errors:
- field_obj = await self.get_field(error.field, error.field_type, load=False)
- await field_obj.set_error(error)
- errors = True
+ if not has_feature(const.Features.FIELD_STATUS):
+ field_obj: Field
+ for error in message.errors:
+ field_obj = await self.get_field(error.field, error.field_type, load=False)
+ await field_obj.set_error(error)
 
  await self.get_basic()
  if self.basic is None:
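Note on the hunk above: apply_fields_status records a per-field status for every field touched by the broker message, and update_status then rolls those field statuses up into the resource status. A minimal, illustrative sketch of that roll-up rule, using simplified stand-ins rather than the writer_pb2 protobuf types:

# Illustrative sketch only: the precedence rule the new update_status() applies.
from dataclasses import dataclass, field


@dataclass
class FieldError:
    code: str  # e.g. "GENERIC" or "DATAAUGMENTATION"


@dataclass
class FieldStatus:
    status: str  # "PENDING", "ERROR" or "PROCESSED"
    errors: list[FieldError] = field(default_factory=list)


def resolve_resource_status(field_statuses: list[FieldStatus]) -> str:
    # Any field still being processed keeps the whole resource PENDING.
    if any(f.status == "PENDING" for f in field_statuses):
        return "PENDING"
    # Any field with a non data-augmentation error marks the resource as ERROR.
    if any(
        f.status == "ERROR" and any(e.code != "DATAAUGMENTATION" for e in f.errors)
        for f in field_statuses
    ):
        return "ERROR"
    # Everything processed, or only data-augmentation errors: PROCESSED.
    return "PROCESSED"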
@@ -548,19 +615,29 @@ class Resource:
  previous_basic = Basic()
  previous_basic.CopyFrom(self.basic)
 
- if errors:
- self.basic.metadata.status = PBMetadata.Status.ERROR
- elif errors is False and message.source is message.MessageSource.PROCESSOR:
- self.basic.metadata.status = PBMetadata.Status.PROCESSED
-
  maybe_update_basic_icon(self.basic, get_text_field_mimetype(message))
 
  for question_answers in message.question_answers:
  await self._apply_question_answers(question_answers)
 
+ for field_id in message.delete_question_answers:
+ await self._delete_question_answers(field_id)
+
  for extracted_text in message.extracted_text:
  await self._apply_extracted_text(extracted_text)
 
+ # Update field and resource status depending on processing results
+ await self.apply_fields_status(message, self._modified_extracted_text)
+ if has_feature(const.Features.FIELD_STATUS):
+ # Compute resource status based on all fields statuses
+ await self.update_status()
+ else:
+ # Old code path, compute resource status based on the presence of errors in this BrokerMessage
+ if message.errors:
+ self.basic.metadata.status = PBMetadata.Status.ERROR
+ elif message.source is message.MessageSource.PROCESSOR:
+ self.basic.metadata.status = PBMetadata.Status.PROCESSED
+
  extracted_languages = []
 
  for link_extracted_data in message.link_extracted_data:
@@ -584,8 +661,7 @@ class Resource:
  # Upload to binary storage
  # Vector indexing
  if self.disable_vectors is False:
- for field_vectors in message.field_vectors:
- await self._apply_extracted_vectors(field_vectors)
+ await self._apply_extracted_vectors(message.field_vectors)
 
  # Only uploading to binary storage
  for field_large_metadata in message.field_large_metadata:
@@ -614,6 +690,10 @@ class Resource:
  field_obj = await self.get_field(field.field, field.field_type, load=False)
  await field_obj.set_question_answers(question_answers)
 
+ async def _delete_question_answers(self, field_id: FieldID):
+ field_obj = await self.get_field(field_id.field, field_id.field_type, load=False)
+ await field_obj.delete_question_answers()
+
  async def _apply_link_extracted_data(self, link_extracted_data: LinkExtractedData):
  assert self.basic is not None
  field_link: Link = await self.get_field(
@@ -679,15 +759,52 @@ class Resource:
  maybe_update_basic_icon(self.basic, file_extracted_data.icon)
  maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
 
+ async def _should_update_resource_title_from_file_metadata(self) -> bool:
+ """
+ We only want to update resource title from file metadata if the title is empty,
+ equal to the resource uuid or equal to any of the file filenames in the resource.
+ """
+ basic = await self.get_basic()
+ if basic is None:
+ return True
+ current_title = basic.title
+ if current_title == "":
+ # If the title is empty, we should update it
+ return True
+ if current_title == self.uuid:
+ # If the title is the same as the resource uuid, we should update it
+ return True
+ fields = await self.get_fields(force=True)
+ filenames = set()
+ for (field_type, _), field_obj in fields.items():
+ if field_type == FieldType.FILE:
+ field_value: Optional[FieldFile] = await field_obj.get_value()
+ if field_value is not None:
+ if field_value.file.filename not in ("", None):
+ filenames.add(field_value.file.filename)
+ if current_title in filenames:
+ # If the title is equal to any of the file filenames, we should update it
+ return True
+ return False
+
  async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
  """
  Update the resource title with the first file that has a title extracted.
  """
- for file_extracted_data in message.file_extracted_data:
- if file_extracted_data.title != "":
- await self.update_resource_title(file_extracted_data.title)
- # Break after the first file with a title is found
- break
+ if not await self._should_update_resource_title_from_file_metadata():
+ return
+ for fed in message.file_extracted_data:
+ if fed.title == "":
+ # Skip if the extracted title is empty
+ continue
+ fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
+ logger.info(
+ "Updating resource title from file extracted data",
+ extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+ )
+ await self.update_resource_title(fed.title)
+ # Break after the first file with a title is found
+ break
 
  async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
  assert self.basic is not None
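Note on the hunk above: the new _should_update_resource_title_from_file_metadata guard only lets a file-extracted title overwrite the resource title when the current title looks like a placeholder. An illustrative, self-contained sketch of that check (hypothetical helper, not nucliadb API):

def should_replace_title(current_title: str, resource_uuid: str, filenames: set[str]) -> bool:
    if current_title == "":               # empty title
        return True
    if current_title == resource_uuid:    # title defaulted to the resource uuid
        return True
    return current_title in filenames     # title is just one of the uploaded filenames


# Example: a resource whose title is its own uuid gets the extracted title;
# a hand-written title is left alone.
assert should_replace_title("rid-123", "rid-123", {"report.pdf"}) is True
assert should_replace_title("Quarterly report", "rid-123", {"report.pdf"}) is False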
@@ -723,6 +840,7 @@ class Resource:
  page_positions=page_positions,
  extracted_text=extracted_text,
  basic_user_field_metadata=user_field_metadata,
+ replace_field=True,
  )
  loop = asyncio.get_running_loop()
  await loop.run_in_executor(_executor, apply_field_metadata)
@@ -731,55 +849,69 @@ class Resource:
 
  add_field_classifications(self.basic, field_metadata)
 
- async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
- # Store vectors in the resource
-
- if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
- # skipping because field does not exist
- logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
- return
-
- field_obj = await self.get_field(
- field_vectors.field.field,
- field_vectors.field.field_type,
- load=False,
- )
- vo = await field_obj.set_vectors(field_vectors)
-
- # Prepare vectors to be indexed
+ async def _apply_extracted_vectors(
+ self,
+ fields_vectors: Sequence[ExtractedVectorsWrapper],
+ ):
+ await self.get_fields(force=True)
+ vectorsets = {
+ vectorset_id: vs
+ async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kb.kbid)
+ }
+
+ for field_vectors in fields_vectors:
+ # Bw/c with extracted vectors without vectorsets
+ if not field_vectors.vectorset_id:
+ assert (
+ len(vectorsets) == 1
+ ), "Invalid broker message, can't ingest vectors from unknown vectorset to KB with multiple vectorsets"
+ vectorset = list(vectorsets.values())[0]
 
- field_key = self.generate_field_id(field_vectors.field)
- if vo is not None:
- vectorset_id = field_vectors.vectorset_id or None
- if vectorset_id is None:
- dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
- self.txn, kbid=self.kb.kbid
- )
  else:
- config = await datamanagers.vectorsets.get(
- self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
- )
- if config is None:
+ if field_vectors.vectorset_id not in vectorsets:
  logger.warning(
- f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
+ "Dropping extracted vectors for unknown vectorset",
+ extra={"kbid": self.kb.kbid, "vectorset": field_vectors.vectorset_id},
  )
- return
- dimension = config.vectorset_index_config.vector_dimension
- if not dimension:
- raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
+ continue
+
+ vectorset = vectorsets[field_vectors.vectorset_id]
+
+ # Store vectors in the resource
+
+ if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
+ # skipping because field does not exist
+ logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
+ return
+
+ field_obj = await self.get_field(
+ field_vectors.field.field,
+ field_vectors.field.field_type,
+ load=False,
+ )
+ vo = await field_obj.set_vectors(
+ field_vectors, vectorset.vectorset_id, vectorset.storage_key_kind
+ )
+ if vo is None:
+ raise AttributeError("Vector object not found on set_vectors")
+
+ # Prepare vectors to be indexed
+
+ field_key = self.generate_field_id(field_vectors.field)
+ dimension = vectorset.vectorset_index_config.vector_dimension
+ if not dimension:
+ raise ValueError(f"Vector dimension not set for vectorset '{vectorset.vectorset_id}'")
 
  apply_field_vectors_partial = partial(
  self.indexer.apply_field_vectors,
  field_key,
  vo,
- vectorset=vectorset_id,
+ vectorset=vectorset.vectorset_id,
  replace_field=True,
- matryoshka_vector_dimension=dimension,
+ vector_dimension=dimension,
  )
  loop = asyncio.get_running_loop()
  await loop.run_in_executor(_executor, apply_field_vectors_partial)
- else:
- raise AttributeError("VO not found on set")
 
  async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
  field_obj = await self.get_field(
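Note on the hunk above: _apply_extracted_vectors now receives the whole sequence of ExtractedVectorsWrapper messages and resolves a vectorset per wrapper, keeping a backward-compatibility path for messages that carry no vectorset_id. An illustrative sketch of that resolution step, with simplified types instead of the real vectorset config protobufs:

# Illustrative sketch only: which vectorset an extracted-vectors message maps to.
from typing import Optional


def resolve_vectorset(message_vectorset_id: str, kb_vectorsets: dict[str, int]) -> Optional[str]:
    """Return the vectorset id to ingest into, or None when the vectors must be dropped."""
    if not message_vectorset_id:
        # Backward compatibility: legacy messages carry no vectorset id and are
        # only unambiguous when the KB has exactly one vectorset configured.
        assert len(kb_vectorsets) == 1, "can't ingest vectors from unknown vectorset"
        return next(iter(kb_vectorsets))
    if message_vectorset_id not in kb_vectorsets:
        return None  # unknown vectorset: logged and skipped
    return message_vectorset_id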
@@ -852,291 +984,6 @@ class Resource:
  self._indexer = None
  self.txn = None
 
- async def iterate_sentences(
- self, enabled_metadata: EnabledMetadata
- ) -> AsyncIterator[TrainSentence]: # pragma: no cover
- fields = await self.get_fields(force=True)
- metadata = TrainMetadata()
- userdefinedparagraphclass: dict[str, ParagraphAnnotation] = {}
- if enabled_metadata.labels:
- if self.basic is None:
- self.basic = await self.get_basic()
- if self.basic is not None:
- metadata.labels.resource.extend(self.basic.usermetadata.classifications)
- for fieldmetadata in self.basic.fieldmetadata:
- field_id = self.generate_field_id(fieldmetadata.field)
- for annotationparagraph in fieldmetadata.paragraphs:
- userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
-
- for (type_id, field_id), field in fields.items():
- fieldid = FieldID(field_type=type_id, field=field_id)
- field_key = self.generate_field_id(fieldid)
- fm = await field.get_field_metadata()
- extracted_text = None
- vo = None
- text = None
-
- if enabled_metadata.vector:
- vo = await field.get_vectors()
-
- extracted_text = await field.get_extracted_text()
-
- if fm is None:
- continue
-
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
- for subfield_metadata, splitted_metadata in fm.split_metadata.items():
- field_metadatas.append((subfield_metadata, splitted_metadata))
-
- for subfield, field_metadata in field_metadatas:
- if enabled_metadata.labels:
- metadata.labels.ClearField("field")
- metadata.labels.field.extend(field_metadata.classifications)
-
- entities: dict[str, str] = {}
- if enabled_metadata.entities:
- _update_entities_dict(entities, field_metadata)
-
- precomputed_vectors = {}
- if vo is not None:
- if subfield is not None:
- vectors = vo.split_vectors[subfield]
- base_vector_key = f"{self.uuid}/{field_key}/{subfield}"
- else:
- vectors = vo.vectors
- base_vector_key = f"{self.uuid}/{field_key}"
- for index, vector in enumerate(vectors.vectors):
- vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
- precomputed_vectors[vector_key] = vector.vector
-
- if extracted_text is not None:
- if subfield is not None:
- text = extracted_text.split_text[subfield]
- else:
- text = extracted_text.text
-
- for paragraph in field_metadata.paragraphs:
- if subfield is not None:
- paragraph_key = (
- f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
- )
- else:
- paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
-
- if enabled_metadata.labels:
- metadata.labels.ClearField("field")
- metadata.labels.paragraph.extend(paragraph.classifications)
- if paragraph_key in userdefinedparagraphclass:
- metadata.labels.paragraph.extend(
- userdefinedparagraphclass[paragraph_key].classifications
- )
-
- for index, sentence in enumerate(paragraph.sentences):
- if subfield is not None:
- sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
- else:
- sentence_key = (
- f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
- )
-
- if vo is not None:
- metadata.ClearField("vector")
- vector_tmp = precomputed_vectors.get(sentence_key)
- if vector_tmp:
- metadata.vector.extend(vector_tmp)
-
- if extracted_text is not None and text is not None:
- metadata.text = text[sentence.start : sentence.end]
-
- metadata.ClearField("entities")
- metadata.ClearField("entity_positions")
- if enabled_metadata.entities and text is not None:
- local_text = text[sentence.start : sentence.end]
- add_entities_to_metadata(entities, local_text, metadata)
-
- pb_sentence = TrainSentence()
- pb_sentence.uuid = self.uuid
- pb_sentence.field.CopyFrom(fieldid)
- pb_sentence.paragraph = paragraph_key
- pb_sentence.sentence = sentence_key
- pb_sentence.metadata.CopyFrom(metadata)
- yield pb_sentence
-
- async def iterate_paragraphs(
- self, enabled_metadata: EnabledMetadata
- ) -> AsyncIterator[TrainParagraph]:
- fields = await self.get_fields(force=True)
- metadata = TrainMetadata()
- userdefinedparagraphclass: dict[str, ParagraphAnnotation] = {}
- if enabled_metadata.labels:
- if self.basic is None:
- self.basic = await self.get_basic()
- if self.basic is not None:
- metadata.labels.resource.extend(self.basic.usermetadata.classifications)
- for fieldmetadata in self.basic.fieldmetadata:
- field_id = self.generate_field_id(fieldmetadata.field)
- for annotationparagraph in fieldmetadata.paragraphs:
- userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
-
- for (type_id, field_id), field in fields.items():
- fieldid = FieldID(field_type=type_id, field=field_id)
- field_key = self.generate_field_id(fieldid)
- fm = await field.get_field_metadata()
- extracted_text = None
- text = None
-
- extracted_text = await field.get_extracted_text()
-
- if fm is None:
- continue
-
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
- for subfield_metadata, splitted_metadata in fm.split_metadata.items():
- field_metadatas.append((subfield_metadata, splitted_metadata))
-
- for subfield, field_metadata in field_metadatas:
- if enabled_metadata.labels:
- metadata.labels.ClearField("field")
- metadata.labels.field.extend(field_metadata.classifications)
-
- entities: dict[str, str] = {}
- if enabled_metadata.entities:
- _update_entities_dict(entities, field_metadata)
-
- if extracted_text is not None:
- if subfield is not None:
- text = extracted_text.split_text[subfield]
- else:
- text = extracted_text.text
-
- for paragraph in field_metadata.paragraphs:
- if subfield is not None:
- paragraph_key = (
- f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
- )
- else:
- paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
-
- if enabled_metadata.labels:
- metadata.labels.ClearField("paragraph")
- metadata.labels.paragraph.extend(paragraph.classifications)
-
- if extracted_text is not None and text is not None:
- metadata.text = text[paragraph.start : paragraph.end]
-
- metadata.ClearField("entities")
- metadata.ClearField("entity_positions")
- if enabled_metadata.entities and text is not None:
- local_text = text[paragraph.start : paragraph.end]
- add_entities_to_metadata(entities, local_text, metadata)
-
- if paragraph_key in userdefinedparagraphclass:
- metadata.labels.paragraph.extend(
- userdefinedparagraphclass[paragraph_key].classifications
- )
-
- pb_paragraph = TrainParagraph()
- pb_paragraph.uuid = self.uuid
- pb_paragraph.field.CopyFrom(fieldid)
- pb_paragraph.paragraph = paragraph_key
- pb_paragraph.metadata.CopyFrom(metadata)
-
- yield pb_paragraph
-
- async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
- fields = await self.get_fields(force=True)
- metadata = TrainMetadata()
- if enabled_metadata.labels:
- if self.basic is None:
- self.basic = await self.get_basic()
- if self.basic is not None:
- metadata.labels.resource.extend(self.basic.usermetadata.classifications)
-
- for (type_id, field_id), field in fields.items():
- fieldid = FieldID(field_type=type_id, field=field_id)
- fm = await field.get_field_metadata()
- extracted_text = None
-
- if enabled_metadata.text:
- extracted_text = await field.get_extracted_text()
-
- if fm is None:
- continue
-
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
- for subfield_metadata, splitted_metadata in fm.split_metadata.items():
- field_metadatas.append((subfield_metadata, splitted_metadata))
-
- for subfield, splitted_metadata in field_metadatas:
- if enabled_metadata.labels:
- metadata.labels.ClearField("field")
- metadata.labels.field.extend(splitted_metadata.classifications)
-
- if extracted_text is not None:
- if subfield is not None:
- metadata.text = extracted_text.split_text[subfield]
- else:
- metadata.text = extracted_text.text
-
- if enabled_metadata.entities:
- metadata.ClearField("entities")
- _update_entities_dict(metadata.entities, splitted_metadata)
-
- pb_field = TrainField()
- pb_field.uuid = self.uuid
- pb_field.field.CopyFrom(fieldid)
- pb_field.metadata.CopyFrom(metadata)
- yield pb_field
-
- async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
- fields = await self.get_fields(force=True)
- metadata = TrainMetadata()
- if enabled_metadata.labels:
- if self.basic is None:
- self.basic = await self.get_basic()
- if self.basic is not None:
- metadata.labels.resource.extend(self.basic.usermetadata.classifications)
-
- metadata.labels.ClearField("field")
- metadata.ClearField("entities")
-
- for (_, _), field in fields.items():
- extracted_text = None
- fm = await field.get_field_metadata()
-
- if enabled_metadata.text:
- extracted_text = await field.get_extracted_text()
-
- if extracted_text is not None:
- metadata.text += extracted_text.text
- for text in extracted_text.split_text.values():
- metadata.text += f" {text}"
-
- if fm is None:
- continue
-
- field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
- for subfield_metadata, splitted_metadata in fm.split_metadata.items():
- field_metadatas.append((subfield_metadata, splitted_metadata))
-
- for _, splitted_metadata in field_metadatas:
- if enabled_metadata.labels:
- metadata.labels.field.extend(splitted_metadata.classifications)
-
- if enabled_metadata.entities:
- _update_entities_dict(metadata.entities, splitted_metadata)
-
- pb_resource = TrainResource()
- pb_resource.uuid = self.uuid
- if self.basic is not None:
- pb_resource.title = self.basic.title
- pb_resource.icon = self.basic.icon
- pb_resource.slug = self.basic.slug
- pb_resource.modified.CopyFrom(self.basic.modified)
- pb_resource.created.CopyFrom(self.basic.created)
- pb_resource.metadata.CopyFrom(metadata)
- return pb_resource
-
 
  async def get_file_page_positions(field: File) -> FilePagePositions:
  positions: FilePagePositions = {}
1181
1028
  return True
1182
1029
 
1183
1030
 
1184
- def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
1185
- for entity_key, entity_value in entities.items():
1186
- if entity_key not in local_text:
1187
- # Add the entity only if found in text
1188
- continue
1189
- metadata.entities[entity_key] = entity_value
1190
-
1191
- # Add positions for the entity relative to the local text
1192
- poskey = f"{entity_value}/{entity_key}"
1193
- metadata.entity_positions[poskey].entity = entity_key
1194
- last_occurrence_end = 0
1195
- for _ in range(local_text.count(entity_key)):
1196
- start = local_text.index(entity_key, last_occurrence_end)
1197
- end = start + len(entity_key)
1198
- metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
1199
- last_occurrence_end = end
1200
-
1201
-
1202
1031
  def maybe_update_basic_summary(basic: PBBasic, summary_text: str) -> bool:
1203
1032
  if basic.summary or not summary_text:
1204
1033
  return False
@@ -1267,23 +1096,3 @@ def extract_field_metadata_languages(
  for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
  languages.add(splitted_metadata.language)
  return list(languages)
-
-
- def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
- """
- Update the entities dict with the entities from the field metadata.
- Method created to ease the transition from legacy ner field to new entities field.
- """
- # Data Augmentation + Processor entities
- # This will overwrite entities detected from more than one data augmentation task
- # TODO: Change TrainMetadata proto to accept multiple entities with the same text
- entity_map = {
- entity.text: entity.label
- for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
- for entity in entities_wrapper.entities
- }
- target_entites_dict.update(entity_map)
-
- # Legacy processor entities
- # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
- target_entites_dict.update(field_metadata.ner)