nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to one of the supported public registries. It is provided for informational purposes only.
- migrations/0028_extracted_vectors_reference.py +61 -0
- migrations/0029_backfill_field_status.py +149 -0
- migrations/0030_label_deduplication.py +60 -0
- nucliadb/common/cluster/manager.py +41 -331
- nucliadb/common/cluster/rebalance.py +2 -2
- nucliadb/common/cluster/rollover.py +12 -71
- nucliadb/common/cluster/settings.py +3 -0
- nucliadb/common/cluster/standalone/utils.py +0 -43
- nucliadb/common/cluster/utils.py +0 -16
- nucliadb/common/counters.py +1 -0
- nucliadb/common/datamanagers/fields.py +48 -7
- nucliadb/common/datamanagers/vectorsets.py +11 -2
- nucliadb/common/external_index_providers/base.py +2 -1
- nucliadb/common/external_index_providers/pinecone.py +3 -5
- nucliadb/common/ids.py +18 -4
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +76 -37
- nucliadb/export_import/models.py +3 -3
- nucliadb/health.py +0 -7
- nucliadb/ingest/app.py +0 -8
- nucliadb/ingest/consumer/auditing.py +1 -1
- nucliadb/ingest/consumer/shard_creator.py +1 -1
- nucliadb/ingest/fields/base.py +83 -21
- nucliadb/ingest/orm/brain.py +55 -56
- nucliadb/ingest/orm/broker_message.py +12 -2
- nucliadb/ingest/orm/entities.py +6 -17
- nucliadb/ingest/orm/knowledgebox.py +44 -22
- nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +222 -413
- nucliadb/ingest/processing.py +8 -2
- nucliadb/ingest/serialize.py +77 -46
- nucliadb/ingest/service/writer.py +2 -56
- nucliadb/ingest/settings.py +1 -4
- nucliadb/learning_proxy.py +6 -4
- nucliadb/purge/__init__.py +102 -12
- nucliadb/purge/orphan_shards.py +6 -4
- nucliadb/reader/api/models.py +3 -3
- nucliadb/reader/api/v1/__init__.py +1 -0
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +23 -12
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/reader/api/v1/vectorsets.py +48 -0
- nucliadb/search/api/v1/ask.py +11 -1
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +8 -13
- nucliadb/search/api/v1/search.py +3 -2
- nucliadb/search/api/v1/suggest.py +0 -2
- nucliadb/search/predict.py +6 -4
- nucliadb/search/requesters/utils.py +1 -2
- nucliadb/search/search/chat/ask.py +77 -13
- nucliadb/search/search/chat/prompt.py +16 -5
- nucliadb/search/search/chat/query.py +74 -34
- nucliadb/search/search/exceptions.py +2 -7
- nucliadb/search/search/find.py +9 -5
- nucliadb/search/search/find_merge.py +10 -4
- nucliadb/search/search/graph_strategy.py +884 -0
- nucliadb/search/search/hydrator.py +6 -0
- nucliadb/search/search/merge.py +79 -24
- nucliadb/search/search/query.py +74 -245
- nucliadb/search/search/query_parser/exceptions.py +11 -1
- nucliadb/search/search/query_parser/fetcher.py +405 -0
- nucliadb/search/search/query_parser/models.py +0 -3
- nucliadb/search/search/query_parser/parser.py +22 -21
- nucliadb/search/search/rerankers.py +1 -42
- nucliadb/search/search/shards.py +19 -0
- nucliadb/standalone/api_router.py +2 -14
- nucliadb/standalone/settings.py +4 -0
- nucliadb/train/generators/field_streaming.py +7 -3
- nucliadb/train/lifecycle.py +3 -6
- nucliadb/train/nodes.py +14 -12
- nucliadb/train/resource.py +380 -0
- nucliadb/writer/api/constants.py +20 -16
- nucliadb/writer/api/v1/__init__.py +1 -0
- nucliadb/writer/api/v1/export_import.py +1 -1
- nucliadb/writer/api/v1/field.py +13 -7
- nucliadb/writer/api/v1/knowledgebox.py +3 -46
- nucliadb/writer/api/v1/resource.py +20 -13
- nucliadb/writer/api/v1/services.py +10 -1
- nucliadb/writer/api/v1/upload.py +61 -34
- nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
- nucliadb/writer/back_pressure.py +17 -46
- nucliadb/writer/resource/basic.py +9 -7
- nucliadb/writer/resource/field.py +42 -9
- nucliadb/writer/settings.py +2 -2
- nucliadb/writer/tus/gcs.py +11 -10
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
- nucliadb/common/cluster/discovery/base.py +0 -178
- nucliadb/common/cluster/discovery/k8s.py +0 -301
- nucliadb/common/cluster/discovery/manual.py +0 -57
- nucliadb/common/cluster/discovery/single.py +0 -51
- nucliadb/common/cluster/discovery/types.py +0 -32
- nucliadb/common/cluster/discovery/utils.py +0 -67
- nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
- nucliadb/common/cluster/standalone/index_node.py +0 -123
- nucliadb/common/cluster/standalone/service.py +0 -84
- nucliadb/standalone/introspect.py +0 -208
- nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
- /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
@@ -21,13 +21,14 @@ from __future__ import annotations
 
 import asyncio
 import logging
+from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
 
 from nucliadb.common import datamanagers
 from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
-from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
+from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
@@ -49,8 +50,8 @@ from nucliadb_protos.resources_pb2 import (
     ExtractedVectorsWrapper,
     FieldClassifications,
     FieldComputedMetadataWrapper,
+    FieldFile,
     FieldID,
-    FieldMetadata,
     FieldQuestionAnswerWrapper,
     FieldText,
     FieldType,
@@ -59,7 +60,6 @@ from nucliadb_protos.resources_pb2 import (
     LinkExtractedData,
     Metadata,
     Paragraph,
-    ParagraphAnnotation,
 )
 from nucliadb_protos.resources_pb2 import Basic as PBBasic
 from nucliadb_protos.resources_pb2 import Conversation as PBConversation
@@ -67,18 +67,11 @@ from nucliadb_protos.resources_pb2 import Extra as PBExtra
 from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
 from nucliadb_protos.resources_pb2 import Origin as PBOrigin
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
-from nucliadb_protos.train_pb2 import (
-    EnabledMetadata,
-    TrainField,
-    TrainMetadata,
-    TrainParagraph,
-    TrainResource,
-    TrainSentence,
-)
-from nucliadb_protos.train_pb2 import Position as TrainPosition
 from nucliadb_protos.utils_pb2 import Relation as PBRelation
 from nucliadb_protos.writer_pb2 import BrokerMessage
+from nucliadb_utils import const
 from nucliadb_utils.storages.storage import Storage
+from nucliadb_utils.utilities import has_feature
 
 if TYPE_CHECKING:  # pragma: no cover
     from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
@@ -226,6 +219,7 @@ class Resource:
                 page_positions=page_positions,
                 extracted_text=await field_obj.get_extracted_text(),
                 basic_user_field_metadata=user_field_metadata,
+                replace_field=True,
             )
 
         # Some basic fields are computed off field metadata.
@@ -336,39 +330,28 @@ class Resource:
             page_positions=page_positions,
             extracted_text=await field.get_extracted_text(),
             basic_user_field_metadata=user_field_metadata,
+            replace_field=reindex,
         )
 
         if self.disable_vectors is False:
-            # XXX: while we don't remove the "default" vectorset concept, we
-            # need to do use None as the default one
-            vo = await field.get_vectors()
-            if vo is not None:
-                async with datamanagers.with_ro_transaction() as ro_txn:
-                    dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
-                        ro_txn, kbid=self.kb.kbid
-                    )
-                brain.apply_field_vectors(
-                    field_key,
-                    vo,
-                    matryoshka_vector_dimension=dimension,
-                    replace_field=reindex,
-                )
-
             vectorset_configs = []
-            async
-
-
-            )
-
+            async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
+                self.txn, kbid=self.kb.kbid
+            ):
+                vectorset_configs.append(vectorset_config)
+
             for vectorset_config in vectorset_configs:
-                vo = await field.get_vectors(
+                vo = await field.get_vectors(
+                    vectorset=vectorset_config.vectorset_id,
+                    storage_key_kind=vectorset_config.storage_key_kind,
+                )
                 if vo is not None:
                     dimension = vectorset_config.vectorset_index_config.vector_dimension
                     brain.apply_field_vectors(
                         field_key,
                         vo,
                         vectorset=vectorset_config.vectorset_id,
-
+                        vector_dimension=dimension,
                         replace_field=reindex,
                     )
         return brain
@@ -501,7 +484,6 @@ class Resource:
     @processor_observer.wrap({"type": "apply_fields"})
     async def apply_fields(self, message: BrokerMessage):
         message_updated_fields = []
-
         for field, text in message.texts.items():
             fid = FieldID(field_type=FieldType.TEXT, field=field)
             await self.set_field(fid.field_type, fid.field, text)
@@ -532,14 +514,99 @@ class Resource:
             errors=message.errors, # type: ignore
         )
 
+    @processor_observer.wrap({"type": "apply_fields_status"})
+    async def apply_fields_status(self, message: BrokerMessage, updated_fields: list[FieldID]):
+        # Dictionary of all errors per field (we may have several due to DA tasks)
+        errors_by_field: dict[tuple[FieldType.ValueType, str], list[writer_pb2.Error]] = defaultdict(
+            list
+        )
+
+        # Make sure if a file is updated without errors, it ends up in errors_by_field
+        for field_id in updated_fields:
+            errors_by_field[(field_id.field_type, field_id.field)] = []
+        for fs in message.field_statuses:
+            errors_by_field[(fs.id.field_type, fs.id.field)] = []
+
+        for error in message.errors:
+            errors_by_field[(error.field_type, error.field)].append(error)
+
+        # If this message comes from the processor (not a DA worker), we clear all previous errors
+        # TODO: When generated_by is populated with DA tasks by processor, remove only related errors
+        from_processor = any((x.WhichOneof("generator") == "processor" for x in message.generated_by))
+
+        for (field_type, field), errors in errors_by_field.items():
+            field_obj = await self.get_field(field, field_type, load=False)
+            if from_processor:
+                # Create a new field status to clear all errors
+                status = writer_pb2.FieldStatus()
+            else:
+                status = await field_obj.get_status() or writer_pb2.FieldStatus()
+
+            for error in errors:
+                field_error = writer_pb2.FieldError(
+                    source_error=error,
+                )
+                field_error.created.GetCurrentTime()
+                status.errors.append(field_error)
+
+            # We infer the status for processor messages
+            if message.source == BrokerMessage.MessageSource.PROCESSOR:
+                if len(status.errors) > 0:
+                    status.status = writer_pb2.FieldStatus.Status.ERROR
+                else:
+                    status.status = writer_pb2.FieldStatus.Status.PROCESSED
+            else:
+                field_status = next(
+                    (
+                        fs.status
+                        for fs in message.field_statuses
+                        if fs.id.field_type == field_type and fs.id.field == field
+                    ),
+                    None,
+                )
+                if field_status is not None:
+                    status.status = field_status
+                # If the field was not found and the message comes from the writer, this implicitly sets the
+                # status to the default value, which is PROCESSING. This covers the case of new field creation.
+
+            await field_obj.set_status(status)
+
+    async def update_status(self):
+        field_ids = await self.get_all_field_ids(for_update=False)
+        if field_ids is None:
+            return
+        field_statuses = await datamanagers.fields.get_statuses(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, fields=field_ids.fields
+        )
+
+        # If any field is processing -> PENDING
+        if any((f.status == writer_pb2.FieldStatus.Status.PENDING for f in field_statuses)):
+            self.basic.metadata.status = PBMetadata.Status.PENDING
+        # If we have any non-DA error -> ERROR
+        elif any(
+            (
+                f.status == writer_pb2.FieldStatus.Status.ERROR
+                and any(
+                    (
+                        e.source_error.code != writer_pb2.Error.ErrorCode.DATAAUGMENTATION
+                        for e in f.errors
+                    )
+                )
+                for f in field_statuses
+            )
+        ):
+            self.basic.metadata.status = PBMetadata.Status.ERROR
+        # Otherwise (everything processed or we only have DA errors) -> PROCESSED
+        else:
+            self.basic.metadata.status = PBMetadata.Status.PROCESSED
+
     @processor_observer.wrap({"type": "apply_extracted"})
     async def apply_extracted(self, message: BrokerMessage):
-
-
-
-
-
-            errors = True
+        if not has_feature(const.Features.FIELD_STATUS):
+            field_obj: Field
+            for error in message.errors:
+                field_obj = await self.get_field(error.field, error.field_type, load=False)
+                await field_obj.set_error(error)
 
         await self.get_basic()
         if self.basic is None:
@@ -548,19 +615,29 @@ class Resource:
             previous_basic = Basic()
             previous_basic.CopyFrom(self.basic)
 
-        if errors:
-            self.basic.metadata.status = PBMetadata.Status.ERROR
-        elif errors is False and message.source is message.MessageSource.PROCESSOR:
-            self.basic.metadata.status = PBMetadata.Status.PROCESSED
-
         maybe_update_basic_icon(self.basic, get_text_field_mimetype(message))
 
         for question_answers in message.question_answers:
            await self._apply_question_answers(question_answers)
 
+        for field_id in message.delete_question_answers:
+            await self._delete_question_answers(field_id)
+
        for extracted_text in message.extracted_text:
            await self._apply_extracted_text(extracted_text)
 
+        # Update field and resource status depending on processing results
+        await self.apply_fields_status(message, self._modified_extracted_text)
+        if has_feature(const.Features.FIELD_STATUS):
+            # Compute resource status based on all fields statuses
+            await self.update_status()
+        else:
+            # Old code path, compute resource status based on the presence of errors in this BrokerMessage
+            if message.errors:
+                self.basic.metadata.status = PBMetadata.Status.ERROR
+            elif message.source is message.MessageSource.PROCESSOR:
+                self.basic.metadata.status = PBMetadata.Status.PROCESSED
+
         extracted_languages = []
 
         for link_extracted_data in message.link_extracted_data:
@@ -584,8 +661,7 @@ class Resource:
         # Upload to binary storage
         # Vector indexing
         if self.disable_vectors is False:
-
-                await self._apply_extracted_vectors(field_vectors)
+            await self._apply_extracted_vectors(message.field_vectors)
 
         # Only uploading to binary storage
         for field_large_metadata in message.field_large_metadata:
@@ -614,6 +690,10 @@ class Resource:
         field_obj = await self.get_field(field.field, field.field_type, load=False)
         await field_obj.set_question_answers(question_answers)
 
+    async def _delete_question_answers(self, field_id: FieldID):
+        field_obj = await self.get_field(field_id.field, field_id.field_type, load=False)
+        await field_obj.delete_question_answers()
+
     async def _apply_link_extracted_data(self, link_extracted_data: LinkExtractedData):
         assert self.basic is not None
         field_link: Link = await self.get_field(
@@ -679,15 +759,52 @@ class Resource:
         maybe_update_basic_icon(self.basic, file_extracted_data.icon)
         maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
 
+    async def _should_update_resource_title_from_file_metadata(self) -> bool:
+        """
+        We only want to update resource title from file metadata if the title is empty,
+        equal to the resource uuid or equal to any of the file filenames in the resource.
+        """
+        basic = await self.get_basic()
+        if basic is None:
+            return True
+        current_title = basic.title
+        if current_title == "":
+            # If the title is empty, we should update it
+            return True
+        if current_title == self.uuid:
+            # If the title is the same as the resource uuid, we should update it
+            return True
+        fields = await self.get_fields(force=True)
+        filenames = set()
+        for (field_type, _), field_obj in fields.items():
+            if field_type == FieldType.FILE:
+                field_value: Optional[FieldFile] = await field_obj.get_value()
+                if field_value is not None:
+                    if field_value.file.filename not in ("", None):
+                        filenames.add(field_value.file.filename)
+        if current_title in filenames:
+            # If the title is equal to any of the file filenames, we should update it
+            return True
+        return False
+
     async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
         """
         Update the resource title with the first file that has a title extracted.
         """
-
-
-
-
-
+        if not await self._should_update_resource_title_from_file_metadata():
+            return
+        for fed in message.file_extracted_data:
+            if fed.title == "":
+                # Skip if the extracted title is empty
+                continue
+            fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
+            logger.info(
+                "Updating resource title from file extracted data",
+                extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+            )
+            await self.update_resource_title(fed.title)
+            # Break after the first file with a title is found
+            break
 
     async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
         assert self.basic is not None
@@ -723,6 +840,7 @@ class Resource:
             page_positions=page_positions,
             extracted_text=extracted_text,
             basic_user_field_metadata=user_field_metadata,
+            replace_field=True,
         )
         loop = asyncio.get_running_loop()
         await loop.run_in_executor(_executor, apply_field_metadata)
@@ -731,55 +849,69 @@ class Resource:
 
         add_field_classifications(self.basic, field_metadata)
 
-    async def _apply_extracted_vectors(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def _apply_extracted_vectors(
+        self,
+        fields_vectors: Sequence[ExtractedVectorsWrapper],
+    ):
+        await self.get_fields(force=True)
+        vectorsets = {
+            vectorset_id: vs
+            async for vectorset_id, vs in datamanagers.vectorsets.iter(self.txn, kbid=self.kb.kbid)
+        }
+
+        for field_vectors in fields_vectors:
+            # Bw/c with extracted vectors without vectorsets
+            if not field_vectors.vectorset_id:
+                assert (
+                    len(vectorsets) == 1
+                ), "Invalid broker message, can't ingest vectors from unknown vectorset to KB with multiple vectorsets"
+                vectorset = list(vectorsets.values())[0]
 
-            field_key = self.generate_field_id(field_vectors.field)
-            if vo is not None:
-                vectorset_id = field_vectors.vectorset_id or None
-                if vectorset_id is None:
-                    dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
-                        self.txn, kbid=self.kb.kbid
-                    )
             else:
-
-                    self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
-                )
-                if config is None:
+                if field_vectors.vectorset_id not in vectorsets:
                     logger.warning(
-
+                        "Dropping extracted vectors for unknown vectorset",
+                        extra={"kbid": self.kb.kbid, "vectorset": field_vectors.vectorset_id},
                     )
-
-
-
-
+                    continue
+
+                vectorset = vectorsets[field_vectors.vectorset_id]
+
+            # Store vectors in the resource
+
+            if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
+                # skipping because field does not exist
+                logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
+                return
+
+            field_obj = await self.get_field(
+                field_vectors.field.field,
+                field_vectors.field.field_type,
+                load=False,
+            )
+            vo = await field_obj.set_vectors(
+                field_vectors, vectorset.vectorset_id, vectorset.storage_key_kind
+            )
+            if vo is None:
+                raise AttributeError("Vector object not found on set_vectors")
+
+            # Prepare vectors to be indexed
+
+            field_key = self.generate_field_id(field_vectors.field)
+            dimension = vectorset.vectorset_index_config.vector_dimension
+            if not dimension:
+                raise ValueError(f"Vector dimension not set for vectorset '{vectorset.vectorset_id}'")
 
             apply_field_vectors_partial = partial(
                 self.indexer.apply_field_vectors,
                 field_key,
                 vo,
-                vectorset=vectorset_id,
+                vectorset=vectorset.vectorset_id,
                 replace_field=True,
-
+                vector_dimension=dimension,
             )
             loop = asyncio.get_running_loop()
             await loop.run_in_executor(_executor, apply_field_vectors_partial)
-            else:
-                raise AttributeError("VO not found on set")
 
     async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
         field_obj = await self.get_field(
@@ -852,291 +984,6 @@ class Resource:
         self._indexer = None
         self.txn = None
 
-    async def iterate_sentences(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainSentence]:  # pragma: no cover
-        fields = await self.get_fields(force=True)
-        metadata = TrainMetadata()
-        userdefinedparagraphclass: dict[str, ParagraphAnnotation] = {}
-        if enabled_metadata.labels:
-            if self.basic is None:
-                self.basic = await self.get_basic()
-            if self.basic is not None:
-                metadata.labels.resource.extend(self.basic.usermetadata.classifications)
-            for fieldmetadata in self.basic.fieldmetadata:
-                field_id = self.generate_field_id(fieldmetadata.field)
-                for annotationparagraph in fieldmetadata.paragraphs:
-                    userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
-
-        for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
-            field_key = self.generate_field_id(fieldid)
-            fm = await field.get_field_metadata()
-            extracted_text = None
-            vo = None
-            text = None
-
-            if enabled_metadata.vector:
-                vo = await field.get_vectors()
-
-            extracted_text = await field.get_extracted_text()
-
-            if fm is None:
-                continue
-
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
-            for subfield_metadata, splitted_metadata in fm.split_metadata.items():
-                field_metadatas.append((subfield_metadata, splitted_metadata))
-
-            for subfield, field_metadata in field_metadatas:
-                if enabled_metadata.labels:
-                    metadata.labels.ClearField("field")
-                    metadata.labels.field.extend(field_metadata.classifications)
-
-                entities: dict[str, str] = {}
-                if enabled_metadata.entities:
-                    _update_entities_dict(entities, field_metadata)
-
-                precomputed_vectors = {}
-                if vo is not None:
-                    if subfield is not None:
-                        vectors = vo.split_vectors[subfield]
-                        base_vector_key = f"{self.uuid}/{field_key}/{subfield}"
-                    else:
-                        vectors = vo.vectors
-                        base_vector_key = f"{self.uuid}/{field_key}"
-                    for index, vector in enumerate(vectors.vectors):
-                        vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                        precomputed_vectors[vector_key] = vector.vector
-
-                if extracted_text is not None:
-                    if subfield is not None:
-                        text = extracted_text.split_text[subfield]
-                    else:
-                        text = extracted_text.text
-
-                for paragraph in field_metadata.paragraphs:
-                    if subfield is not None:
-                        paragraph_key = (
-                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                        )
-                    else:
-                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
-
-                    if enabled_metadata.labels:
-                        metadata.labels.ClearField("field")
-                        metadata.labels.paragraph.extend(paragraph.classifications)
-                        if paragraph_key in userdefinedparagraphclass:
-                            metadata.labels.paragraph.extend(
-                                userdefinedparagraphclass[paragraph_key].classifications
-                            )
-
-                    for index, sentence in enumerate(paragraph.sentences):
-                        if subfield is not None:
-                            sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
-                        else:
-                            sentence_key = (
-                                f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
-                            )
-
-                        if vo is not None:
-                            metadata.ClearField("vector")
-                            vector_tmp = precomputed_vectors.get(sentence_key)
-                            if vector_tmp:
-                                metadata.vector.extend(vector_tmp)
-
-                        if extracted_text is not None and text is not None:
-                            metadata.text = text[sentence.start : sentence.end]
-
-                        metadata.ClearField("entities")
-                        metadata.ClearField("entity_positions")
-                        if enabled_metadata.entities and text is not None:
-                            local_text = text[sentence.start : sentence.end]
-                            add_entities_to_metadata(entities, local_text, metadata)
-
-                        pb_sentence = TrainSentence()
-                        pb_sentence.uuid = self.uuid
-                        pb_sentence.field.CopyFrom(fieldid)
-                        pb_sentence.paragraph = paragraph_key
-                        pb_sentence.sentence = sentence_key
-                        pb_sentence.metadata.CopyFrom(metadata)
-                        yield pb_sentence
-
-    async def iterate_paragraphs(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainParagraph]:
-        fields = await self.get_fields(force=True)
-        metadata = TrainMetadata()
-        userdefinedparagraphclass: dict[str, ParagraphAnnotation] = {}
-        if enabled_metadata.labels:
-            if self.basic is None:
-                self.basic = await self.get_basic()
-            if self.basic is not None:
-                metadata.labels.resource.extend(self.basic.usermetadata.classifications)
-            for fieldmetadata in self.basic.fieldmetadata:
-                field_id = self.generate_field_id(fieldmetadata.field)
-                for annotationparagraph in fieldmetadata.paragraphs:
-                    userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
-
-        for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
-            field_key = self.generate_field_id(fieldid)
-            fm = await field.get_field_metadata()
-            extracted_text = None
-            text = None
-
-            extracted_text = await field.get_extracted_text()
-
-            if fm is None:
-                continue
-
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
-            for subfield_metadata, splitted_metadata in fm.split_metadata.items():
-                field_metadatas.append((subfield_metadata, splitted_metadata))
-
-            for subfield, field_metadata in field_metadatas:
-                if enabled_metadata.labels:
-                    metadata.labels.ClearField("field")
-                    metadata.labels.field.extend(field_metadata.classifications)
-
-                entities: dict[str, str] = {}
-                if enabled_metadata.entities:
-                    _update_entities_dict(entities, field_metadata)
-
-                if extracted_text is not None:
-                    if subfield is not None:
-                        text = extracted_text.split_text[subfield]
-                    else:
-                        text = extracted_text.text
-
-                for paragraph in field_metadata.paragraphs:
-                    if subfield is not None:
-                        paragraph_key = (
-                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                        )
-                    else:
-                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
-
-                    if enabled_metadata.labels:
-                        metadata.labels.ClearField("paragraph")
-                        metadata.labels.paragraph.extend(paragraph.classifications)
-
-                    if extracted_text is not None and text is not None:
-                        metadata.text = text[paragraph.start : paragraph.end]
-
-                    metadata.ClearField("entities")
-                    metadata.ClearField("entity_positions")
-                    if enabled_metadata.entities and text is not None:
-                        local_text = text[paragraph.start : paragraph.end]
-                        add_entities_to_metadata(entities, local_text, metadata)
-
-                    if paragraph_key in userdefinedparagraphclass:
-                        metadata.labels.paragraph.extend(
-                            userdefinedparagraphclass[paragraph_key].classifications
-                        )
-
-                    pb_paragraph = TrainParagraph()
-                    pb_paragraph.uuid = self.uuid
-                    pb_paragraph.field.CopyFrom(fieldid)
-                    pb_paragraph.paragraph = paragraph_key
-                    pb_paragraph.metadata.CopyFrom(metadata)
-
-                    yield pb_paragraph
-
-    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
-        fields = await self.get_fields(force=True)
-        metadata = TrainMetadata()
-        if enabled_metadata.labels:
-            if self.basic is None:
-                self.basic = await self.get_basic()
-            if self.basic is not None:
-                metadata.labels.resource.extend(self.basic.usermetadata.classifications)
-
-        for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
-            fm = await field.get_field_metadata()
-            extracted_text = None
-
-            if enabled_metadata.text:
-                extracted_text = await field.get_extracted_text()
-
-            if fm is None:
-                continue
-
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
-            for subfield_metadata, splitted_metadata in fm.split_metadata.items():
-                field_metadatas.append((subfield_metadata, splitted_metadata))
-
-            for subfield, splitted_metadata in field_metadatas:
-                if enabled_metadata.labels:
-                    metadata.labels.ClearField("field")
-                    metadata.labels.field.extend(splitted_metadata.classifications)
-
-                if extracted_text is not None:
-                    if subfield is not None:
-                        metadata.text = extracted_text.split_text[subfield]
-                    else:
-                        metadata.text = extracted_text.text
-
-                if enabled_metadata.entities:
-                    metadata.ClearField("entities")
-                    _update_entities_dict(metadata.entities, splitted_metadata)
-
-                pb_field = TrainField()
-                pb_field.uuid = self.uuid
-                pb_field.field.CopyFrom(fieldid)
-                pb_field.metadata.CopyFrom(metadata)
-                yield pb_field
-
-    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
-        fields = await self.get_fields(force=True)
-        metadata = TrainMetadata()
-        if enabled_metadata.labels:
-            if self.basic is None:
-                self.basic = await self.get_basic()
-            if self.basic is not None:
-                metadata.labels.resource.extend(self.basic.usermetadata.classifications)
-
-        metadata.labels.ClearField("field")
-        metadata.ClearField("entities")
-
-        for (_, _), field in fields.items():
-            extracted_text = None
-            fm = await field.get_field_metadata()
-
-            if enabled_metadata.text:
-                extracted_text = await field.get_extracted_text()
-
-            if extracted_text is not None:
-                metadata.text += extracted_text.text
-                for text in extracted_text.split_text.values():
-                    metadata.text += f" {text}"
-
-            if fm is None:
-                continue
-
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
-            for subfield_metadata, splitted_metadata in fm.split_metadata.items():
-                field_metadatas.append((subfield_metadata, splitted_metadata))
-
-            for _, splitted_metadata in field_metadatas:
-                if enabled_metadata.labels:
-                    metadata.labels.field.extend(splitted_metadata.classifications)
-
-                if enabled_metadata.entities:
-                    _update_entities_dict(metadata.entities, splitted_metadata)
-
-        pb_resource = TrainResource()
-        pb_resource.uuid = self.uuid
-        if self.basic is not None:
-            pb_resource.title = self.basic.title
-            pb_resource.icon = self.basic.icon
-            pb_resource.slug = self.basic.slug
-            pb_resource.modified.CopyFrom(self.basic.modified)
-            pb_resource.created.CopyFrom(self.basic.created)
-        pb_resource.metadata.CopyFrom(metadata)
-        return pb_resource
-
 
 async def get_file_page_positions(field: File) -> FilePagePositions:
     positions: FilePagePositions = {}
@@ -1181,24 +1028,6 @@ def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper
     return True
 
 
-def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
-    for entity_key, entity_value in entities.items():
-        if entity_key not in local_text:
-            # Add the entity only if found in text
-            continue
-        metadata.entities[entity_key] = entity_value
-
-        # Add positions for the entity relative to the local text
-        poskey = f"{entity_value}/{entity_key}"
-        metadata.entity_positions[poskey].entity = entity_key
-        last_occurrence_end = 0
-        for _ in range(local_text.count(entity_key)):
-            start = local_text.index(entity_key, last_occurrence_end)
-            end = start + len(entity_key)
-            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
-            last_occurrence_end = end
-
-
 def maybe_update_basic_summary(basic: PBBasic, summary_text: str) -> bool:
     if basic.summary or not summary_text:
         return False
@@ -1267,23 +1096,3 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
-
-
-def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
-    """
-    Update the entities dict with the entities from the field metadata.
-    Method created to ease the transition from legacy ner field to new entities field.
-    """
-    # Data Augmentation + Processor entities
-    # This will overwrite entities detected from more than one data augmentation task
-    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
-    entity_map = {
-        entity.text: entity.label
-        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
-        for entity in entities_wrapper.entities
-    }
-    target_entites_dict.update(entity_map)
-
-    # Legacy processor entities
-    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
-    target_entites_dict.update(field_metadata.ner)