nucliadb 6.4.0.post4213__py3-none-any.whl → 6.4.0.post4227__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/ingest/consumer/service.py +1 -0
- nucliadb/ingest/orm/brain_v2.py +1 -1
- nucliadb/ingest/orm/index_message.py +3 -14
- nucliadb/ingest/orm/processor/processor.py +2 -36
- nucliadb/ingest/orm/resource.py +2 -227
- nucliadb/search/search/chat/query.py +1 -0
- {nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/METADATA +6 -6
- {nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/RECORD +11 -12
- nucliadb/ingest/orm/brain.py +0 -695
- {nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/WHEEL +0 -0
- {nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/top_level.txt +0 -0
nucliadb/ingest/consumer/service.py
CHANGED
@@ -59,6 +59,7 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
 
 
 async def start_back_pressure() -> BackPressureMaterializer:
+    logger.info("Starting back pressure materializer")
     nats_manager = await start_nats_manager(
         SERVICE_NAME,
         indexing_settings.index_jetstream_servers,
nucliadb/ingest/orm/index_message.py
CHANGED
@@ -27,14 +27,12 @@ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
-from nucliadb.ingest.orm.brain_v2 import
+from nucliadb.ingest.orm.brain_v2 import ResourceBrain
 from nucliadb.ingest.orm.metrics import index_message_observer as observer
 from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
 from nucliadb_protos.writer_pb2 import BrokerMessage
-from nucliadb_utils import const
-from nucliadb_utils.utilities import has_feature
 
 
 class IndexMessageBuilder:
@@ -403,14 +401,5 @@ async def get_resource_index_message(
     """
     Get the full index message for a resource.
     """
-    if has_feature(
-        const.Features.INDEX_MESSAGE_GENERATION_V2,
-        context={
-            "kbid": resource.kb.kbid,
-        },
-    ):
-        im_builder = IndexMessageBuilder(resource)
-        return await im_builder.full(reindex=reindex)
-    else:
-        # TODO: remove this code when we remove the old index message generation
-        return (await resource.generate_index_message(reindex=reindex)).brain
+    im_builder = IndexMessageBuilder(resource)
+    return await im_builder.full(reindex=reindex)
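After this change the helper has a single code path. A caller-side sketch, under the assumption that the function stays next to IndexMessageBuilder in nucliadb/ingest/orm/index_message.py and keeps the resource and reindex parameters used above:

from nucliadb.ingest.orm.index_message import IndexMessageBuilder

async def build_full_index_message(resource, reindex: bool = False):
    # Mirrors the new body shown above: always IndexMessageBuilder,
    # no feature flag and no fallback to the old Resource.generate_index_message().
    return await IndexMessageBuilder(resource).full(reindex=reindex)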
nucliadb/ingest/orm/processor/processor.py
CHANGED
@@ -461,8 +461,8 @@ class Processor:
             source=source,
         )
 
-    @processor_observer.wrap({"type": "generate_index_message_v2"})
-    async def generate_index_message_v2(
+    @processor_observer.wrap({"type": "generate_index_message"})
+    async def generate_index_message(
         self,
         resource: Resource,
         messages: list[writer_pb2.BrokerMessage],
@@ -477,40 +477,6 @@ class Processor:
         else:  # pragma: no cover
             raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
 
-    @processor_observer.wrap({"type": "generate_index_message_v1"})
-    async def generate_index_message_v1(
-        self,
-        resource: Resource,
-        messages: list[writer_pb2.BrokerMessage],
-    ) -> PBBrainResource:
-        if any(needs_reindex(m) for m in messages):
-            # when reindexing, let's just generate full new index message
-            # TODO - This should be improved in the future as it's not optimal for very large resources:
-            # As of now, there are some API operations that require fully reindexing all the fields of a resource.
-            # An example of this is classification label changes - we need to reindex all the fields of a resource to
-            # propagate the label changes to the index.
-            resource.replace_indexer(await resource.generate_index_message(reindex=True))
-        else:
-            # TODO - Ideally we should only update the fields that have been changed in the current transaction.
-            await resource.compute_global_text()
-            await resource.compute_global_tags(resource.indexer)
-            await resource.compute_security(resource.indexer)
-        return resource.indexer.brain
-
-    async def generate_index_message(
-        self,
-        resource: Resource,
-        messages: list[writer_pb2.BrokerMessage],
-        resource_created: bool = False,
-    ) -> PBBrainResource:
-        if has_feature(
-            const.Features.INDEX_MESSAGE_GENERATION_V2,
-            context={"kbid": resource.kb.kbid},
-        ):
-            return await self.generate_index_message_v2(resource, messages, resource_created)
-        else:
-            return await self.generate_index_message_v1(resource, messages)
-
     async def external_index_delete_resource(
         self, external_index_manager: ExternalIndexManager, resource_uuid: str
     ):
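Only the renamed v2 method survives; the has_feature() check and the v1 body are gone. A self-contained sketch of what collapsing such a dispatch looks like, with illustrative names rather than the real Processor internals (the surviving method's body is not shown in this hunk):

import asyncio

class ProcessorSketch:
    async def generate_index_message(self, resource_uuid: str, reindex: bool = False) -> str:
        # Single path: previously this logic sat in generate_index_message_v2 behind
        # has_feature(INDEX_MESSAGE_GENERATION_V2), with a v1 fallback.
        return f"index message for {resource_uuid} (reindex={reindex})"

async def main() -> None:
    print(await ProcessorSketch().generate_index_message("rid-123", reindex=True))

asyncio.run(main())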
nucliadb/ingest/orm/resource.py
CHANGED
@@ -19,11 +19,9 @@
 #
 from __future__ import annotations
 
-import asyncio
 import logging
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial
 from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
 
 from nucliadb.common import datamanagers
@@ -32,12 +30,11 @@ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
-from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.fields.text import Text
-from nucliadb.ingest.orm.
+from nucliadb.ingest.orm.brain_v2 import FilePagePositions
 from nucliadb.ingest.orm.metrics import processor_observer
 from nucliadb_models import content_types
 from nucliadb_models.common import CloudLink
@@ -69,9 +66,7 @@ from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
 from nucliadb_protos.resources_pb2 import Origin as PBOrigin
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
 from nucliadb_protos.writer_pb2 import BrokerMessage
-from nucliadb_utils import const
 from nucliadb_utils.storages.storage import Storage
-from nucliadb_utils.utilities import has_feature
 
 if TYPE_CHECKING:  # pragma: no cover
     from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
@@ -121,7 +116,6 @@ class Resource:
         self.extra: Optional[PBExtra] = None
         self.security: Optional[utils_pb2.Security] = None
         self.modified: bool = False
-        self._indexer: Optional[ResourceBrain] = None
         self._modified_extracted_text: list[FieldID] = []
 
         self.txn = txn
@@ -133,15 +127,6 @@ class Resource:
         self._previous_status: Optional[Metadata.Status.ValueType] = None
         self.user_relations: Optional[PBRelations] = None
 
-    @property
-    def indexer(self) -> ResourceBrain:
-        if self._indexer is None:
-            self._indexer = ResourceBrain(rid=self.uuid)
-        return self._indexer
-
-    def replace_indexer(self, indexer: ResourceBrain) -> None:
-        self._indexer = indexer
-
     async def set_slug(self):
         basic = await self.get_basic()
         new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
@@ -159,14 +144,6 @@ class Resource:
         if basic_in_payload.HasField("metadata") and basic_in_payload.metadata.useful:
             current_basic.metadata.status = basic_in_payload.metadata.status
 
-    def has_index_message_v2_feature(self) -> bool:
-        return has_feature(
-            const.Features.INDEX_MESSAGE_GENERATION_V2,
-            context={
-                "kbid": self.kb.kbid,
-            },
-        )
-
     @processor_observer.wrap({"type": "set_basic"})
     async def set_basic(
         self,
@@ -219,30 +196,6 @@ class Resource:
             del self.basic.fieldmetadata[:]
             self.basic.fieldmetadata.extend(updated)
 
-            if not self.has_index_message_v2_feature():
-                # TODO: Remove this when we remove the old indexer is removed
-                # All modified field metadata should be indexed
-                # TODO: could be improved to only index the diff
-                for user_field_metadata in self.basic.fieldmetadata:
-                    field_id = self.generate_field_id(fieldmetadata.field)
-                    field_obj = await self.get_field(
-                        fieldmetadata.field.field, fieldmetadata.field.field_type
-                    )
-                    field_metadata = await field_obj.get_field_metadata()
-                    if field_metadata is not None:
-                        page_positions: Optional[FilePagePositions] = None
-                        if isinstance(field_obj, File):
-                            page_positions = await get_file_page_positions(field_obj)
-
-                        self.indexer.apply_field_metadata(
-                            field_id,
-                            field_metadata,
-                            page_positions=page_positions,
-                            extracted_text=await field_obj.get_extracted_text(),
-                            basic_user_field_metadata=user_field_metadata,
-                            replace_field=True,
-                        )
-
         # Some basic fields are computed off field metadata.
         # This means we need to recompute upon field deletions.
         if deleted_fields is not None and len(deleted_fields) > 0:
@@ -313,66 +266,6 @@ class Resource:
         self.modified = True
         self.user_relations = payload
 
-    @processor_observer.wrap({"type": "generate_index_message_old"})
-    async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
-        brain = ResourceBrain(rid=self.uuid)
-        basic = await self.get_basic()
-        await self.compute_security(brain)
-        await self.compute_global_tags(brain)
-        fields = await self.get_fields(force=True)
-        for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
-            await self.compute_global_text_field(fieldid, brain)
-
-            field_metadata = await field.get_field_metadata()
-            field_key = self.generate_field_id(fieldid)
-            if field_metadata is not None:
-                page_positions: Optional[FilePagePositions] = None
-                if type_id == FieldType.FILE and isinstance(field, File):
-                    page_positions = await get_file_page_positions(field)
-
-                user_field_metadata = None
-                if basic is not None:
-                    user_field_metadata = next(
-                        (
-                            fm
-                            for fm in basic.fieldmetadata
-                            if fm.field.field == field_id and fm.field.field_type == type_id
-                        ),
-                        None,
-                    )
-                brain.apply_field_metadata(
-                    field_key,
-                    field_metadata,
-                    page_positions=page_positions,
-                    extracted_text=await field.get_extracted_text(),
-                    basic_user_field_metadata=user_field_metadata,
-                    replace_field=reindex,
-                )
-
-        if self.disable_vectors is False:
-            vectorset_configs = []
-            async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
-                self.txn, kbid=self.kb.kbid
-            ):
-                vectorset_configs.append(vectorset_config)
-
-            for vectorset_config in vectorset_configs:
-                vo = await field.get_vectors(
-                    vectorset=vectorset_config.vectorset_id,
-                    storage_key_kind=vectorset_config.storage_key_kind,
-                )
-                if vo is not None:
-                    dimension = vectorset_config.vectorset_index_config.vector_dimension
-                    brain.apply_field_vectors(
-                        field_key,
-                        vo,
-                        vectorset=vectorset_config.vectorset_id,
-                        vector_dimension=dimension,
-                        replace_field=reindex,
-                    )
-        return brain
-
     # Fields
     async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
         # Get all fields
@@ -445,11 +338,6 @@ class Resource:
         if field in self.all_fields_keys:
             self.all_fields_keys.remove(field)
 
-        # TODO: Remove this when we remove the old indexer
-        if not self.has_index_message_v2_feature():
-            field_key = self.generate_field_id(FieldID(field_type=type, field=key))
-            self.indexer.delete_field(field_key=field_key)
-
         await field_obj.delete()
 
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
@@ -668,7 +556,6 @@ class Resource:
         update_basic_languages(self.basic, extracted_languages)
 
         # Upload to binary storage
-        # Vector indexing
         if self.disable_vectors is False:
             await self._apply_extracted_vectors(message.field_vectors)
 
@@ -828,38 +715,7 @@ class Resource:
             field_metadata.field.field_type,
            load=False,
        )
-
-
-        # TODO: Remove this when we remove the old indexer
-        if not self.has_index_message_v2_feature():
-            field_key = self.generate_field_id(field_metadata.field)
-
-            page_positions: Optional[FilePagePositions] = None
-            if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
-                page_positions = await get_file_page_positions(field_obj)
-
-            user_field_metadata = next(
-                (
-                    fm
-                    for fm in self.basic.fieldmetadata
-                    if fm.field.field == field_metadata.field.field
-                    and fm.field.field_type == field_metadata.field.field_type
-                ),
-                None,
-            )
-
-            extracted_text = await field_obj.get_extracted_text()
-            apply_field_metadata = partial(
-                self.indexer.apply_field_metadata,
-                field_key,
-                metadata,
-                page_positions=page_positions,
-                extracted_text=extracted_text,
-                basic_user_field_metadata=user_field_metadata,
-                replace_field=True,
-            )
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(_executor, apply_field_metadata)
+        await field_obj.set_field_metadata(field_metadata)
 
         maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
 
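With the executor-based indexing removed, applying extracted metadata reduces to persisting the protobuf on the field; indexing happens later when the index message is built. A minimal runnable sketch, with FieldStub as a stand-in for the real Field object (only set_field_metadata appears in the hunk above):

import asyncio

class FieldStub:
    """Stand-in for nucliadb's Field: just records the last metadata it was given."""
    def __init__(self) -> None:
        self.metadata = None

    async def set_field_metadata(self, field_metadata) -> None:
        self.metadata = field_metadata  # persist only; no in-process ResourceBrain work

async def apply_field_computed_metadata(field_obj: FieldStub, field_metadata) -> None:
    # Mirrors the single new line in the hunk above.
    await field_obj.set_field_metadata(field_metadata)

asyncio.run(apply_field_computed_metadata(FieldStub(), {"paragraphs": []}))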
@@ -913,27 +769,6 @@ class Resource:
             if vo is None:
                 raise AttributeError("Vector object not found on set_vectors")
 
-            if self.has_index_message_v2_feature():
-                continue
-
-            # TODO: Remove this when we remove the old indexer
-            # Prepare vectors to be indexed
-            field_key = self.generate_field_id(field_vectors.field)
-            dimension = vectorset.vectorset_index_config.vector_dimension
-            if not dimension:
-                raise ValueError(f"Vector dimension not set for vectorset '{vectorset.vectorset_id}'")
-
-            apply_field_vectors_partial = partial(
-                self.indexer.apply_field_vectors,
-                field_key,
-                vo,
-                vectorset=vectorset.vectorset_id,
-                replace_field=True,
-                vector_dimension=dimension,
-            )
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(_executor, apply_field_vectors_partial)
-
     async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
         field_obj = await self.get_field(
             field_large_metadata.field.field,
@@ -946,67 +781,7 @@ class Resource:
     def generate_field_id(self, field: FieldID) -> str:
         return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
 
-    async def compute_security(self, brain: ResourceBrain):
-        security = await self.get_security()
-        if security is None:
-            return
-        brain.set_security(security)
-
-    @processor_observer.wrap({"type": "compute_global_tags"})
-    async def compute_global_tags(self, brain: ResourceBrain):
-        origin = await self.get_origin()
-        basic = await self.get_basic()
-        user_relations = await self.get_user_relations()
-        if basic is None:
-            raise KeyError("Resource not found")
-
-        brain.set_processing_status(basic=basic, previous_status=self._previous_status)
-        brain.set_resource_metadata(basic=basic, origin=origin, user_relations=user_relations)
-        for type, field in await self.get_fields_ids(force=True):
-            fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)
-            fieldkey = self.generate_field_id(fieldid)
-            extracted_metadata = await fieldobj.get_field_metadata()
-            valid_user_field_metadata = None
-            for user_field_metadata in basic.fieldmetadata:
-                if (
-                    user_field_metadata.field.field == field
-                    and user_field_metadata.field.field_type == type
-                ):
-                    valid_user_field_metadata = user_field_metadata
-                    break
-            try:
-                generated_by = await fieldobj.generated_by()
-            except FieldAuthorNotFound:
-                generated_by = None
-            brain.apply_field_labels(
-                fieldkey,
-                extracted_metadata,
-                self.uuid,
-                generated_by,
-                basic.usermetadata,
-                valid_user_field_metadata,
-            )
-
-    @processor_observer.wrap({"type": "compute_global_text"})
-    async def compute_global_text(self):
-        for type, field in await self.get_fields_ids(force=True):
-            fieldid = FieldID(field_type=type, field=field)
-            await self.compute_global_text_field(fieldid, self.indexer)
-
-    async def compute_global_text_field(self, fieldid: FieldID, brain: ResourceBrain):
-        fieldobj = await self.get_field(fieldid.field, fieldid.field_type, load=False)
-        fieldkey = self.generate_field_id(fieldid)
-        extracted_text = await fieldobj.get_extracted_text()
-        if extracted_text is None:
-            return
-        field_text = extracted_text.text
-        for _, split in extracted_text.split_text.items():
-            field_text += f" {split} "
-        brain.apply_field_text(fieldkey, field_text, replace_field=True)
-
     def clean(self):
-        self._indexer = None
         self.txn = None
 
 
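resource.py keeps get_file_page_positions (imported by the index message builder above) and now takes the FilePagePositions alias from brain_v2 instead of the deleted brain module. An illustrative value of that structure, using the alias definition from the deleted brain.py further down (dict[int, tuple[int, int]]) and made-up page boundaries:

FilePagePositions = dict[int, tuple[int, int]]  # page number -> (start char, end char)

def example_positions() -> FilePagePositions:
    # Hypothetical three-page file: the character range covered by each page.
    return {0: (0, 999), 1: (1000, 2149), 2: (2150, 2999)}

print(example_positions())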
nucliadb/search/search/chat/query.py
CHANGED
@@ -178,6 +178,7 @@ def add_resource_filter(request: Union[FindRequest, AskRequest], resources: list
 
 def find_request_from_ask_request(item: AskRequest, query: str) -> FindRequest:
     find_request = FindRequest()
+    find_request.filter_expression = item.filter_expression
     find_request.resource_filters = item.resource_filters
     find_request.features = []
     if ChatOptions.SEMANTIC in item.features:
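The only change here is that the ask request's filter_expression is now carried over onto the FindRequest. A reduced, runnable sketch of the copy pattern using dataclass stand-ins (the real request models live in nucliadb_models):

from dataclasses import dataclass, field
from typing import Any, Optional

@dataclass
class AskRequestStub:
    filter_expression: Optional[Any] = None
    resource_filters: list[str] = field(default_factory=list)

@dataclass
class FindRequestStub:
    filter_expression: Optional[Any] = None
    resource_filters: list[str] = field(default_factory=list)

def find_request_from_ask_request(item: AskRequestStub) -> FindRequestStub:
    find_request = FindRequestStub()
    find_request.filter_expression = item.filter_expression  # the new line in this hunk
    find_request.resource_filters = item.resource_filters
    return find_request

print(find_request_from_ask_request(AskRequestStub(filter_expression={"prop": "label"})))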
{nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.4.0.post4213
+Version: 6.4.0.post4227
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.4.0.
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.
-Requires-Dist: nucliadb-protos>=6.4.0.
-Requires-Dist: nucliadb-models>=6.4.0.
-Requires-Dist: nidx-protos>=6.4.0.
+Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4227
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4227
+Requires-Dist: nucliadb-protos>=6.4.0.post4227
+Requires-Dist: nucliadb-models>=6.4.0.post4227
+Requires-Dist: nidx-protos>=6.4.0.post4227
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn[standard]
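The first-party dependency pins move in lockstep with the package version, so every sibling package is expected to report the same post-release once this wheel is installed. A small illustrative check using the standard library:

from importlib.metadata import version

for pkg in ("nucliadb", "nucliadb-utils", "nucliadb-protos", "nucliadb-models"):
    # Each should print 6.4.0.post4227 in an environment built from this wheel.
    print(pkg, version(pkg))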
{nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/RECORD
CHANGED
@@ -135,7 +135,7 @@ nucliadb/ingest/consumer/consumer.py,sha256=OgS1fr5Yo55u-XbC6zypTH1aJ562Y1vZHnPD
 nucliadb/ingest/consumer/materializer.py,sha256=tgD_rDI2twQzcz8kKNiW_L4YIth16IGh9mUfD5wiSD4,3858
 nucliadb/ingest/consumer/metrics.py,sha256=ji1l_4cKiHJthQd8YNem1ft4iMbw9KThmVvJmLcv3Xg,1075
 nucliadb/ingest/consumer/pull.py,sha256=vv1AyN0EhVgbgnZyT0D_1_IB4hWy7jPd4lAWPAOHGNc,10374
-nucliadb/ingest/consumer/service.py,sha256=
+nucliadb/ingest/consumer/service.py,sha256=GhuqlK-9Lvhzd8kBox8wOlKlJgM3W_gssKoWSfVVdoI,7897
 nucliadb/ingest/consumer/shard_creator.py,sha256=w0smEu01FU_2cjZnsfBRNqT_Ntho11X17zTMST-vKbc,4359
 nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
 nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -147,21 +147,20 @@ nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54
 nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
 nucliadb/ingest/fields/text.py,sha256=2grxo8twWbpXEd_iwUMBw9q0dWorVmlPONmY5d1ThwQ,1684
 nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/ingest/orm/
-nucliadb/ingest/orm/brain_v2.py,sha256=0OYqH9srWghajGh0l1oqTFPBh1Jtlw3ui3Qpww6IC7A,33573
+nucliadb/ingest/orm/brain_v2.py,sha256=qX81wvU-KCcEZ-hNgkQOskMOlZmdbJqDyAfe7eXbGLw,33571
 nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
 nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
 nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
-nucliadb/ingest/orm/index_message.py,sha256=
+nucliadb/ingest/orm/index_message.py,sha256=hI85nSNVChNLLdEFuEJvOt61Tsir-Gq-2_WZoayAdvk,15617
 nucliadb/ingest/orm/knowledgebox.py,sha256=_rkeTMIXMhR64gbYtZpFHoUHghV2DTJ2lUBqZsoqC_4,23898
 nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
-nucliadb/ingest/orm/resource.py,sha256=
+nucliadb/ingest/orm/resource.py,sha256=hGELQgnzK2wIWgD478bR5OiVDyAxHn6WrFSq2YuHANU,36896
 nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,2693
 nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
 nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
 nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
 nucliadb/ingest/orm/processor/pgcatalog.py,sha256=ht9_I5WlPc6sSFTY8PsxHlpjN-EsaBaChwqsLlMXwUk,3100
-nucliadb/ingest/orm/processor/processor.py,sha256=
+nucliadb/ingest/orm/processor/processor.py,sha256=jaEBwbv--WyoC8zcdxWAyF0dAzVA5crVDJl56Bqv1eI,31444
 nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
 nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
 nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -260,7 +259,7 @@ nucliadb/search/search/chat/ask.py,sha256=jYOGh2rySV4aFx_D2KlNVbPXHBsbkcy0Ve-eBS
 nucliadb/search/search/chat/exceptions.py,sha256=Siy4GXW2L7oPhIR86H3WHBhE9lkV4A4YaAszuGGUf54,1356
 nucliadb/search/search/chat/images.py,sha256=PA8VWxT5_HUGfW1ULhKTK46UBsVyINtWWqEM1ulzX1E,3095
 nucliadb/search/search/chat/prompt.py,sha256=Jnja-Ss7skgnnDY8BymVfdeYsFPnIQFL8tEvcRXTKUE,47356
-nucliadb/search/search/chat/query.py,sha256=
+nucliadb/search/search/chat/query.py,sha256=IdVPeKLUbq4hWJ81LePWdUrljeyehnIXg-Ars-37msQ,16878
 nucliadb/search/search/query_parser/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/search/search/query_parser/exceptions.py,sha256=szAOXUZ27oNY-OSa9t2hQ5HHkQQC0EX1FZz_LluJHJE,1224
 nucliadb/search/search/query_parser/fetcher.py,sha256=SkvBRDfSKmuz-QygNKLAU4AhZhhDo1dnOZmt1zA28RA,16851
@@ -369,8 +368,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.4.0.
-nucliadb-6.4.0.
-nucliadb-6.4.0.
-nucliadb-6.4.0.
-nucliadb-6.4.0.
+nucliadb-6.4.0.post4227.dist-info/METADATA,sha256=eZVBfu03hI3bpE7r2yLzoiPcUtqH82BsKHO51-dFfN0,4223
+nucliadb-6.4.0.post4227.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+nucliadb-6.4.0.post4227.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.4.0.post4227.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.4.0.post4227.dist-info/RECORD,,
nucliadb/ingest/orm/brain.py
DELETED
@@ -1,695 +0,0 @@
# Copyright (C) 2021 Bosutech XXI S.L.
#
# nucliadb is offered under the AGPL v3.0 and as commercial software.
# For commercial licensing, contact us at info@nuclia.com.
#
# AGPL:
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import logging
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional

from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
from nidx_protos.noderesources_pb2 import (
    IndexRelation,
    ParagraphMetadata,
    Representation,
    ResourceID,
)
from nidx_protos.noderesources_pb2 import Position as TextPosition
from nidx_protos.noderesources_pb2 import Resource as PBBrainResource

from nucliadb.common import ids
from nucliadb.ingest import logger
from nucliadb.ingest.orm.utils import compute_paragraph_key
from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_protos import utils_pb2
from nucliadb_protos.resources_pb2 import (
    Basic,
    ExtractedText,
    FieldAuthor,
    FieldComputedMetadata,
    FieldMetadata,
    Metadata,
    Origin,
    Paragraph,
    Relations,
    UserFieldMetadata,
    UserMetadata,
)
from nucliadb_protos.utils_pb2 import Relation, RelationNode

FilePagePositions = dict[int, tuple[int, int]]

METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
    Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
    Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
    Metadata.Status.PENDING: ResourceProcessingStatus.PENDING.name,
    Metadata.Status.BLOCKED: ResourceProcessingStatus.BLOCKED.name,
    Metadata.Status.EXPIRED: ResourceProcessingStatus.EXPIRED.name,
}


@dataclass
class ParagraphClassifications:
    valid: dict[str, list[str]]
    denied: dict[str, list[str]]


class ResourceBrain:
    def __init__(self, rid: str):
        self.rid = rid
        ridobj = ResourceID(uuid=rid)
        self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)

    def apply_field_text(self, field_key: str, text: str, replace_field: bool):
        self.brain.texts[field_key].text = text
        if replace_field:
            field_type, field_name = field_key.split("/")
            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
            self.brain.texts_to_delete.append(full_field_id)

    def _get_paragraph_user_classifications(
        self, basic_user_field_metadata: Optional[UserFieldMetadata]
    ) -> ParagraphClassifications:
        pc = ParagraphClassifications(valid={}, denied={})
        if basic_user_field_metadata is None:
            return pc
        for annotated_paragraph in basic_user_field_metadata.paragraphs:
            for classification in annotated_paragraph.classifications:
                paragraph_key = compute_paragraph_key(self.rid, annotated_paragraph.key)
                classif_label = f"/l/{classification.labelset}/{classification.label}"
                if classification.cancelled_by_user:
                    pc.denied.setdefault(paragraph_key, []).append(classif_label)
                else:
                    pc.valid.setdefault(paragraph_key, []).append(classif_label)
        return pc

    def apply_field_metadata(
        self,
        field_key: str,
        metadata: FieldComputedMetadata,
        page_positions: Optional[FilePagePositions],
        extracted_text: Optional[ExtractedText],
        basic_user_field_metadata: Optional[UserFieldMetadata] = None,
        *,
        replace_field: bool = False,
    ):
        # To check for duplicate paragraphs
        unique_paragraphs: set[str] = set()

        # Expose also user classifications
        user_paragraph_classifications = self._get_paragraph_user_classifications(
            basic_user_field_metadata
        )

        # We should set paragraphs and labels
        paragraph_pages = ParagraphPages(page_positions) if page_positions else None
        for subfield, metadata_split in metadata.split_metadata.items():
            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None

            # For each split of this field
            for index, paragraph in enumerate(metadata_split.paragraphs):
                key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"

                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
                position = TextPosition(
                    index=index,
                    start=paragraph.start,
                    end=paragraph.end,
                    start_seconds=paragraph.start_seconds,
                    end_seconds=paragraph.end_seconds,
                )
                page_with_visual = False
                if paragraph.HasField("page"):
                    position.page_number = paragraph.page.page
                    page_with_visual = paragraph.page.page_with_visual
                    position.in_page = True
                elif paragraph_pages:
                    position.page_number = paragraph_pages.get(paragraph.start)
                    position.in_page = True
                else:
                    position.in_page = False

                representation = Representation()
                if paragraph.HasField("representation"):
                    representation.file = paragraph.representation.reference_file
                    representation.is_a_table = paragraph.representation.is_a_table

                p = BrainParagraph(
                    start=paragraph.start,
                    end=paragraph.end,
                    field=field_key,
                    split=subfield,
                    index=index,
                    repeated_in_field=is_paragraph_repeated_in_field(
                        paragraph,
                        extracted_text_str,
                        unique_paragraphs,
                    ),
                    metadata=ParagraphMetadata(
                        position=position,
                        page_with_visual=page_with_visual,
                        representation=representation,
                    ),
                )
                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
                paragraph_labels = {paragraph_kind_label}
                paragraph_labels.update(
                    f"/l/{classification.labelset}/{classification.label}"
                    for classification in paragraph.classifications
                )
                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
                paragraph_labels.difference_update(denied_classifications)
                p.labels.extend(list(paragraph_labels))

                self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

        extracted_text_str = extracted_text.text if extracted_text else None
        for index, paragraph in enumerate(metadata.metadata.paragraphs):
            key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
            position = TextPosition(
                index=index,
                start=paragraph.start,
                end=paragraph.end,
                start_seconds=paragraph.start_seconds,
                end_seconds=paragraph.end_seconds,
            )
            page_with_visual = False
            if paragraph.HasField("page"):
                position.page_number = paragraph.page.page
                position.in_page = True
                page_with_visual = paragraph.page.page_with_visual
            elif paragraph_pages:
                position.page_number = paragraph_pages.get(paragraph.start)
                position.in_page = True
            else:
                position.in_page = False

            representation = Representation()
            if paragraph.HasField("representation"):
                representation.file = paragraph.representation.reference_file
                representation.is_a_table = paragraph.representation.is_a_table

            p = BrainParagraph(
                start=paragraph.start,
                end=paragraph.end,
                field=field_key,
                index=index,
                repeated_in_field=is_paragraph_repeated_in_field(
                    paragraph, extracted_text_str, unique_paragraphs
                ),
                metadata=ParagraphMetadata(
                    position=position,
                    page_with_visual=page_with_visual,
                    representation=representation,
                ),
            )
            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
            paragraph_labels = {paragraph_kind_label}
            paragraph_labels.update(
                f"/l/{classification.labelset}/{classification.label}"
                for classification in paragraph.classifications
            )
            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
            paragraph_labels.difference_update(denied_classifications)
            p.labels.extend(list(paragraph_labels))

            self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

        if replace_field:
            field_type, field_name = field_key.split("/")
            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
            self.brain.paragraphs_to_delete.append(full_field_id)

        field_relations = self.brain.field_relations[field_key].relations
        for relations in metadata.metadata.relations:
            for relation in relations.relations:
                index_relation = IndexRelation(relation=relation)
                if relation.metadata.HasField("data_augmentation_task_id"):
                    index_relation.facets.append(f"/g/da/{relation.metadata.data_augmentation_task_id}")
                field_relations.append(index_relation)

    def delete_field(self, field_key: str):
        ftype, fkey = field_key.split("/")
        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
        self.brain.texts_to_delete.append(full_field_id)
        self.brain.paragraphs_to_delete.append(full_field_id)
        self.brain.sentences_to_delete.append(full_field_id)
        self.brain.relation_fields_to_delete.append(field_key)

    def apply_field_vectors(
        self,
        field_id: str,
        vo: utils_pb2.VectorObject,
        *,
        vectorset: str,
        replace_field: bool = False,
        # cut to specific dimension if specified
        vector_dimension: Optional[int] = None,
    ):
        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
        for subfield, vectors in vo.split_vectors.items():
            _field_id = ids.FieldId(
                rid=fid.rid,
                type=fid.type,
                key=fid.key,
                subfield_id=subfield,
            )
            # For each split of this field
            for index, vector in enumerate(vectors.vectors):
                paragraph_key = ids.ParagraphId(
                    field_id=_field_id,
                    paragraph_start=vector.start_paragraph,
                    paragraph_end=vector.end_paragraph,
                )
                sentence_key = ids.VectorId(
                    field_id=_field_id,
                    index=index,
                    vector_start=vector.start,
                    vector_end=vector.end,
                )
                self._apply_field_vector(
                    field_id,
                    paragraph_key,
                    sentence_key,
                    vector,
                    vectorset=vectorset,
                    vector_dimension=vector_dimension,
                )

        _field_id = ids.FieldId(
            rid=fid.rid,
            type=fid.type,
            key=fid.key,
        )
        for index, vector in enumerate(vo.vectors.vectors):
            paragraph_key = ids.ParagraphId(
                field_id=_field_id,
                paragraph_start=vector.start_paragraph,
                paragraph_end=vector.end_paragraph,
            )
            sentence_key = ids.VectorId(
                field_id=_field_id,
                index=index,
                vector_start=vector.start,
                vector_end=vector.end,
            )
            self._apply_field_vector(
                field_id,
                paragraph_key,
                sentence_key,
                vector,
                vectorset=vectorset,
                vector_dimension=vector_dimension,
            )

        if replace_field:
            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)

    def _apply_field_vector(
        self,
        field_id: str,
        paragraph_key: ids.ParagraphId,
        sentence_key: ids.VectorId,
        vector: utils_pb2.Vector,
        *,
        vectorset: str,
        # cut vectors if a specific dimension is specified
        vector_dimension: Optional[int] = None,
    ):
        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]

        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
        sentence_pb.vector.extend(vector.vector[:vector_dimension])

        # we only care about start/stop position of the paragraph for a given sentence here
        # the key has the sentence position
        sentence_pb.metadata.position.start = vector.start_paragraph
        sentence_pb.metadata.position.end = vector.end_paragraph

        # does it make sense to copy forward paragraph values here?
        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page

        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual

        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file

        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table

        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index

    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
        """
        We purposefully overwrite what we index as a status and DO NOT reflect
        actual status with what we index.

        This seems to be is on purpose so the frontend of the product can operate
        on 2 statuses only -- PENDING and PROCESSED.
        """
        # The value of brain.status will either be PROCESSED or PENDING
        status = basic.metadata.status
        if previous_status is not None and previous_status != Metadata.Status.PENDING:
            # Already processed once, so it stays as PROCESSED
            self.brain.status = PBBrainResource.PROCESSED
            return
        # previos_status is None or PENDING
        if status == Metadata.Status.PENDING:
            # Stays in pending
            self.brain.status = PBBrainResource.PENDING
        else:
            # Means it has just been processed
            self.brain.status = PBBrainResource.PROCESSED

    def set_security(self, security: utils_pb2.Security):
        self.brain.security.CopyFrom(security)

    def get_processing_status_tag(self, metadata: Metadata) -> str:
        if not metadata.useful:
            return "EMPTY"
        return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]

    def set_resource_metadata(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
        self._set_resource_dates(basic, origin)
        self._set_resource_labels(basic, origin)
        self._set_resource_relations(basic, origin, user_relations)

    def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
        if basic.created.seconds > 0:
            self.brain.metadata.created.CopyFrom(basic.created)
        else:
            logging.warning(f"Basic metadata has no created field for {self.rid}")
            self.brain.metadata.created.GetCurrentTime()
        if basic.modified.seconds > 0:
            self.brain.metadata.modified.CopyFrom(basic.modified)
        else:
            if basic.created.seconds > 0:
                self.brain.metadata.modified.CopyFrom(basic.created)
            else:
                self.brain.metadata.modified.GetCurrentTime()

        if origin is not None:
            # overwrite created/modified if provided on origin
            if origin.HasField("created") and origin.created.seconds > 0:
                self.brain.metadata.created.CopyFrom(origin.created)
            if origin.HasField("modified") and origin.modified.seconds > 0:
                self.brain.metadata.modified.CopyFrom(origin.modified)

    def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
        if origin is not None:
            # origin contributors
            for contrib in origin.colaborators:
                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
                relation = Relation(
                    relation=Relation.COLAB,
                    source=relationnodedocument,
                    to=relationnodeuser,
                )
                self.brain.field_relations["a/metadata"].relations.append(
                    IndexRelation(relation=relation)
                )

        # labels
        for classification in basic.usermetadata.classifications:
            relation_node_label = RelationNode(
                value=f"{classification.labelset}/{classification.label}",
                ntype=RelationNode.NodeType.LABEL,
            )
            relation = Relation(
                relation=Relation.ABOUT,
                source=relationnodedocument,
                to=relation_node_label,
            )
            self.brain.field_relations["a/metadata"].relations.append(IndexRelation(relation=relation))

        # relations
        for relation in user_relations.relations:
            self.brain.field_relations["a/metadata"].relations.append(
                IndexRelation(relation=relation, facets=["/g/u"])
            )

        self.brain.relation_fields_to_delete.append("a/metadata")

    def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
        if origin is not None:
            if origin.source_id:
                self.labels["o"] = {origin.source_id}
            # origin tags
            for tag in origin.tags:
                self.labels["t"].add(tag)
            # origin source
            if origin.source_id != "":
                self.labels["u"].add(f"s/{origin.source_id}")

            if origin.path:
                self.labels["p"].add(origin.path.lstrip("/"))

            # origin contributors
            for contrib in origin.colaborators:
                self.labels["u"].add(f"o/{contrib}")

            for key, value in origin.metadata.items():
                self.labels["m"].add(f"{key[:255]}/{value[:255]}")

        # icon
        self.labels["n"].add(f"i/{basic.icon}")

        # processing status
        status_tag = self.get_processing_status_tag(basic.metadata)
        self.labels["n"].add(f"s/{status_tag}")

        # main language
        if basic.metadata.language:
            self.labels["s"].add(f"p/{basic.metadata.language}")

        # all language
        for lang in basic.metadata.languages:
            self.labels["s"].add(f"s/{lang}")

        # labels
        for classification in basic.usermetadata.classifications:
            self.labels["l"].add(f"{classification.labelset}/{classification.label}")

        # hidden
        if basic.hidden:
            _, p1, p2 = LABEL_HIDDEN.split("/")
            self.labels[p1].add(p2)

        self.brain.ClearField("labels")
        self.brain.labels.extend(flatten_resource_labels(self.labels))

    def process_field_metadata(
        self,
        field_key: str,
        metadata: FieldMetadata,
        labels: dict[str, set[str]],
        relation_node_document: RelationNode,
        user_canceled_labels: set[str],
    ):
        if metadata.mime_type != "":
            labels["mt"].add(metadata.mime_type)

        base_classification_relation = Relation(
            relation=Relation.ABOUT,
            source=relation_node_document,
            to=RelationNode(
                ntype=RelationNode.NodeType.LABEL,
            ),
        )
        for classification in metadata.classifications:
            label = f"{classification.labelset}/{classification.label}"
            if label not in user_canceled_labels:
                labels["l"].add(label)
                relation = Relation()
                relation.CopyFrom(base_classification_relation)
                relation.to.value = label
                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))

        # Data Augmentation + Processor entities
        base_entity_relation = Relation(
            relation=Relation.ENTITY,
            source=relation_node_document,
            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
        )
        use_legacy_entities = True
        for data_augmentation_task_id, entities in metadata.entities.items():
            # If we recieved the entities from the processor here, we don't want to use the legacy entities
            # TODO: Remove this when processor doesn't use this anymore
            if data_augmentation_task_id == "processor":
                use_legacy_entities = False

            for ent in entities.entities:
                entity_text = ent.text
                entity_label = ent.label
                # Seems like we don't care about where the entity is in the text
                # entity_positions = entity.positions
                labels["e"].add(
                    f"{entity_label}/{entity_text}"
                )  # Add data_augmentation_task_id as a prefix?
                relation = Relation()
                relation.CopyFrom(base_entity_relation)
                relation.to.value = entity_text
                relation.to.subtype = entity_label
                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))

        # Legacy processor entities
        # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
        def _parse_entity(klass_entity: str) -> tuple[str, str]:
            try:
                klass, entity = klass_entity.split("/", 1)
                return klass, entity
            except ValueError:
                raise AttributeError(f"Entity should be with type {klass_entity}")

        if use_legacy_entities:
            for klass_entity in metadata.positions.keys():
                labels["e"].add(klass_entity)
                klass, entity = _parse_entity(klass_entity)
                relation = Relation()
                relation.CopyFrom(base_entity_relation)
                relation.to.value = entity
                relation.to.subtype = klass
                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))

    def apply_field_labels(
        self,
        field_key: str,
        metadata: Optional[FieldComputedMetadata],
        uuid: str,
        generated_by: Optional[FieldAuthor],
        basic_user_metadata: Optional[UserMetadata] = None,
        basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
    ):
        user_canceled_labels: set[str] = set()
        if basic_user_metadata is not None:
            user_canceled_labels.update(
                f"{classification.labelset}/{classification.label}"
                for classification in basic_user_metadata.classifications
                if classification.cancelled_by_user
            )
        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
        labels: dict[str, set[str]] = {
            "l": set(),  # classification labels
            "e": set(),  # entities
            "mt": set(),  # mime type
            "g/da": set(),  # generated by
        }
        if metadata is not None:
            for meta in metadata.split_metadata.values():
                self.process_field_metadata(
                    field_key,
                    meta,
                    labels,
                    relation_node_resource,
                    user_canceled_labels,
                )
            self.process_field_metadata(
                field_key,
                metadata.metadata,
                labels,
                relation_node_resource,
                user_canceled_labels,
            )

        if basic_user_fieldmetadata is not None:
            for paragraph_annotation in basic_user_fieldmetadata.paragraphs:
                for classification in paragraph_annotation.classifications:
                    if not classification.cancelled_by_user:
                        label = f"/l/{classification.labelset}/{classification.label}"
                        # FIXME: this condition avoid adding duplicate labels
                        # while importing a kb. We shouldn't add duplicates on
                        # the first place
                        if (
                            label
                            not in self.brain.paragraphs[field_key]
                            .paragraphs[paragraph_annotation.key]
                            .labels
                        ):
                            self.brain.paragraphs[field_key].paragraphs[
                                paragraph_annotation.key
                            ].labels.append(label)

        if generated_by is not None and generated_by.WhichOneof("author") == "data_augmentation":
            field_type, field_id = field_key.split("/")
            da_task_id = ids.extract_data_augmentation_id(field_id)
            if da_task_id is None:  # pragma: nocover
                logger.warning(
                    "Data augmentation field id has an unexpected format! Skipping label",
                    extra={
                        "rid": uuid,
                        "field_id": field_id,
                    },
                )
            else:
                labels["g/da"].add(da_task_id)

        flat_labels = flatten_resource_labels(labels)
        if len(flat_labels) > 0:
            self.brain.texts[field_key].labels.extend(flat_labels)


def is_paragraph_repeated_in_field(
    paragraph: Paragraph,
    extracted_text: Optional[str],
    unique_paragraphs: set[str],
) -> bool:
    if extracted_text is None:
        return False

    paragraph_text = extracted_text[paragraph.start : paragraph.end]
    if len(paragraph_text) == 0:
        return False

    if paragraph_text in unique_paragraphs:
        repeated_in_field = True
    else:
        repeated_in_field = False
        unique_paragraphs.add(paragraph_text)
    return repeated_in_field


class ParagraphPages:
    """
    Class to get the page number for a given paragraph in an optimized way.
    """

    def __init__(self, positions: FilePagePositions):
        self.positions = positions
        self._materialized = self._materialize_page_numbers(positions)

    def _materialize_page_numbers(self, positions: FilePagePositions) -> list[int]:
        page_numbers_by_index = []
        for page_number, (page_start, page_end) in positions.items():
            page_numbers_by_index.extend([page_number] * (page_end - page_start + 1))
        return page_numbers_by_index

    def get(self, paragraph_start_index: int) -> int:
        try:
            return self._materialized[paragraph_start_index]
        except IndexError:
            logger.error(
                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"  # noqa
            )
            if len(self._materialized) > 0:
                return self._materialized[-1]
            return 0
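The deleted ParagraphPages helper is the piece most callers touched indirectly through page_positions. A worked example of its materialization logic, reduced from the code above (brain_v2 is assumed to keep an equivalent helper, which this diff does not show):

FilePagePositions = dict[int, tuple[int, int]]

def materialize_page_numbers(positions: FilePagePositions) -> list[int]:
    # One entry per character index covered by a page, as in ParagraphPages above.
    page_numbers_by_index: list[int] = []
    for page_number, (page_start, page_end) in positions.items():
        page_numbers_by_index.extend([page_number] * (page_end - page_start + 1))
    return page_numbers_by_index

positions = {0: (0, 4), 1: (5, 9)}      # two 5-character pages
index = materialize_page_numbers(positions)
assert index[3] == 0 and index[7] == 1  # a paragraph starting at char 7 falls on page 1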
{nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/WHEEL: file without changes
{nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/entry_points.txt: file without changes
{nucliadb-6.4.0.post4213.dist-info → nucliadb-6.4.0.post4227.dist-info}/top_level.txt: file without changes