nucliadb 6.4.0.post4213__py3-none-any.whl → 6.4.0.post4224__py3-none-any.whl

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -59,6 +59,7 @@ async def _exit_tasks(tasks: list[asyncio.Task]) -> None:
 
 
 async def start_back_pressure() -> BackPressureMaterializer:
+    logger.info("Starting back pressure materializer")
     nats_manager = await start_nats_manager(
         SERVICE_NAME,
         indexing_settings.index_jetstream_servers,
@@ -70,7 +70,7 @@ class ParagraphClassifications:
     denied: dict[str, list[str]]
 
 
-class ResourceBrainV2:
+class ResourceBrain:
     def __init__(self, rid: str):
         self.rid = rid
         self.brain: PBBrainResource = PBBrainResource(resource=ResourceID(uuid=rid))
@@ -27,14 +27,12 @@ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
-from nucliadb.ingest.orm.brain_v2 import ResourceBrainV2 as ResourceBrain
+from nucliadb.ingest.orm.brain_v2 import ResourceBrain
 from nucliadb.ingest.orm.metrics import index_message_observer as observer
 from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
 from nucliadb_protos.writer_pb2 import BrokerMessage
-from nucliadb_utils import const
-from nucliadb_utils.utilities import has_feature
 
 
 class IndexMessageBuilder:
@@ -403,14 +401,5 @@ async def get_resource_index_message(
     """
     Get the full index message for a resource.
     """
-    if has_feature(
-        const.Features.INDEX_MESSAGE_GENERATION_V2,
-        context={
-            "kbid": resource.kb.kbid,
-        },
-    ):
-        im_builder = IndexMessageBuilder(resource)
-        return await im_builder.full(reindex=reindex)
-    else:
-        # TODO: remove this code when we remove the old index message generation
-        return (await resource.generate_index_message(reindex=reindex)).brain
+    im_builder = IndexMessageBuilder(resource)
+    return await im_builder.full(reindex=reindex)
@@ -461,8 +461,8 @@ class Processor:
             source=source,
         )
 
-    @processor_observer.wrap({"type": "generate_index_message_v2"})
-    async def generate_index_message_v2(
+    @processor_observer.wrap({"type": "generate_index_message"})
+    async def generate_index_message(
         self,
         resource: Resource,
         messages: list[writer_pb2.BrokerMessage],
@@ -477,40 +477,6 @@ class Processor:
         else:  # pragma: no cover
             raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
 
-    @processor_observer.wrap({"type": "generate_index_message_v1"})
-    async def generate_index_message_v1(
-        self,
-        resource: Resource,
-        messages: list[writer_pb2.BrokerMessage],
-    ) -> PBBrainResource:
-        if any(needs_reindex(m) for m in messages):
-            # when reindexing, let's just generate full new index message
-            # TODO - This should be improved in the future as it's not optimal for very large resources:
-            # As of now, there are some API operations that require fully reindexing all the fields of a resource.
-            # An example of this is classification label changes - we need to reindex all the fields of a resource to
-            # propagate the label changes to the index.
-            resource.replace_indexer(await resource.generate_index_message(reindex=True))
-        else:
-            # TODO - Ideally we should only update the fields that have been changed in the current transaction.
-            await resource.compute_global_text()
-            await resource.compute_global_tags(resource.indexer)
-            await resource.compute_security(resource.indexer)
-        return resource.indexer.brain
-
-    async def generate_index_message(
-        self,
-        resource: Resource,
-        messages: list[writer_pb2.BrokerMessage],
-        resource_created: bool = False,
-    ) -> PBBrainResource:
-        if has_feature(
-            const.Features.INDEX_MESSAGE_GENERATION_V2,
-            context={"kbid": resource.kb.kbid},
-        ):
-            return await self.generate_index_message_v2(resource, messages, resource_created)
-        else:
-            return await self.generate_index_message_v1(resource, messages)
-
     async def external_index_delete_resource(
         self, external_index_manager: ExternalIndexManager, resource_uuid: str
     ):
@@ -19,11 +19,9 @@
 #
 from __future__ import annotations
 
-import asyncio
 import logging
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor
-from functools import partial
 from typing import TYPE_CHECKING, Any, Optional, Sequence, Type
 
 from nucliadb.common import datamanagers
@@ -32,12 +30,11 @@ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
-from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.fields.text import Text
-from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
+from nucliadb.ingest.orm.brain_v2 import FilePagePositions
 from nucliadb.ingest.orm.metrics import processor_observer
 from nucliadb_models import content_types
 from nucliadb_models.common import CloudLink
@@ -69,9 +66,7 @@ from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
 from nucliadb_protos.resources_pb2 import Origin as PBOrigin
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
 from nucliadb_protos.writer_pb2 import BrokerMessage
-from nucliadb_utils import const
 from nucliadb_utils.storages.storage import Storage
-from nucliadb_utils.utilities import has_feature
 
 if TYPE_CHECKING:  # pragma: no cover
     from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
@@ -121,7 +116,6 @@ class Resource:
         self.extra: Optional[PBExtra] = None
         self.security: Optional[utils_pb2.Security] = None
         self.modified: bool = False
-        self._indexer: Optional[ResourceBrain] = None
         self._modified_extracted_text: list[FieldID] = []
 
         self.txn = txn
@@ -133,15 +127,6 @@ class Resource:
         self._previous_status: Optional[Metadata.Status.ValueType] = None
         self.user_relations: Optional[PBRelations] = None
 
-    @property
-    def indexer(self) -> ResourceBrain:
-        if self._indexer is None:
-            self._indexer = ResourceBrain(rid=self.uuid)
-        return self._indexer
-
-    def replace_indexer(self, indexer: ResourceBrain) -> None:
-        self._indexer = indexer
-
     async def set_slug(self):
         basic = await self.get_basic()
         new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
@@ -159,14 +144,6 @@ class Resource:
         if basic_in_payload.HasField("metadata") and basic_in_payload.metadata.useful:
             current_basic.metadata.status = basic_in_payload.metadata.status
 
-    def has_index_message_v2_feature(self) -> bool:
-        return has_feature(
-            const.Features.INDEX_MESSAGE_GENERATION_V2,
-            context={
-                "kbid": self.kb.kbid,
-            },
-        )
-
     @processor_observer.wrap({"type": "set_basic"})
     async def set_basic(
         self,
@@ -219,30 +196,6 @@ class Resource:
             del self.basic.fieldmetadata[:]
             self.basic.fieldmetadata.extend(updated)
 
-            if not self.has_index_message_v2_feature():
-                # TODO: Remove this when we remove the old indexer is removed
-                # All modified field metadata should be indexed
-                # TODO: could be improved to only index the diff
-                for user_field_metadata in self.basic.fieldmetadata:
-                    field_id = self.generate_field_id(fieldmetadata.field)
-                    field_obj = await self.get_field(
-                        fieldmetadata.field.field, fieldmetadata.field.field_type
-                    )
-                    field_metadata = await field_obj.get_field_metadata()
-                    if field_metadata is not None:
-                        page_positions: Optional[FilePagePositions] = None
-                        if isinstance(field_obj, File):
-                            page_positions = await get_file_page_positions(field_obj)
-
-                        self.indexer.apply_field_metadata(
-                            field_id,
-                            field_metadata,
-                            page_positions=page_positions,
-                            extracted_text=await field_obj.get_extracted_text(),
-                            basic_user_field_metadata=user_field_metadata,
-                            replace_field=True,
-                        )
-
         # Some basic fields are computed off field metadata.
         # This means we need to recompute upon field deletions.
         if deleted_fields is not None and len(deleted_fields) > 0:
@@ -313,66 +266,6 @@ class Resource:
         self.modified = True
         self.user_relations = payload
 
-    @processor_observer.wrap({"type": "generate_index_message_old"})
-    async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
-        brain = ResourceBrain(rid=self.uuid)
-        basic = await self.get_basic()
-        await self.compute_security(brain)
-        await self.compute_global_tags(brain)
-        fields = await self.get_fields(force=True)
-        for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
-            await self.compute_global_text_field(fieldid, brain)
-
-            field_metadata = await field.get_field_metadata()
-            field_key = self.generate_field_id(fieldid)
-            if field_metadata is not None:
-                page_positions: Optional[FilePagePositions] = None
-                if type_id == FieldType.FILE and isinstance(field, File):
-                    page_positions = await get_file_page_positions(field)
-
-                user_field_metadata = None
-                if basic is not None:
-                    user_field_metadata = next(
-                        (
-                            fm
-                            for fm in basic.fieldmetadata
-                            if fm.field.field == field_id and fm.field.field_type == type_id
-                        ),
-                        None,
-                    )
-                brain.apply_field_metadata(
-                    field_key,
-                    field_metadata,
-                    page_positions=page_positions,
-                    extracted_text=await field.get_extracted_text(),
-                    basic_user_field_metadata=user_field_metadata,
-                    replace_field=reindex,
-                )
-
-            if self.disable_vectors is False:
-                vectorset_configs = []
-                async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
-                    self.txn, kbid=self.kb.kbid
-                ):
-                    vectorset_configs.append(vectorset_config)
-
-                for vectorset_config in vectorset_configs:
-                    vo = await field.get_vectors(
-                        vectorset=vectorset_config.vectorset_id,
-                        storage_key_kind=vectorset_config.storage_key_kind,
-                    )
-                    if vo is not None:
-                        dimension = vectorset_config.vectorset_index_config.vector_dimension
-                        brain.apply_field_vectors(
-                            field_key,
-                            vo,
-                            vectorset=vectorset_config.vectorset_id,
-                            vector_dimension=dimension,
-                            replace_field=reindex,
-                        )
-        return brain
-
     # Fields
     async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
         # Get all fields
@@ -445,11 +338,6 @@ class Resource:
         if field in self.all_fields_keys:
             self.all_fields_keys.remove(field)
 
-        # TODO: Remove this when we remove the old indexer
-        if not self.has_index_message_v2_feature():
-            field_key = self.generate_field_id(FieldID(field_type=type, field=key))
-            self.indexer.delete_field(field_key=field_key)
-
         await field_obj.delete()
 
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
@@ -668,7 +556,6 @@ class Resource:
             update_basic_languages(self.basic, extracted_languages)
 
         # Upload to binary storage
-        # Vector indexing
         if self.disable_vectors is False:
             await self._apply_extracted_vectors(message.field_vectors)
 
@@ -828,38 +715,7 @@ class Resource:
             field_metadata.field.field_type,
             load=False,
         )
-        metadata = await field_obj.set_field_metadata(field_metadata)
-
-        # TODO: Remove this when we remove the old indexer
-        if not self.has_index_message_v2_feature():
-            field_key = self.generate_field_id(field_metadata.field)
-
-            page_positions: Optional[FilePagePositions] = None
-            if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
-                page_positions = await get_file_page_positions(field_obj)
-
-            user_field_metadata = next(
-                (
-                    fm
-                    for fm in self.basic.fieldmetadata
-                    if fm.field.field == field_metadata.field.field
-                    and fm.field.field_type == field_metadata.field.field_type
-                ),
-                None,
-            )
-
-            extracted_text = await field_obj.get_extracted_text()
-            apply_field_metadata = partial(
-                self.indexer.apply_field_metadata,
-                field_key,
-                metadata,
-                page_positions=page_positions,
-                extracted_text=extracted_text,
-                basic_user_field_metadata=user_field_metadata,
-                replace_field=True,
-            )
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(_executor, apply_field_metadata)
+        await field_obj.set_field_metadata(field_metadata)
 
         maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
 
@@ -913,27 +769,6 @@ class Resource:
             if vo is None:
                 raise AttributeError("Vector object not found on set_vectors")
 
-            if self.has_index_message_v2_feature():
-                continue
-
-            # TODO: Remove this when we remove the old indexer
-            # Prepare vectors to be indexed
-            field_key = self.generate_field_id(field_vectors.field)
-            dimension = vectorset.vectorset_index_config.vector_dimension
-            if not dimension:
-                raise ValueError(f"Vector dimension not set for vectorset '{vectorset.vectorset_id}'")
-
-            apply_field_vectors_partial = partial(
-                self.indexer.apply_field_vectors,
-                field_key,
-                vo,
-                vectorset=vectorset.vectorset_id,
-                replace_field=True,
-                vector_dimension=dimension,
-            )
-            loop = asyncio.get_running_loop()
-            await loop.run_in_executor(_executor, apply_field_vectors_partial)
-
     async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
         field_obj = await self.get_field(
             field_large_metadata.field.field,
@@ -946,67 +781,7 @@ class Resource:
     def generate_field_id(self, field: FieldID) -> str:
         return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
 
-    async def compute_security(self, brain: ResourceBrain):
-        security = await self.get_security()
-        if security is None:
-            return
-        brain.set_security(security)
-
-    @processor_observer.wrap({"type": "compute_global_tags"})
-    async def compute_global_tags(self, brain: ResourceBrain):
-        origin = await self.get_origin()
-        basic = await self.get_basic()
-        user_relations = await self.get_user_relations()
-        if basic is None:
-            raise KeyError("Resource not found")
-
-        brain.set_processing_status(basic=basic, previous_status=self._previous_status)
-        brain.set_resource_metadata(basic=basic, origin=origin, user_relations=user_relations)
-        for type, field in await self.get_fields_ids(force=True):
-            fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)
-            fieldkey = self.generate_field_id(fieldid)
-            extracted_metadata = await fieldobj.get_field_metadata()
-            valid_user_field_metadata = None
-            for user_field_metadata in basic.fieldmetadata:
-                if (
-                    user_field_metadata.field.field == field
-                    and user_field_metadata.field.field_type == type
-                ):
-                    valid_user_field_metadata = user_field_metadata
-                    break
-            try:
-                generated_by = await fieldobj.generated_by()
-            except FieldAuthorNotFound:
-                generated_by = None
-            brain.apply_field_labels(
-                fieldkey,
-                extracted_metadata,
-                self.uuid,
-                generated_by,
-                basic.usermetadata,
-                valid_user_field_metadata,
-            )
-
-    @processor_observer.wrap({"type": "compute_global_text"})
-    async def compute_global_text(self):
-        for type, field in await self.get_fields_ids(force=True):
-            fieldid = FieldID(field_type=type, field=field)
-            await self.compute_global_text_field(fieldid, self.indexer)
-
-    async def compute_global_text_field(self, fieldid: FieldID, brain: ResourceBrain):
-        fieldobj = await self.get_field(fieldid.field, fieldid.field_type, load=False)
-        fieldkey = self.generate_field_id(fieldid)
-        extracted_text = await fieldobj.get_extracted_text()
-        if extracted_text is None:
-            return
-        field_text = extracted_text.text
-        for _, split in extracted_text.split_text.items():
-            field_text += f" {split} "
-        brain.apply_field_text(fieldkey, field_text, replace_field=True)
-
     def clean(self):
-        self._indexer = None
        self.txn = None
 
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.4.0.post4213
+Version: 6.4.0.post4224
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4213
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4213
-Requires-Dist: nucliadb-protos>=6.4.0.post4213
-Requires-Dist: nucliadb-models>=6.4.0.post4213
-Requires-Dist: nidx-protos>=6.4.0.post4213
+Requires-Dist: nucliadb-telemetry[all]>=6.4.0.post4224
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.4.0.post4224
+Requires-Dist: nucliadb-protos>=6.4.0.post4224
+Requires-Dist: nucliadb-models>=6.4.0.post4224
+Requires-Dist: nidx-protos>=6.4.0.post4224
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn[standard]
@@ -135,7 +135,7 @@ nucliadb/ingest/consumer/consumer.py,sha256=OgS1fr5Yo55u-XbC6zypTH1aJ562Y1vZHnPD
 nucliadb/ingest/consumer/materializer.py,sha256=tgD_rDI2twQzcz8kKNiW_L4YIth16IGh9mUfD5wiSD4,3858
 nucliadb/ingest/consumer/metrics.py,sha256=ji1l_4cKiHJthQd8YNem1ft4iMbw9KThmVvJmLcv3Xg,1075
 nucliadb/ingest/consumer/pull.py,sha256=vv1AyN0EhVgbgnZyT0D_1_IB4hWy7jPd4lAWPAOHGNc,10374
-nucliadb/ingest/consumer/service.py,sha256=mWzMQS1QkWmJNrkIahEZsn7jb8NbY9FRvPz89NeTT-4,7842
+nucliadb/ingest/consumer/service.py,sha256=GhuqlK-9Lvhzd8kBox8wOlKlJgM3W_gssKoWSfVVdoI,7897
 nucliadb/ingest/consumer/shard_creator.py,sha256=w0smEu01FU_2cjZnsfBRNqT_Ntho11X17zTMST-vKbc,4359
 nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
 nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -147,21 +147,20 @@ nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54
 nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJs,4172
 nucliadb/ingest/fields/text.py,sha256=2grxo8twWbpXEd_iwUMBw9q0dWorVmlPONmY5d1ThwQ,1684
 nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/ingest/orm/brain.py,sha256=q8iW7TWgABcGG5gIrR4SN5Flv_hOy6uuSv3VL8aWgAk,29098
-nucliadb/ingest/orm/brain_v2.py,sha256=0OYqH9srWghajGh0l1oqTFPBh1Jtlw3ui3Qpww6IC7A,33573
+nucliadb/ingest/orm/brain_v2.py,sha256=qX81wvU-KCcEZ-hNgkQOskMOlZmdbJqDyAfe7eXbGLw,33571
 nucliadb/ingest/orm/broker_message.py,sha256=XWaiZgDOz94NPOPT-hqbRr5ZkpVimUw6PjUJNftfoVw,7514
 nucliadb/ingest/orm/entities.py,sha256=kXyeF6XOpFKhEsGLcY-GLIk21Exp0cJst4XQQ9jJoug,14791
 nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
-nucliadb/ingest/orm/index_message.py,sha256=7Pl2qtqoI3b3NAjWfgoiLQktayngdsJ_NfDH0wpTJBw,16041
+nucliadb/ingest/orm/index_message.py,sha256=hI85nSNVChNLLdEFuEJvOt61Tsir-Gq-2_WZoayAdvk,15617
 nucliadb/ingest/orm/knowledgebox.py,sha256=_rkeTMIXMhR64gbYtZpFHoUHghV2DTJ2lUBqZsoqC_4,23898
 nucliadb/ingest/orm/metrics.py,sha256=OiuggTh-n3kZHA2G73NEUdIlh8c3yFrbusI88DK-Mko,1273
-nucliadb/ingest/orm/resource.py,sha256=nk-aT9mEPenmZ4blyqTbTOXCpJaSSi1AE-wTJ01V2vA,47007
+nucliadb/ingest/orm/resource.py,sha256=hGELQgnzK2wIWgD478bR5OiVDyAxHn6WrFSq2YuHANU,36896
 nucliadb/ingest/orm/utils.py,sha256=fCQRuyecgqhaY7mcBG93oaXMkzkKb9BFjOcy4-ZiSNw,2693
 nucliadb/ingest/orm/processor/__init__.py,sha256=Aqd9wCNTvggkMkCY3WvoI8spdr94Jnqk-0iq9XpLs18,922
 nucliadb/ingest/orm/processor/auditing.py,sha256=TeYhXGJRyQ7ROytbb2u8R0fIh_FYi3HgTu3S1ribY3U,4623
 nucliadb/ingest/orm/processor/data_augmentation.py,sha256=v-pj4GbBWSuO8dQyahs5UDr5ghsyfhCZDS0ftKd6ZYc,5179
 nucliadb/ingest/orm/processor/pgcatalog.py,sha256=ht9_I5WlPc6sSFTY8PsxHlpjN-EsaBaChwqsLlMXwUk,3100
-nucliadb/ingest/orm/processor/processor.py,sha256=jLGLyfj6TIXoLubmNvQvkj5K32MxDDSoktY6M8z5pVk,33149
+nucliadb/ingest/orm/processor/processor.py,sha256=jaEBwbv--WyoC8zcdxWAyF0dAzVA5crVDJl56Bqv1eI,31444
 nucliadb/ingest/orm/processor/sequence_manager.py,sha256=uqEphtI1Ir_yk9jRl2gPf7BlzzXWovbARY5MNZSBI_8,1704
 nucliadb/ingest/service/__init__.py,sha256=LHQFUkdmNBOWqBG0Md9sMMI7g5TQZ-hLAnhw6ZblrJg,2002
 nucliadb/ingest/service/exceptions.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
@@ -369,8 +368,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.4.0.post4213.dist-info/METADATA,sha256=PAI_c9PMh-wJWIS4SmAAltmQcXStRUi6tKINdrNKJRM,4223
-nucliadb-6.4.0.post4213.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
-nucliadb-6.4.0.post4213.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.4.0.post4213.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.4.0.post4213.dist-info/RECORD,,
+nucliadb-6.4.0.post4224.dist-info/METADATA,sha256=G9L1810f7GDMjI54RDmZj-ZcpBD3_duqsGRR2q3c6yY,4223
+nucliadb-6.4.0.post4224.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
+nucliadb-6.4.0.post4224.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.4.0.post4224.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.4.0.post4224.dist-info/RECORD,,
@@ -1,695 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-import logging
-from copy import deepcopy
-from dataclasses import dataclass
-from typing import Optional
-
-from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
-from nidx_protos.noderesources_pb2 import (
-    IndexRelation,
-    ParagraphMetadata,
-    Representation,
-    ResourceID,
-)
-from nidx_protos.noderesources_pb2 import Position as TextPosition
-from nidx_protos.noderesources_pb2 import Resource as PBBrainResource
-
-from nucliadb.common import ids
-from nucliadb.ingest import logger
-from nucliadb.ingest.orm.utils import compute_paragraph_key
-from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
-from nucliadb_models.metadata import ResourceProcessingStatus
-from nucliadb_protos import utils_pb2
-from nucliadb_protos.resources_pb2 import (
-    Basic,
-    ExtractedText,
-    FieldAuthor,
-    FieldComputedMetadata,
-    FieldMetadata,
-    Metadata,
-    Origin,
-    Paragraph,
-    Relations,
-    UserFieldMetadata,
-    UserMetadata,
-)
-from nucliadb_protos.utils_pb2 import Relation, RelationNode
-
-FilePagePositions = dict[int, tuple[int, int]]
-
-METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
-    Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
-    Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
-    Metadata.Status.PENDING: ResourceProcessingStatus.PENDING.name,
-    Metadata.Status.BLOCKED: ResourceProcessingStatus.BLOCKED.name,
-    Metadata.Status.EXPIRED: ResourceProcessingStatus.EXPIRED.name,
-}
-
-
-@dataclass
-class ParagraphClassifications:
-    valid: dict[str, list[str]]
-    denied: dict[str, list[str]]
-
-
-class ResourceBrain:
-    def __init__(self, rid: str):
-        self.rid = rid
-        ridobj = ResourceID(uuid=rid)
-        self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
-        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
-
-    def apply_field_text(self, field_key: str, text: str, replace_field: bool):
-        self.brain.texts[field_key].text = text
-        if replace_field:
-            field_type, field_name = field_key.split("/")
-            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
-            self.brain.texts_to_delete.append(full_field_id)
-
-    def _get_paragraph_user_classifications(
-        self, basic_user_field_metadata: Optional[UserFieldMetadata]
-    ) -> ParagraphClassifications:
-        pc = ParagraphClassifications(valid={}, denied={})
-        if basic_user_field_metadata is None:
-            return pc
-        for annotated_paragraph in basic_user_field_metadata.paragraphs:
-            for classification in annotated_paragraph.classifications:
-                paragraph_key = compute_paragraph_key(self.rid, annotated_paragraph.key)
-                classif_label = f"/l/{classification.labelset}/{classification.label}"
-                if classification.cancelled_by_user:
-                    pc.denied.setdefault(paragraph_key, []).append(classif_label)
-                else:
-                    pc.valid.setdefault(paragraph_key, []).append(classif_label)
-        return pc
-
-    def apply_field_metadata(
-        self,
-        field_key: str,
-        metadata: FieldComputedMetadata,
-        page_positions: Optional[FilePagePositions],
-        extracted_text: Optional[ExtractedText],
-        basic_user_field_metadata: Optional[UserFieldMetadata] = None,
-        *,
-        replace_field: bool = False,
-    ):
-        # To check for duplicate paragraphs
-        unique_paragraphs: set[str] = set()
-
-        # Expose also user classifications
-        user_paragraph_classifications = self._get_paragraph_user_classifications(
-            basic_user_field_metadata
-        )
-
-        # We should set paragraphs and labels
-        paragraph_pages = ParagraphPages(page_positions) if page_positions else None
-        for subfield, metadata_split in metadata.split_metadata.items():
-            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
-
-            # For each split of this field
-            for index, paragraph in enumerate(metadata_split.paragraphs):
-                key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-
-                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
-                position = TextPosition(
-                    index=index,
-                    start=paragraph.start,
-                    end=paragraph.end,
-                    start_seconds=paragraph.start_seconds,
-                    end_seconds=paragraph.end_seconds,
-                )
-                page_with_visual = False
-                if paragraph.HasField("page"):
-                    position.page_number = paragraph.page.page
-                    page_with_visual = paragraph.page.page_with_visual
-                    position.in_page = True
-                elif paragraph_pages:
-                    position.page_number = paragraph_pages.get(paragraph.start)
-                    position.in_page = True
-                else:
-                    position.in_page = False
-
-                representation = Representation()
-                if paragraph.HasField("representation"):
-                    representation.file = paragraph.representation.reference_file
-                    representation.is_a_table = paragraph.representation.is_a_table
-
-                p = BrainParagraph(
-                    start=paragraph.start,
-                    end=paragraph.end,
-                    field=field_key,
-                    split=subfield,
-                    index=index,
-                    repeated_in_field=is_paragraph_repeated_in_field(
-                        paragraph,
-                        extracted_text_str,
-                        unique_paragraphs,
-                    ),
-                    metadata=ParagraphMetadata(
-                        position=position,
-                        page_with_visual=page_with_visual,
-                        representation=representation,
-                    ),
-                )
-                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
-                paragraph_labels = {paragraph_kind_label}
-                paragraph_labels.update(
-                    f"/l/{classification.labelset}/{classification.label}"
-                    for classification in paragraph.classifications
-                )
-                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
-                paragraph_labels.difference_update(denied_classifications)
-                p.labels.extend(list(paragraph_labels))
-
-                self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
-
-        extracted_text_str = extracted_text.text if extracted_text else None
-        for index, paragraph in enumerate(metadata.metadata.paragraphs):
-            key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
-            position = TextPosition(
-                index=index,
-                start=paragraph.start,
-                end=paragraph.end,
-                start_seconds=paragraph.start_seconds,
-                end_seconds=paragraph.end_seconds,
-            )
-            page_with_visual = False
-            if paragraph.HasField("page"):
-                position.page_number = paragraph.page.page
-                position.in_page = True
-                page_with_visual = paragraph.page.page_with_visual
-            elif paragraph_pages:
-                position.page_number = paragraph_pages.get(paragraph.start)
-                position.in_page = True
-            else:
-                position.in_page = False
-
-            representation = Representation()
-            if paragraph.HasField("representation"):
-                representation.file = paragraph.representation.reference_file
-                representation.is_a_table = paragraph.representation.is_a_table
-
-            p = BrainParagraph(
-                start=paragraph.start,
-                end=paragraph.end,
-                field=field_key,
-                index=index,
-                repeated_in_field=is_paragraph_repeated_in_field(
-                    paragraph, extracted_text_str, unique_paragraphs
-                ),
-                metadata=ParagraphMetadata(
-                    position=position,
-                    page_with_visual=page_with_visual,
-                    representation=representation,
-                ),
-            )
-            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
-            paragraph_labels = {paragraph_kind_label}
-            paragraph_labels.update(
-                f"/l/{classification.labelset}/{classification.label}"
-                for classification in paragraph.classifications
-            )
-            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
-            paragraph_labels.difference_update(denied_classifications)
-            p.labels.extend(list(paragraph_labels))
-
-            self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
-
-        if replace_field:
-            field_type, field_name = field_key.split("/")
-            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
-            self.brain.paragraphs_to_delete.append(full_field_id)
-
-        field_relations = self.brain.field_relations[field_key].relations
-        for relations in metadata.metadata.relations:
-            for relation in relations.relations:
-                index_relation = IndexRelation(relation=relation)
-                if relation.metadata.HasField("data_augmentation_task_id"):
-                    index_relation.facets.append(f"/g/da/{relation.metadata.data_augmentation_task_id}")
-                field_relations.append(index_relation)
-
-    def delete_field(self, field_key: str):
-        ftype, fkey = field_key.split("/")
-        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
-        self.brain.texts_to_delete.append(full_field_id)
-        self.brain.paragraphs_to_delete.append(full_field_id)
-        self.brain.sentences_to_delete.append(full_field_id)
-        self.brain.relation_fields_to_delete.append(field_key)
-
-    def apply_field_vectors(
-        self,
-        field_id: str,
-        vo: utils_pb2.VectorObject,
-        *,
-        vectorset: str,
-        replace_field: bool = False,
-        # cut to specific dimension if specified
-        vector_dimension: Optional[int] = None,
-    ):
-        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
-        for subfield, vectors in vo.split_vectors.items():
-            _field_id = ids.FieldId(
-                rid=fid.rid,
-                type=fid.type,
-                key=fid.key,
-                subfield_id=subfield,
-            )
-            # For each split of this field
-            for index, vector in enumerate(vectors.vectors):
-                paragraph_key = ids.ParagraphId(
-                    field_id=_field_id,
-                    paragraph_start=vector.start_paragraph,
-                    paragraph_end=vector.end_paragraph,
-                )
-                sentence_key = ids.VectorId(
-                    field_id=_field_id,
-                    index=index,
-                    vector_start=vector.start,
-                    vector_end=vector.end,
-                )
-                self._apply_field_vector(
-                    field_id,
-                    paragraph_key,
-                    sentence_key,
-                    vector,
-                    vectorset=vectorset,
-                    vector_dimension=vector_dimension,
-                )
-
-        _field_id = ids.FieldId(
-            rid=fid.rid,
-            type=fid.type,
-            key=fid.key,
-        )
-        for index, vector in enumerate(vo.vectors.vectors):
-            paragraph_key = ids.ParagraphId(
-                field_id=_field_id,
-                paragraph_start=vector.start_paragraph,
-                paragraph_end=vector.end_paragraph,
-            )
-            sentence_key = ids.VectorId(
-                field_id=_field_id,
-                index=index,
-                vector_start=vector.start,
-                vector_end=vector.end,
-            )
-            self._apply_field_vector(
-                field_id,
-                paragraph_key,
-                sentence_key,
-                vector,
-                vectorset=vectorset,
-                vector_dimension=vector_dimension,
-            )
-
-        if replace_field:
-            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
-            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
-
-    def _apply_field_vector(
-        self,
-        field_id: str,
-        paragraph_key: ids.ParagraphId,
-        sentence_key: ids.VectorId,
-        vector: utils_pb2.Vector,
-        *,
-        vectorset: str,
-        # cut vectors if a specific dimension is specified
-        vector_dimension: Optional[int] = None,
-    ):
-        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
-        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
-
-        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
-        sentence_pb.vector.extend(vector.vector[:vector_dimension])
-
-        # we only care about start/stop position of the paragraph for a given sentence here
-        # the key has the sentence position
-        sentence_pb.metadata.position.start = vector.start_paragraph
-        sentence_pb.metadata.position.end = vector.end_paragraph
-
-        # does it make sense to copy forward paragraph values here?
-        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
-        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
-
-        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
-
-        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
-
-        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
-
-        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
-
-    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
-        """
-        We purposefully overwrite what we index as a status and DO NOT reflect
-        actual status with what we index.
-
-        This seems to be is on purpose so the frontend of the product can operate
-        on 2 statuses only -- PENDING and PROCESSED.
-        """
-        # The value of brain.status will either be PROCESSED or PENDING
-        status = basic.metadata.status
-        if previous_status is not None and previous_status != Metadata.Status.PENDING:
-            # Already processed once, so it stays as PROCESSED
-            self.brain.status = PBBrainResource.PROCESSED
-            return
-        # previos_status is None or PENDING
-        if status == Metadata.Status.PENDING:
-            # Stays in pending
-            self.brain.status = PBBrainResource.PENDING
-        else:
-            # Means it has just been processed
-            self.brain.status = PBBrainResource.PROCESSED
-
-    def set_security(self, security: utils_pb2.Security):
-        self.brain.security.CopyFrom(security)
-
-    def get_processing_status_tag(self, metadata: Metadata) -> str:
-        if not metadata.useful:
-            return "EMPTY"
-        return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]
-
-    def set_resource_metadata(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
-        self._set_resource_dates(basic, origin)
-        self._set_resource_labels(basic, origin)
-        self._set_resource_relations(basic, origin, user_relations)
-
-    def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
-        if basic.created.seconds > 0:
-            self.brain.metadata.created.CopyFrom(basic.created)
-        else:
-            logging.warning(f"Basic metadata has no created field for {self.rid}")
-            self.brain.metadata.created.GetCurrentTime()
-        if basic.modified.seconds > 0:
-            self.brain.metadata.modified.CopyFrom(basic.modified)
-        else:
-            if basic.created.seconds > 0:
-                self.brain.metadata.modified.CopyFrom(basic.created)
-            else:
-                self.brain.metadata.modified.GetCurrentTime()
-
-        if origin is not None:
-            # overwrite created/modified if provided on origin
-            if origin.HasField("created") and origin.created.seconds > 0:
-                self.brain.metadata.created.CopyFrom(origin.created)
-            if origin.HasField("modified") and origin.modified.seconds > 0:
-                self.brain.metadata.modified.CopyFrom(origin.modified)
-
-    def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
-        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
-        if origin is not None:
-            # origin contributors
-            for contrib in origin.colaborators:
-                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
-                relation = Relation(
-                    relation=Relation.COLAB,
-                    source=relationnodedocument,
-                    to=relationnodeuser,
-                )
-                self.brain.field_relations["a/metadata"].relations.append(
-                    IndexRelation(relation=relation)
-                )
-
-        # labels
-        for classification in basic.usermetadata.classifications:
-            relation_node_label = RelationNode(
-                value=f"{classification.labelset}/{classification.label}",
-                ntype=RelationNode.NodeType.LABEL,
-            )
-            relation = Relation(
-                relation=Relation.ABOUT,
-                source=relationnodedocument,
-                to=relation_node_label,
-            )
-            self.brain.field_relations["a/metadata"].relations.append(IndexRelation(relation=relation))
-
-        # relations
-        for relation in user_relations.relations:
-            self.brain.field_relations["a/metadata"].relations.append(
-                IndexRelation(relation=relation, facets=["/g/u"])
-            )
-
-        self.brain.relation_fields_to_delete.append("a/metadata")
-
-    def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
-        if origin is not None:
-            if origin.source_id:
-                self.labels["o"] = {origin.source_id}
-            # origin tags
-            for tag in origin.tags:
-                self.labels["t"].add(tag)
-            # origin source
-            if origin.source_id != "":
-                self.labels["u"].add(f"s/{origin.source_id}")
-
-            if origin.path:
-                self.labels["p"].add(origin.path.lstrip("/"))
-
-            # origin contributors
-            for contrib in origin.colaborators:
-                self.labels["u"].add(f"o/{contrib}")
-
-            for key, value in origin.metadata.items():
-                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
-
-        # icon
-        self.labels["n"].add(f"i/{basic.icon}")
-
-        # processing status
-        status_tag = self.get_processing_status_tag(basic.metadata)
-        self.labels["n"].add(f"s/{status_tag}")
-
-        # main language
-        if basic.metadata.language:
-            self.labels["s"].add(f"p/{basic.metadata.language}")
-
-        # all language
-        for lang in basic.metadata.languages:
-            self.labels["s"].add(f"s/{lang}")
-
-        # labels
-        for classification in basic.usermetadata.classifications:
-            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
-
-        # hidden
-        if basic.hidden:
-            _, p1, p2 = LABEL_HIDDEN.split("/")
-            self.labels[p1].add(p2)
-
-        self.brain.ClearField("labels")
-        self.brain.labels.extend(flatten_resource_labels(self.labels))
-
-    def process_field_metadata(
-        self,
-        field_key: str,
-        metadata: FieldMetadata,
-        labels: dict[str, set[str]],
-        relation_node_document: RelationNode,
-        user_canceled_labels: set[str],
-    ):
-        if metadata.mime_type != "":
-            labels["mt"].add(metadata.mime_type)
-
-        base_classification_relation = Relation(
-            relation=Relation.ABOUT,
-            source=relation_node_document,
-            to=RelationNode(
-                ntype=RelationNode.NodeType.LABEL,
-            ),
-        )
-        for classification in metadata.classifications:
-            label = f"{classification.labelset}/{classification.label}"
-            if label not in user_canceled_labels:
-                labels["l"].add(label)
-                relation = Relation()
-                relation.CopyFrom(base_classification_relation)
-                relation.to.value = label
-                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))
-
-        # Data Augmentation + Processor entities
-        base_entity_relation = Relation(
-            relation=Relation.ENTITY,
-            source=relation_node_document,
-            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
-        )
-        use_legacy_entities = True
-        for data_augmentation_task_id, entities in metadata.entities.items():
-            # If we recieved the entities from the processor here, we don't want to use the legacy entities
-            # TODO: Remove this when processor doesn't use this anymore
-            if data_augmentation_task_id == "processor":
-                use_legacy_entities = False
-
-            for ent in entities.entities:
-                entity_text = ent.text
-                entity_label = ent.label
-                # Seems like we don't care about where the entity is in the text
-                # entity_positions = entity.positions
-                labels["e"].add(
-                    f"{entity_label}/{entity_text}"
-                )  # Add data_augmentation_task_id as a prefix?
-                relation = Relation()
-                relation.CopyFrom(base_entity_relation)
-                relation.to.value = entity_text
-                relation.to.subtype = entity_label
-                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))
-
-        # Legacy processor entities
-        # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
-        def _parse_entity(klass_entity: str) -> tuple[str, str]:
-            try:
-                klass, entity = klass_entity.split("/", 1)
-                return klass, entity
-            except ValueError:
-                raise AttributeError(f"Entity should be with type {klass_entity}")
-
-        if use_legacy_entities:
-            for klass_entity in metadata.positions.keys():
-                labels["e"].add(klass_entity)
-                klass, entity = _parse_entity(klass_entity)
-                relation = Relation()
-                relation.CopyFrom(base_entity_relation)
-                relation.to.value = entity
-                relation.to.subtype = klass
-                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))
-
-    def apply_field_labels(
-        self,
-        field_key: str,
-        metadata: Optional[FieldComputedMetadata],
-        uuid: str,
-        generated_by: Optional[FieldAuthor],
-        basic_user_metadata: Optional[UserMetadata] = None,
-        basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
-    ):
-        user_canceled_labels: set[str] = set()
-        if basic_user_metadata is not None:
-            user_canceled_labels.update(
-                f"{classification.labelset}/{classification.label}"
-                for classification in basic_user_metadata.classifications
-                if classification.cancelled_by_user
-            )
-        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
-        labels: dict[str, set[str]] = {
-            "l": set(),  # classification labels
-            "e": set(),  # entities
-            "mt": set(),  # mime type
-            "g/da": set(),  # generated by
-        }
-        if metadata is not None:
-            for meta in metadata.split_metadata.values():
-                self.process_field_metadata(
-                    field_key,
-                    meta,
-                    labels,
-                    relation_node_resource,
-                    user_canceled_labels,
-                )
-            self.process_field_metadata(
-                field_key,
-                metadata.metadata,
-                labels,
-                relation_node_resource,
-                user_canceled_labels,
-            )
-
-        if basic_user_fieldmetadata is not None:
-            for paragraph_annotation in basic_user_fieldmetadata.paragraphs:
-                for classification in paragraph_annotation.classifications:
-                    if not classification.cancelled_by_user:
-                        label = f"/l/{classification.labelset}/{classification.label}"
-                        # FIXME: this condition avoid adding duplicate labels
-                        # while importing a kb. We shouldn't add duplicates on
-                        # the first place
-                        if (
-                            label
-                            not in self.brain.paragraphs[field_key]
-                            .paragraphs[paragraph_annotation.key]
-                            .labels
-                        ):
-                            self.brain.paragraphs[field_key].paragraphs[
-                                paragraph_annotation.key
-                            ].labels.append(label)
-
-        if generated_by is not None and generated_by.WhichOneof("author") == "data_augmentation":
-            field_type, field_id = field_key.split("/")
-            da_task_id = ids.extract_data_augmentation_id(field_id)
-            if da_task_id is None:  # pragma: nocover
-                logger.warning(
-                    "Data augmentation field id has an unexpected format! Skipping label",
-                    extra={
-                        "rid": uuid,
-                        "field_id": field_id,
-                    },
-                )
-            else:
-                labels["g/da"].add(da_task_id)
-
-        flat_labels = flatten_resource_labels(labels)
-        if len(flat_labels) > 0:
-            self.brain.texts[field_key].labels.extend(flat_labels)
-
-
-def is_paragraph_repeated_in_field(
-    paragraph: Paragraph,
-    extracted_text: Optional[str],
-    unique_paragraphs: set[str],
-) -> bool:
-    if extracted_text is None:
-        return False
-
-    paragraph_text = extracted_text[paragraph.start : paragraph.end]
-    if len(paragraph_text) == 0:
-        return False
-
-    if paragraph_text in unique_paragraphs:
-        repeated_in_field = True
-    else:
-        repeated_in_field = False
-        unique_paragraphs.add(paragraph_text)
-    return repeated_in_field
-
-
-class ParagraphPages:
-    """
-    Class to get the page number for a given paragraph in an optimized way.
-    """
-
-    def __init__(self, positions: FilePagePositions):
-        self.positions = positions
-        self._materialized = self._materialize_page_numbers(positions)
-
-    def _materialize_page_numbers(self, positions: FilePagePositions) -> list[int]:
-        page_numbers_by_index = []
-        for page_number, (page_start, page_end) in positions.items():
-            page_numbers_by_index.extend([page_number] * (page_end - page_start + 1))
-        return page_numbers_by_index
-
-    def get(self, paragraph_start_index: int) -> int:
-        try:
-            return self._materialized[paragraph_start_index]
-        except IndexError:
-            logger.error(
-                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"  # noqa
-            )
-            if len(self._materialized) > 0:
-                return self._materialized[-1]
-            return 0