nucliadb 6.3.5.post3980__py3-none-any.whl → 6.3.5.post3990__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,782 @@
# Copyright (C) 2021 Bosutech XXI S.L.
#
# nucliadb is offered under the AGPL v3.0 and as commercial software.
# For commercial licensing, contact us at info@nuclia.com.
#
# AGPL:
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import logging
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional

from nucliadb.common import ids
from nucliadb.ingest import logger
from nucliadb.ingest.orm.utils import compute_paragraph_key
from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_protos import utils_pb2
from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
from nucliadb_protos.noderesources_pb2 import (
    IndexRelation,
    ParagraphMetadata,
    Representation,
    ResourceID,
)
from nucliadb_protos.noderesources_pb2 import Position as TextPosition
from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
from nucliadb_protos.resources_pb2 import (
    Basic,
    ExtractedText,
    FieldAuthor,
    FieldComputedMetadata,
    Metadata,
    Origin,
    Paragraph,
    Relations,
    UserFieldMetadata,
    UserMetadata,
)
from nucliadb_protos.utils_pb2 import Relation, RelationNode

FilePagePositions = dict[int, tuple[int, int]]

METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
    Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
    Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
    Metadata.Status.PENDING: ResourceProcessingStatus.PENDING.name,
    Metadata.Status.BLOCKED: ResourceProcessingStatus.BLOCKED.name,
    Metadata.Status.EXPIRED: ResourceProcessingStatus.EXPIRED.name,
}
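
# For example, METADATA_STATUS_PB_TYPE_TO_NAME_MAP[Metadata.Status.PENDING]
# yields "PENDING". get_processing_status_tag() below uses this map to build
# the processing-status facet that _set_resource_labels attaches to resources.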
63
+
64
+
65
+ @dataclass
66
+ class ParagraphClassifications:
67
+ valid: dict[str, list[str]]
68
+ denied: dict[str, list[str]]
69
+
70
+
71
+ class ResourceBrainV2:
72
+ def __init__(self, rid: str):
73
+ self.rid = rid
74
+ self.brain: PBBrainResource = PBBrainResource(resource=ResourceID(uuid=rid))
75
+ self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
76
+
77
+ def generate_resource_indexing_metadata(
78
+ self,
79
+ basic: Basic,
80
+ user_relations: Relations,
81
+ origin: Optional[Origin],
82
+ previous_processing_status: Optional[Metadata.Status.ValueType],
83
+ security: Optional[utils_pb2.Security],
84
+ ) -> None:
85
+ self._set_resource_status(basic, previous_processing_status)
86
+ self._set_resource_dates(basic, origin)
87
+ self._set_resource_labels(basic, origin)
88
+ self._set_resource_relations(basic, origin, user_relations)
89
+ if security is not None:
90
+ self._set_resource_security(security)
91
+
92
+ def generate_texts_index_message(
93
+ self,
94
+ field_key: str,
95
+ extracted_text: ExtractedText,
96
+ field_computed_metadata: Optional[FieldComputedMetadata],
97
+ basic_user_metadata: Optional[UserMetadata],
98
+ field_author: Optional[FieldAuthor],
99
+ replace_field: bool,
100
+ skip_index: bool,
101
+ ) -> None:
102
+ self.apply_field_text(
103
+ field_key,
104
+ extracted_text,
105
+ replace_field=replace_field,
106
+ skip_texts=skip_index,
107
+ )
108
+ self.apply_field_labels(
109
+ field_key,
110
+ field_computed_metadata,
111
+ field_author,
112
+ basic_user_metadata,
113
+ )
114
+
115
+ def apply_field_text(
116
+ self,
117
+ field_key: str,
118
+ extracted_text: ExtractedText,
119
+ replace_field: bool,
120
+ skip_texts: Optional[bool],
121
+ ):
122
+ if skip_texts is not None:
123
+ self.brain.skip_texts = skip_texts
124
+ field_text = extracted_text.text
125
+ for _, split in extracted_text.split_text.items():
126
+ field_text += f" {split} "
127
+ self.brain.texts[field_key].text = field_text
128
+
129
+ if replace_field:
130
+ ftype, fkey = field_key.split("/")
131
+ full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
132
+ self.brain.texts_to_delete.append(full_field_id)
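
    # Illustrative behavior (made-up values): for ExtractedText(text="body") with
    # split_text {"s1": "first", "s2": "second"}, field_text becomes
    # "body first  second " (each split is wrapped in spaces). The deletion id
    # comes from ids.FieldId.full(), assumed here to render as
    # "{rid}/{field_type}/{field_key}".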

    def apply_field_labels(
        self,
        field_key: str,
        field_computed_metadata: Optional[FieldComputedMetadata],
        field_author: Optional[FieldAuthor],
        basic_user_metadata: Optional[UserMetadata] = None,
    ):
        user_cancelled_labels: set[str] = (
            {
                f"{classification.labelset}/{classification.label}"
                for classification in basic_user_metadata.classifications
                if classification.cancelled_by_user
            }
            if basic_user_metadata
            else set()
        )
        labels: dict[str, set[str]] = {
            "l": set(),  # classification labels
            "e": set(),  # entities
            "mt": set(),  # mime type
            "g/da": set(),  # generated by
        }
        if field_computed_metadata is not None:
            metadatas = list(field_computed_metadata.split_metadata.values())
            metadatas.append(field_computed_metadata.metadata)
            for metadata in metadatas:
                if metadata.mime_type != "":
                    labels["mt"].add(metadata.mime_type)
                for classification in metadata.classifications:
                    label = f"{classification.labelset}/{classification.label}"
                    if label not in user_cancelled_labels:
                        labels["l"].add(label)
                use_legacy_entities = True
                for data_augmentation_task_id, entities in metadata.entities.items():
                    # If we received the entities from the processor here, we don't
                    # want to use the legacy entities
                    # TODO: Remove this when the processor doesn't use this anymore
                    if data_augmentation_task_id == "processor":
                        use_legacy_entities = False
                    for ent in entities.entities:
                        entity_text = ent.text
                        entity_label = ent.label
                        # Seems like we don't care about where the entity is in the text
                        # entity_positions = entity.positions
                        labels["e"].add(
                            f"{entity_label}/{entity_text}"
                        )  # Add data_augmentation_task_id as a prefix?
                # Legacy processor entities
                if use_legacy_entities:
                    for klass_entity in metadata.positions.keys():
                        labels["e"].add(klass_entity)

        if field_author is not None and field_author.WhichOneof("author") == "data_augmentation":
            field_type, field_id = field_key.split("/")
            da_task_id = ids.extract_data_augmentation_id(field_id)
            if da_task_id is None:  # pragma: nocover
                logger.warning(
                    "Data augmentation field id has an unexpected format! Skipping label",
                    extra={
                        "field_id": field_id,
                        "field_type": field_type,
                    },
                )
            else:
                labels["g/da"].add(da_task_id)

        self.brain.texts[field_key].labels.extend(flatten_resource_labels(labels))
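
    # Illustrative outcome, assuming flatten_resource_labels() renders each
    # (key, value) pair as a "/{key}/{value}" facet: a field with mime type
    # "text/plain", a surviving classification "topic/sports" and a processor
    # entity "CITY/Barcelona" would get the labels
    # ["/mt/text/plain", "/l/topic/sports", "/e/CITY/Barcelona"].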

    def generate_paragraphs_index_message(
        self,
        field_key: str,
        field_computed_metadata: FieldComputedMetadata,
        extracted_text: ExtractedText,
        page_positions: Optional[FilePagePositions],
        user_field_metadata: Optional[UserFieldMetadata],
        replace_field: bool,
        skip_index: Optional[bool],
    ) -> None:
        # We need to add the extracted text to the texts section of the Resource so that
        # the paragraphs can be indexed
        self.apply_field_text(
            field_key,
            extracted_text,
            replace_field=False,
            skip_texts=None,
        )
        self.apply_field_paragraphs(
            field_key,
            field_computed_metadata,
            extracted_text,
            page_positions,
            user_field_metadata,
            replace_field=replace_field,
            skip_paragraphs=skip_index,
        )

    def apply_field_paragraphs(
        self,
        field_key: str,
        field_computed_metadata: FieldComputedMetadata,
        extracted_text: ExtractedText,
        page_positions: Optional[FilePagePositions],
        user_field_metadata: Optional[UserFieldMetadata],
        replace_field: bool,
        skip_paragraphs: Optional[bool],
    ) -> None:
        if skip_paragraphs is not None:
            self.brain.skip_paragraphs = skip_paragraphs
        unique_paragraphs: set[str] = set()
        user_paragraph_classifications = self._get_paragraph_user_classifications(user_field_metadata)
        paragraph_pages = ParagraphPages(page_positions) if page_positions else None
        # Splits of the field
        for subfield, field_metadata in field_computed_metadata.split_metadata.items():
            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
            for idx, paragraph in enumerate(field_metadata.paragraphs):
                key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
                position = TextPosition(
                    index=idx,
                    start=paragraph.start,
                    end=paragraph.end,
                    start_seconds=paragraph.start_seconds,
                    end_seconds=paragraph.end_seconds,
                )
                page_with_visual = False
                if paragraph.HasField("page"):
                    position.page_number = paragraph.page.page
                    page_with_visual = paragraph.page.page_with_visual
                    position.in_page = True
                elif paragraph_pages:
                    position.page_number = paragraph_pages.get(paragraph.start)
                    position.in_page = True
                else:
                    position.in_page = False
                representation = Representation()
                if paragraph.HasField("representation"):
                    representation.file = paragraph.representation.reference_file
                    representation.is_a_table = paragraph.representation.is_a_table
                p = BrainParagraph(
                    start=paragraph.start,
                    end=paragraph.end,
                    field=field_key,
                    split=subfield,
                    index=idx,
                    repeated_in_field=is_paragraph_repeated_in_field(
                        paragraph,
                        extracted_text_str,
                        unique_paragraphs,
                    ),
                    metadata=ParagraphMetadata(
                        position=position,
                        page_with_visual=page_with_visual,
                        representation=representation,
                    ),
                )
                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
                paragraph_labels = {paragraph_kind_label}
                paragraph_labels.update(
                    f"/l/{classification.labelset}/{classification.label}"
                    for classification in paragraph.classifications
                )
                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
                paragraph_labels.difference_update(denied_classifications)
                p.labels.extend(list(paragraph_labels))
                self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

        # Main field
        extracted_text_str = extracted_text.text if extracted_text else None
        for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
            key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
            position = TextPosition(
                index=idx,
                start=paragraph.start,
                end=paragraph.end,
                start_seconds=paragraph.start_seconds,
                end_seconds=paragraph.end_seconds,
            )
            page_with_visual = False
            if paragraph.HasField("page"):
                position.page_number = paragraph.page.page
                position.in_page = True
                page_with_visual = paragraph.page.page_with_visual
            elif paragraph_pages:
                position.page_number = paragraph_pages.get(paragraph.start)
                position.in_page = True
            else:
                position.in_page = False
            representation = Representation()
            if paragraph.HasField("representation"):
                representation.file = paragraph.representation.reference_file
                representation.is_a_table = paragraph.representation.is_a_table
            p = BrainParagraph(
                start=paragraph.start,
                end=paragraph.end,
                field=field_key,
                index=idx,
                repeated_in_field=is_paragraph_repeated_in_field(
                    paragraph, extracted_text_str, unique_paragraphs
                ),
                metadata=ParagraphMetadata(
                    position=position,
                    page_with_visual=page_with_visual,
                    representation=representation,
                ),
            )
            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
            paragraph_labels = {paragraph_kind_label}
            paragraph_labels.update(
                f"/l/{classification.labelset}/{classification.label}"
                for classification in paragraph.classifications
            )
            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
            paragraph_labels.difference_update(denied_classifications)
            p.labels.extend(list(paragraph_labels))

            self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

        if replace_field:
            field_type, field_name = field_key.split("/")
            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
            self.brain.paragraphs_to_delete.append(full_field_id)
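
    # Paragraph keys follow "{rid}/{field_key}/{start}-{end}", with an extra
    # "/{subfield}" segment for splits. With made-up ids, a main-text paragraph
    # keys as "a1b2/t/text/0-120" and a split "intro" as "a1b2/t/text/intro/0-42".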

    def _get_paragraph_user_classifications(
        self, basic_user_field_metadata: Optional[UserFieldMetadata]
    ) -> ParagraphClassifications:
        pc = ParagraphClassifications(valid={}, denied={})
        if basic_user_field_metadata is None:
            return pc
        for annotated_paragraph in basic_user_field_metadata.paragraphs:
            for classification in annotated_paragraph.classifications:
                paragraph_key = compute_paragraph_key(self.rid, annotated_paragraph.key)
                classif_label = f"/l/{classification.labelset}/{classification.label}"
                if classification.cancelled_by_user:
                    pc.denied.setdefault(paragraph_key, []).append(classif_label)
                else:
                    pc.valid.setdefault(paragraph_key, []).append(classif_label)
        return pc
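
    # Example shape of the result (hypothetical values): an annotation cancelling
    # "topic/cars" and confirming "topic/bikes" on the same paragraph yields
    # ParagraphClassifications(
    #     valid={"rid/t/text/0-10": ["/l/topic/bikes"]},
    #     denied={"rid/t/text/0-10": ["/l/topic/cars"]},
    # )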

    def generate_relations_index_message(
        self,
        field_key: str,
        field_computed_metadata: Optional[FieldComputedMetadata],
        basic_user_metadata: Optional[UserMetadata],
        replace_field: bool,
    ) -> None:
        user_cancelled_labels: set[str] = (
            {
                f"{classification.labelset}/{classification.label}"
                for classification in basic_user_metadata.classifications
                if classification.cancelled_by_user
            }
            if basic_user_metadata
            else set()
        )

        field_relations = self.brain.field_relations[field_key].relations

        # Index relations that are computed by the processor
        if field_computed_metadata is not None:
            relation_node_document = RelationNode(
                value=self.brain.resource.uuid,
                ntype=RelationNode.NodeType.RESOURCE,
            )
            field_metadatas = list(field_computed_metadata.split_metadata.values())
            field_metadatas.append(field_computed_metadata.metadata)
            for field_metadata in field_metadatas:
                # Relations computed by the processor
                for relations in field_metadata.relations:
                    for relation in relations.relations:
                        index_relation = IndexRelation(relation=relation)
                        if relation.metadata.HasField("data_augmentation_task_id"):
                            index_relation.facets.append(
                                f"/g/da/{relation.metadata.data_augmentation_task_id}"
                            )
                        field_relations.append(index_relation)
                # Entities computed by the processor or ingestion agents
                base_entity_relation = Relation(
                    relation=Relation.ENTITY,
                    source=relation_node_document,
                    to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
                )
                use_legacy_entities = True
                for data_augmentation_task_id, entities in field_metadata.entities.items():
                    # If we received the entities from the processor here, we don't
                    # want to use the legacy entities
                    # TODO: Remove this when the processor doesn't use this anymore
                    if data_augmentation_task_id == "processor":
                        use_legacy_entities = False

                    for ent in entities.entities:
                        entity_text = ent.text
                        entity_label = ent.label
                        relation = Relation()
                        relation.CopyFrom(base_entity_relation)
                        relation.to.value = entity_text
                        relation.to.subtype = entity_label
                        field_relations.append(IndexRelation(relation=relation))

                # Legacy processor entities
                # TODO: Remove once the processor doesn't use this anymore and remove
                # the positions and ner fields from the message
                def _parse_entity(klass_entity: str) -> tuple[str, str]:
                    try:
                        klass, entity = klass_entity.split("/", 1)
                        return klass, entity
                    except ValueError:
                        raise AttributeError(
                            f"Entity is not prefixed with its type: {klass_entity}"
                        )

                if use_legacy_entities:
                    for klass_entity in field_metadata.positions.keys():
                        klass, entity = _parse_entity(klass_entity)
                        relation = Relation()
                        relation.CopyFrom(base_entity_relation)
                        relation.to.value = entity
                        relation.to.subtype = klass
                        field_relations.append(IndexRelation(relation=relation))

                # Relations from field to classification labels
                base_classification_relation = Relation(
                    relation=Relation.ABOUT,
                    source=relation_node_document,
                    to=RelationNode(
                        ntype=RelationNode.NodeType.LABEL,
                    ),
                )
                for classification in field_metadata.classifications:
                    label = f"{classification.labelset}/{classification.label}"
                    if label in user_cancelled_labels:
                        continue
                    relation = Relation()
                    relation.CopyFrom(base_classification_relation)
                    relation.to.value = label
                    field_relations.append(IndexRelation(relation=relation))
        if replace_field:
            self.brain.relation_fields_to_delete.append(field_key)

    def delete_field(self, field_key: str):
        ftype, fkey = field_key.split("/")
        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
        self.brain.texts_to_delete.append(full_field_id)
        self.brain.paragraphs_to_delete.append(full_field_id)
        self.brain.sentences_to_delete.append(full_field_id)
        self.brain.relation_fields_to_delete.append(field_key)

    def generate_vectors_index_message(
        self,
        field_id: str,
        vo: utils_pb2.VectorObject,
        *,
        vectorset: str,
        replace_field: bool = False,
        # cut to specific dimension if specified
        vector_dimension: Optional[int] = None,
    ):
        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
        for subfield, vectors in vo.split_vectors.items():
            _field_id = ids.FieldId(
                rid=fid.rid,
                type=fid.type,
                key=fid.key,
                subfield_id=subfield,
            )
            # For each split of this field
            for index, vector in enumerate(vectors.vectors):
                paragraph_key = ids.ParagraphId(
                    field_id=_field_id,
                    paragraph_start=vector.start_paragraph,
                    paragraph_end=vector.end_paragraph,
                )
                sentence_key = ids.VectorId(
                    field_id=_field_id,
                    index=index,
                    vector_start=vector.start,
                    vector_end=vector.end,
                )
                self._apply_field_vector(
                    field_id,
                    paragraph_key,
                    sentence_key,
                    vector,
                    vectorset=vectorset,
                    vector_dimension=vector_dimension,
                )

        _field_id = ids.FieldId(
            rid=fid.rid,
            type=fid.type,
            key=fid.key,
        )
        for index, vector in enumerate(vo.vectors.vectors):
            paragraph_key = ids.ParagraphId(
                field_id=_field_id,
                paragraph_start=vector.start_paragraph,
                paragraph_end=vector.end_paragraph,
            )
            sentence_key = ids.VectorId(
                field_id=_field_id,
                index=index,
                vector_start=vector.start,
                vector_end=vector.end,
            )
            self._apply_field_vector(
                field_id,
                paragraph_key,
                sentence_key,
                vector,
                vectorset=vectorset,
                vector_dimension=vector_dimension,
            )

        if replace_field:
            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
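
    # The keys built above are assumed to serialize (via .full()) as
    #   paragraph id: "{rid}/{type}/{key}/{start_paragraph}-{end_paragraph}"
    #   vector id:    "{rid}/{type}/{key}/{index}/{start}-{end}"
    # with the subfield id inserted after the field key for split fields.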

    def _apply_field_vector(
        self,
        field_id: str,
        paragraph_key: ids.ParagraphId,
        sentence_key: ids.VectorId,
        vector: utils_pb2.Vector,
        *,
        vectorset: str,
        # cut vectors if a specific dimension is specified
        vector_dimension: Optional[int] = None,
    ):
        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]

        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
        sentence_pb.vector.extend(vector.vector[:vector_dimension])

        # we only care about the start/end position of the paragraph for a given
        # sentence here; the key carries the sentence position
        sentence_pb.metadata.position.start = vector.start_paragraph
        sentence_pb.metadata.position.end = vector.end_paragraph

        # does it make sense to copy forward paragraph values here?
        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index

    def _set_resource_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
        """
        We purposefully overwrite what we index as a status and DO NOT reflect
        the actual status with what we index.

        This seems to be on purpose, so the frontend of the product can operate
        on 2 statuses only -- PENDING and PROCESSED.
        """
        # The value of brain.status will either be PROCESSED or PENDING
        status = basic.metadata.status
        if previous_status is not None and previous_status != Metadata.Status.PENDING:
            # Already processed once, so it stays as PROCESSED
            self.brain.status = PBBrainResource.PROCESSED
            return
        # previous_status is None or PENDING
        if status == Metadata.Status.PENDING:
            # Stays pending
            self.brain.status = PBBrainResource.PENDING
        else:
            # Means it has just been processed
            self.brain.status = PBBrainResource.PROCESSED
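
    # Decision table for the indexed status:
    #   previous_status        basic.metadata.status   indexed as
    #   PROCESSED/ERROR/other  (any)                   PROCESSED
    #   None or PENDING        PENDING                 PENDING
    #   None or PENDING        anything else           PROCESSED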

    def _set_resource_security(self, security: utils_pb2.Security):
        self.brain.security.CopyFrom(security)

    def get_processing_status_tag(self, metadata: Metadata) -> str:
        if not metadata.useful:
            return "EMPTY"
        return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]

    def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
        """
        Adds the user-defined dates to the brain object. This is at resource level
        and applies to all fields of the resource.
        """
        if basic.created.seconds > 0:
            self.brain.metadata.created.CopyFrom(basic.created)
        else:
            logging.warning(f"Basic metadata has no created field for {self.rid}")
            self.brain.metadata.created.GetCurrentTime()
        if basic.modified.seconds > 0:
            self.brain.metadata.modified.CopyFrom(basic.modified)
        else:
            if basic.created.seconds > 0:
                self.brain.metadata.modified.CopyFrom(basic.created)
            else:
                self.brain.metadata.modified.GetCurrentTime()

        if origin is not None:
            # overwrite created/modified if provided on origin
            if origin.HasField("created") and origin.created.seconds > 0:
                self.brain.metadata.created.CopyFrom(origin.created)
            if origin.HasField("modified") and origin.modified.seconds > 0:
                self.brain.metadata.modified.CopyFrom(origin.modified)
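
    # Effective precedence (highest first): origin.created/modified when set,
    # then basic.created/modified, then the current time; a missing modified
    # falls back to created before defaulting to "now".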

    def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
        """
        Adds the relations to the brain object corresponding to the user-defined
        metadata at the resource level:
        - Contributors of the document
        - Classification labels
        - Relations
        """
        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
        if origin is not None:
            # origin contributors
            for contrib in origin.colaborators:
                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
                relation = Relation(
                    relation=Relation.COLAB,
                    source=relationnodedocument,
                    to=relationnodeuser,
                )
                self.brain.field_relations["a/metadata"].relations.append(
                    IndexRelation(relation=relation)
                )

        # labels
        for classification in basic.usermetadata.classifications:
            if classification.cancelled_by_user:
                continue
            relation_node_label = RelationNode(
                value=f"{classification.labelset}/{classification.label}",
                ntype=RelationNode.NodeType.LABEL,
            )
            relation = Relation(
                relation=Relation.ABOUT,
                source=relationnodedocument,
                to=relation_node_label,
            )
            self.brain.field_relations["a/metadata"].relations.append(IndexRelation(relation=relation))

        # relations
        for relation in user_relations.relations:
            self.brain.field_relations["a/metadata"].relations.append(
                IndexRelation(relation=relation, facets=["/g/u"])
            )

        self.brain.relation_fields_to_delete.append("a/metadata")
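
    # Example (hypothetical values): a resource with collaborator "alice" and
    # label "topic/sports" indexes, under the synthetic "a/metadata" field:
    #   (RESOURCE rid) -COLAB-> (USER "alice")
    #   (RESOURCE rid) -ABOUT-> (LABEL "topic/sports")
    # plus any user-defined relations, faceted as "/g/u".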

    def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
        """
        Adds the resource-level labels to the brain object.
        These labels are user-defined in basic or origin metadata.
        """
        if origin is not None:
            if origin.source_id:
                self.labels["o"] = {origin.source_id}
            # origin tags
            for tag in origin.tags:
                self.labels["t"].add(tag)
            # origin source
            if origin.source_id != "":
                self.labels["u"].add(f"s/{origin.source_id}")

            if origin.path:
                self.labels["p"].add(origin.path.lstrip("/"))

            # origin contributors
            for contrib in origin.colaborators:
                self.labels["u"].add(f"o/{contrib}")

            for key, value in origin.metadata.items():
                self.labels["m"].add(f"{key[:255]}/{value[:255]}")

        # icon
        self.labels["n"].add(f"i/{basic.icon}")

        # processing status
        status_tag = self.get_processing_status_tag(basic.metadata)
        self.labels["n"].add(f"s/{status_tag}")

        # main language
        if basic.metadata.language:
            self.labels["s"].add(f"p/{basic.metadata.language}")

        # all languages
        for lang in basic.metadata.languages:
            self.labels["s"].add(f"s/{lang}")

        # labels
        for classification in basic.usermetadata.classifications:
            if classification.cancelled_by_user:
                continue
            self.labels["l"].add(f"{classification.labelset}/{classification.label}")

        # hidden
        if basic.hidden:
            _, p1, p2 = LABEL_HIDDEN.split("/")
            self.labels[p1].add(p2)

        self.brain.ClearField("labels")
        self.brain.labels.extend(flatten_resource_labels(self.labels))
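
    # Illustrative facets (assuming flatten_resource_labels() renders each pair
    # as "/{key}/{value}"): icon "application/pdf" -> "/n/i/application/pdf",
    # status PROCESSED -> "/n/s/PROCESSED", main language "en" -> "/s/p/en",
    # origin tag "draft" -> "/t/draft"; hidden resources get the LABEL_HIDDEN facet.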


def is_paragraph_repeated_in_field(
    paragraph: Paragraph,
    extracted_text: Optional[str],
    unique_paragraphs: set[str],
) -> bool:
    if extracted_text is None:
        return False

    paragraph_text = extracted_text[paragraph.start : paragraph.end]
    if len(paragraph_text) == 0:
        return False

    if paragraph_text in unique_paragraphs:
        repeated_in_field = True
    else:
        repeated_in_field = False
        unique_paragraphs.add(paragraph_text)
    return repeated_in_field
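

# Sketch of the dedup contract (illustrative): with extracted_text "hello hello",
# Paragraph(start=0, end=5) registers "hello" and returns False; a later
# Paragraph(start=6, end=11) over the same text then returns True.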


class ParagraphPages:
    """
    Class to get the page number for a given paragraph in an optimized way.
    """

    def __init__(self, positions: FilePagePositions):
        self.positions = positions
        self._materialized = self._materialize_page_numbers(positions)

    def _materialize_page_numbers(self, positions: FilePagePositions) -> list[int]:
        page_numbers_by_index = []
        for page_number, (page_start, page_end) in positions.items():
            page_numbers_by_index.extend([page_number] * (page_end - page_start + 1))
        return page_numbers_by_index

    def get(self, paragraph_start_index: int) -> int:
        try:
            return self._materialized[paragraph_start_index]
        except IndexError:
            logger.error(
                f"Could not find a page for the given index: {paragraph_start_index}. "
                f"Page positions: {self.positions}"
            )
            if len(self._materialized) > 0:
                return self._materialized[-1]
            return 0
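

# A minimal usage sketch with made-up page positions: page 0 spans characters
# 0-99 and page 1 spans 100-249, so a paragraph's start index resolves to its
# page, clamping out-of-range indices to the last known page.
if __name__ == "__main__":
    pages = ParagraphPages({0: (0, 99), 1: (100, 249)})
    assert pages.get(42) == 0
    assert pages.get(120) == 1
    assert pages.get(9999) == 1  # out of range: logs an error, returns the last page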