nucliadb 6.4.0.post4210__py3-none-any.whl → 6.4.0.post4224__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -1,695 +0,0 @@
-# Copyright (C) 2021 Bosutech XXI S.L.
-#
-# nucliadb is offered under the AGPL v3.0 and as commercial software.
-# For commercial licensing, contact us at info@nuclia.com.
-#
-# AGPL:
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-import logging
-from copy import deepcopy
-from dataclasses import dataclass
-from typing import Optional
-
-from nidx_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
-from nidx_protos.noderesources_pb2 import (
-    IndexRelation,
-    ParagraphMetadata,
-    Representation,
-    ResourceID,
-)
-from nidx_protos.noderesources_pb2 import Position as TextPosition
-from nidx_protos.noderesources_pb2 import Resource as PBBrainResource
-
-from nucliadb.common import ids
-from nucliadb.ingest import logger
-from nucliadb.ingest.orm.utils import compute_paragraph_key
-from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
-from nucliadb_models.metadata import ResourceProcessingStatus
-from nucliadb_protos import utils_pb2
-from nucliadb_protos.resources_pb2 import (
-    Basic,
-    ExtractedText,
-    FieldAuthor,
-    FieldComputedMetadata,
-    FieldMetadata,
-    Metadata,
-    Origin,
-    Paragraph,
-    Relations,
-    UserFieldMetadata,
-    UserMetadata,
-)
-from nucliadb_protos.utils_pb2 import Relation, RelationNode
-
-FilePagePositions = dict[int, tuple[int, int]]
-
-METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
-    Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
-    Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
-    Metadata.Status.PENDING: ResourceProcessingStatus.PENDING.name,
-    Metadata.Status.BLOCKED: ResourceProcessingStatus.BLOCKED.name,
-    Metadata.Status.EXPIRED: ResourceProcessingStatus.EXPIRED.name,
-}
-
-
-@dataclass
-class ParagraphClassifications:
-    valid: dict[str, list[str]]
-    denied: dict[str, list[str]]
-
-
-class ResourceBrain:
-    def __init__(self, rid: str):
-        self.rid = rid
-        ridobj = ResourceID(uuid=rid)
-        self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
-        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
-
-    def apply_field_text(self, field_key: str, text: str, replace_field: bool):
-        self.brain.texts[field_key].text = text
-        if replace_field:
-            field_type, field_name = field_key.split("/")
-            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
-            self.brain.texts_to_delete.append(full_field_id)
-
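The `field_key` used throughout this class is a `<type>/<name>` pair, and `replace_field` queues the fully qualified field id for deletion before reindexing. A sketch of that id expansion, assuming `ids.FieldId.full()` joins the resource id and the field key with slashes as the surrounding code suggests (`rid123` and `t/summary` are made-up values):

```python
rid = "rid123"  # hypothetical resource uuid
field_key = "t/summary"  # "<type>/<name>", as split by apply_field_text
field_type, field_name = field_key.split("/")
# Assumed layout of ids.FieldId(...).full(): "{rid}/{type}/{name}"
full_field_id = f"{rid}/{field_type}/{field_name}"
assert full_field_id == "rid123/t/summary"
```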
-    def _get_paragraph_user_classifications(
-        self, basic_user_field_metadata: Optional[UserFieldMetadata]
-    ) -> ParagraphClassifications:
-        pc = ParagraphClassifications(valid={}, denied={})
-        if basic_user_field_metadata is None:
-            return pc
-        for annotated_paragraph in basic_user_field_metadata.paragraphs:
-            for classification in annotated_paragraph.classifications:
-                paragraph_key = compute_paragraph_key(self.rid, annotated_paragraph.key)
-                classif_label = f"/l/{classification.labelset}/{classification.label}"
-                if classification.cancelled_by_user:
-                    pc.denied.setdefault(paragraph_key, []).append(classif_label)
-                else:
-                    pc.valid.setdefault(paragraph_key, []).append(classif_label)
-        return pc
-
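A standalone sketch of the valid/denied split performed above, using plain dicts in place of the protobuf annotation objects (all values hypothetical):

```python
# Stand-ins for UserFieldMetadata paragraph annotations; the real key comes
# from compute_paragraph_key(rid, annotated_paragraph.key).
annotations = [
    {"labelset": "topic", "label": "sports", "cancelled_by_user": False},
    {"labelset": "topic", "label": "politics", "cancelled_by_user": True},
]

valid: dict[str, list[str]] = {}
denied: dict[str, list[str]] = {}
for ann in annotations:
    key = "rid123/t/text/0-120"  # hypothetical paragraph key
    bucket = denied if ann["cancelled_by_user"] else valid
    bucket.setdefault(key, []).append(f"/l/{ann['labelset']}/{ann['label']}")

assert valid == {"rid123/t/text/0-120": ["/l/topic/sports"]}
assert denied == {"rid123/t/text/0-120": ["/l/topic/politics"]}
```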
-    def apply_field_metadata(
-        self,
-        field_key: str,
-        metadata: FieldComputedMetadata,
-        page_positions: Optional[FilePagePositions],
-        extracted_text: Optional[ExtractedText],
-        basic_user_field_metadata: Optional[UserFieldMetadata] = None,
-        *,
-        replace_field: bool = False,
-    ):
-        # To check for duplicate paragraphs
-        unique_paragraphs: set[str] = set()
-
-        # Also expose user classifications
-        user_paragraph_classifications = self._get_paragraph_user_classifications(
-            basic_user_field_metadata
-        )
-
-        # Set paragraphs and labels
-        paragraph_pages = ParagraphPages(page_positions) if page_positions else None
-        for subfield, metadata_split in metadata.split_metadata.items():
-            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
-
-            # For each split of this field
-            for index, paragraph in enumerate(metadata_split.paragraphs):
-                key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-
-                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
-                position = TextPosition(
-                    index=index,
-                    start=paragraph.start,
-                    end=paragraph.end,
-                    start_seconds=paragraph.start_seconds,
-                    end_seconds=paragraph.end_seconds,
-                )
-                page_with_visual = False
-                if paragraph.HasField("page"):
-                    position.page_number = paragraph.page.page
-                    page_with_visual = paragraph.page.page_with_visual
-                    position.in_page = True
-                elif paragraph_pages:
-                    position.page_number = paragraph_pages.get(paragraph.start)
-                    position.in_page = True
-                else:
-                    position.in_page = False
-
-                representation = Representation()
-                if paragraph.HasField("representation"):
-                    representation.file = paragraph.representation.reference_file
-                    representation.is_a_table = paragraph.representation.is_a_table
-
-                p = BrainParagraph(
-                    start=paragraph.start,
-                    end=paragraph.end,
-                    field=field_key,
-                    split=subfield,
-                    index=index,
-                    repeated_in_field=is_paragraph_repeated_in_field(
-                        paragraph,
-                        extracted_text_str,
-                        unique_paragraphs,
-                    ),
-                    metadata=ParagraphMetadata(
-                        position=position,
-                        page_with_visual=page_with_visual,
-                        representation=representation,
-                    ),
-                )
-                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
-                paragraph_labels = {paragraph_kind_label}
-                paragraph_labels.update(
-                    f"/l/{classification.labelset}/{classification.label}"
-                    for classification in paragraph.classifications
-                )
-                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
-                paragraph_labels.difference_update(denied_classifications)
-                p.labels.extend(list(paragraph_labels))
-
-                self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
-
-        extracted_text_str = extracted_text.text if extracted_text else None
-        for index, paragraph in enumerate(metadata.metadata.paragraphs):
-            key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
-            position = TextPosition(
-                index=index,
-                start=paragraph.start,
-                end=paragraph.end,
-                start_seconds=paragraph.start_seconds,
-                end_seconds=paragraph.end_seconds,
-            )
-            page_with_visual = False
-            if paragraph.HasField("page"):
-                position.page_number = paragraph.page.page
-                position.in_page = True
-                page_with_visual = paragraph.page.page_with_visual
-            elif paragraph_pages:
-                position.page_number = paragraph_pages.get(paragraph.start)
-                position.in_page = True
-            else:
-                position.in_page = False
-
-            representation = Representation()
-            if paragraph.HasField("representation"):
-                representation.file = paragraph.representation.reference_file
-                representation.is_a_table = paragraph.representation.is_a_table
-
-            p = BrainParagraph(
-                start=paragraph.start,
-                end=paragraph.end,
-                field=field_key,
-                index=index,
-                repeated_in_field=is_paragraph_repeated_in_field(
-                    paragraph, extracted_text_str, unique_paragraphs
-                ),
-                metadata=ParagraphMetadata(
-                    position=position,
-                    page_with_visual=page_with_visual,
-                    representation=representation,
-                ),
-            )
-            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
-            paragraph_labels = {paragraph_kind_label}
-            paragraph_labels.update(
-                f"/l/{classification.labelset}/{classification.label}"
-                for classification in paragraph.classifications
-            )
-            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
-            paragraph_labels.difference_update(denied_classifications)
-            p.labels.extend(list(paragraph_labels))
-
-            self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
-
-        if replace_field:
-            field_type, field_name = field_key.split("/")
-            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
-            self.brain.paragraphs_to_delete.append(full_field_id)
-
-        field_relations = self.brain.field_relations[field_key].relations
-        for relations in metadata.metadata.relations:
-            for relation in relations.relations:
-                index_relation = IndexRelation(relation=relation)
-                if relation.metadata.HasField("data_augmentation_task_id"):
-                    index_relation.facets.append(f"/g/da/{relation.metadata.data_augmentation_task_id}")
-                field_relations.append(index_relation)
-
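Both loops above key paragraphs by their character span, which is what lets the user-annotation lookups match: the keys built here have the same shape as the ones returned by `compute_paragraph_key`. A sketch of the two shapes with hypothetical values:

```python
rid, field_key, subfield = "rid123", "t/text", "intro"  # hypothetical ids
start, end = 0, 120
# Split paragraphs include the subfield segment; field-level ones do not.
split_key = f"{rid}/{field_key}/{subfield}/{start}-{end}"
field_level_key = f"{rid}/{field_key}/{start}-{end}"
assert split_key == "rid123/t/text/intro/0-120"
assert field_level_key == "rid123/t/text/0-120"
```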
-    def delete_field(self, field_key: str):
-        ftype, fkey = field_key.split("/")
-        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
-        self.brain.texts_to_delete.append(full_field_id)
-        self.brain.paragraphs_to_delete.append(full_field_id)
-        self.brain.sentences_to_delete.append(full_field_id)
-        self.brain.relation_fields_to_delete.append(field_key)
-
-    def apply_field_vectors(
-        self,
-        field_id: str,
-        vo: utils_pb2.VectorObject,
-        *,
-        vectorset: str,
-        replace_field: bool = False,
-        # cut to specific dimension if specified
-        vector_dimension: Optional[int] = None,
-    ):
-        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
-        for subfield, vectors in vo.split_vectors.items():
-            _field_id = ids.FieldId(
-                rid=fid.rid,
-                type=fid.type,
-                key=fid.key,
-                subfield_id=subfield,
-            )
-            # For each split of this field
-            for index, vector in enumerate(vectors.vectors):
-                paragraph_key = ids.ParagraphId(
-                    field_id=_field_id,
-                    paragraph_start=vector.start_paragraph,
-                    paragraph_end=vector.end_paragraph,
-                )
-                sentence_key = ids.VectorId(
-                    field_id=_field_id,
-                    index=index,
-                    vector_start=vector.start,
-                    vector_end=vector.end,
-                )
-                self._apply_field_vector(
-                    field_id,
-                    paragraph_key,
-                    sentence_key,
-                    vector,
-                    vectorset=vectorset,
-                    vector_dimension=vector_dimension,
-                )
-
-        _field_id = ids.FieldId(
-            rid=fid.rid,
-            type=fid.type,
-            key=fid.key,
-        )
-        for index, vector in enumerate(vo.vectors.vectors):
-            paragraph_key = ids.ParagraphId(
-                field_id=_field_id,
-                paragraph_start=vector.start_paragraph,
-                paragraph_end=vector.end_paragraph,
-            )
-            sentence_key = ids.VectorId(
-                field_id=_field_id,
-                index=index,
-                vector_start=vector.start,
-                vector_end=vector.end,
-            )
-            self._apply_field_vector(
-                field_id,
-                paragraph_key,
-                sentence_key,
-                vector,
-                vectorset=vectorset,
-                vector_dimension=vector_dimension,
-            )
-
-        if replace_field:
-            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
-            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
-
-    def _apply_field_vector(
-        self,
-        field_id: str,
-        paragraph_key: ids.ParagraphId,
-        sentence_key: ids.VectorId,
-        vector: utils_pb2.Vector,
-        *,
-        vectorset: str,
-        # cut vectors if a specific dimension is specified
-        vector_dimension: Optional[int] = None,
-    ):
-        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
-        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
-
-        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
-        sentence_pb.vector.extend(vector.vector[:vector_dimension])
-
-        # we only care about the start/stop position of the paragraph for a given sentence here;
-        # the key carries the sentence position
-        sentence_pb.metadata.position.start = vector.start_paragraph
-        sentence_pb.metadata.position.end = vector.end_paragraph
-
-        # does it make sense to copy forward paragraph values here?
-        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
-        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
-
-        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
-
-        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
-
-        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
-
-        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
-
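The single expression `vector.vector[:vector_dimension]` covers both the "cut" and "keep everything" cases, because slicing with a `None` bound is the same as an unbounded slice:

```python
embedding = [0.1, 0.2, 0.3, 0.4]
assert embedding[:None] == [0.1, 0.2, 0.3, 0.4]  # vector_dimension=None: keep all
assert embedding[:2] == [0.1, 0.2]               # vector_dimension=2: truncate
```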
-    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
-        """
-        We purposefully overwrite what we index as a status and DO NOT reflect
-        the actual status with what we index.
-
-        This seems to be on purpose, so the frontend of the product can operate
-        on 2 statuses only -- PENDING and PROCESSED.
-        """
-        # The value of brain.status will either be PROCESSED or PENDING
-        status = basic.metadata.status
-        if previous_status is not None and previous_status != Metadata.Status.PENDING:
-            # Already processed once, so it stays as PROCESSED
-            self.brain.status = PBBrainResource.PROCESSED
-            return
-        # previous_status is None or PENDING
-        if status == Metadata.Status.PENDING:
-            # Stays in pending
-            self.brain.status = PBBrainResource.PENDING
-        else:
-            # Means it has just been processed
-            self.brain.status = PBBrainResource.PROCESSED
-
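The docstring's two-status contract reads naturally as a small pure function. A behavioural sketch with bare strings standing in for the protobuf enum values:

```python
from typing import Optional

def indexed_status(current: str, previous: Optional[str]) -> str:
    # Once a resource has left PENDING, it is indexed as PROCESSED forever.
    if previous is not None and previous != "PENDING":
        return "PROCESSED"
    # previous is None or PENDING
    return "PENDING" if current == "PENDING" else "PROCESSED"

assert indexed_status("PENDING", None) == "PENDING"           # never processed yet
assert indexed_status("ERROR", "PROCESSED") == "PROCESSED"    # errors stay indexed as processed
assert indexed_status("PROCESSED", "PENDING") == "PROCESSED"  # just finished processing
```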
-    def set_security(self, security: utils_pb2.Security):
-        self.brain.security.CopyFrom(security)
-
-    def get_processing_status_tag(self, metadata: Metadata) -> str:
-        if not metadata.useful:
-            return "EMPTY"
-        return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]
-
-    def set_resource_metadata(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
-        self._set_resource_dates(basic, origin)
-        self._set_resource_labels(basic, origin)
-        self._set_resource_relations(basic, origin, user_relations)
-
-    def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
-        if basic.created.seconds > 0:
-            self.brain.metadata.created.CopyFrom(basic.created)
-        else:
-            logging.warning(f"Basic metadata has no created field for {self.rid}")
-            self.brain.metadata.created.GetCurrentTime()
-        if basic.modified.seconds > 0:
-            self.brain.metadata.modified.CopyFrom(basic.modified)
-        else:
-            if basic.created.seconds > 0:
-                self.brain.metadata.modified.CopyFrom(basic.created)
-            else:
-                self.brain.metadata.modified.GetCurrentTime()
-
-        if origin is not None:
-            # overwrite created/modified if provided on origin
-            if origin.HasField("created") and origin.created.seconds > 0:
-                self.brain.metadata.created.CopyFrom(origin.created)
-            if origin.HasField("modified") and origin.modified.seconds > 0:
-                self.brain.metadata.modified.CopyFrom(origin.modified)
-
-    def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
-        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
-        if origin is not None:
-            # origin contributors
-            for contrib in origin.colaborators:
-                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
-                relation = Relation(
-                    relation=Relation.COLAB,
-                    source=relationnodedocument,
-                    to=relationnodeuser,
-                )
-                self.brain.field_relations["a/metadata"].relations.append(
-                    IndexRelation(relation=relation)
-                )
-
-        # labels
-        for classification in basic.usermetadata.classifications:
-            relation_node_label = RelationNode(
-                value=f"{classification.labelset}/{classification.label}",
-                ntype=RelationNode.NodeType.LABEL,
-            )
-            relation = Relation(
-                relation=Relation.ABOUT,
-                source=relationnodedocument,
-                to=relation_node_label,
-            )
-            self.brain.field_relations["a/metadata"].relations.append(IndexRelation(relation=relation))
-
-        # relations
-        for relation in user_relations.relations:
-            self.brain.field_relations["a/metadata"].relations.append(
-                IndexRelation(relation=relation, facets=["/g/u"])
-            )
-
-        self.brain.relation_fields_to_delete.append("a/metadata")
-
-    def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
-        if origin is not None:
-            if origin.source_id:
-                self.labels["o"] = {origin.source_id}
-            # origin tags
-            for tag in origin.tags:
-                self.labels["t"].add(tag)
-            # origin source
-            if origin.source_id != "":
-                self.labels["u"].add(f"s/{origin.source_id}")
-
-            if origin.path:
-                self.labels["p"].add(origin.path.lstrip("/"))
-
-            # origin contributors
-            for contrib in origin.colaborators:
-                self.labels["u"].add(f"o/{contrib}")
-
-            for key, value in origin.metadata.items():
-                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
-
-        # icon
-        self.labels["n"].add(f"i/{basic.icon}")
-
-        # processing status
-        status_tag = self.get_processing_status_tag(basic.metadata)
-        self.labels["n"].add(f"s/{status_tag}")
-
-        # main language
-        if basic.metadata.language:
-            self.labels["s"].add(f"p/{basic.metadata.language}")
-
-        # all languages
-        for lang in basic.metadata.languages:
-            self.labels["s"].add(f"s/{lang}")
-
-        # labels
-        for classification in basic.usermetadata.classifications:
-            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
-
-        # hidden
-        if basic.hidden:
-            _, p1, p2 = LABEL_HIDDEN.split("/")
-            self.labels[p1].add(p2)
-
-        self.brain.ClearField("labels")
-        self.brain.labels.extend(flatten_resource_labels(self.labels))
-
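All of these per-prefix sets are ultimately flattened into facet strings. A minimal re-implementation of the flattening for illustration, assuming `flatten_resource_labels` (from `nucliadb_models.labels`) emits one `/prefix/value` facet per entry:

```python
def flatten(labels: dict[str, set[str]]) -> list[str]:
    # Assumed facet shape: "/{prefix}/{value}"; sorted here only to make the
    # example deterministic.
    return [f"/{prefix}/{value}" for prefix, values in labels.items() for value in sorted(values)]

labels = {"n": {"i/application/pdf", "s/PROCESSED"}, "t": {"tag1"}}
assert flatten(labels) == ["/n/i/application/pdf", "/n/s/PROCESSED", "/t/tag1"]
```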
-    def process_field_metadata(
-        self,
-        field_key: str,
-        metadata: FieldMetadata,
-        labels: dict[str, set[str]],
-        relation_node_document: RelationNode,
-        user_canceled_labels: set[str],
-    ):
-        if metadata.mime_type != "":
-            labels["mt"].add(metadata.mime_type)
-
-        base_classification_relation = Relation(
-            relation=Relation.ABOUT,
-            source=relation_node_document,
-            to=RelationNode(
-                ntype=RelationNode.NodeType.LABEL,
-            ),
-        )
-        for classification in metadata.classifications:
-            label = f"{classification.labelset}/{classification.label}"
-            if label not in user_canceled_labels:
-                labels["l"].add(label)
-                relation = Relation()
-                relation.CopyFrom(base_classification_relation)
-                relation.to.value = label
-                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))
-
-        # Data Augmentation + Processor entities
-        base_entity_relation = Relation(
-            relation=Relation.ENTITY,
-            source=relation_node_document,
-            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
-        )
-        use_legacy_entities = True
-        for data_augmentation_task_id, entities in metadata.entities.items():
-            # If we received the entities from the processor here, we don't want to use the legacy entities
-            # TODO: Remove this when the processor doesn't use this anymore
-            if data_augmentation_task_id == "processor":
-                use_legacy_entities = False
-
-            for ent in entities.entities:
-                entity_text = ent.text
-                entity_label = ent.label
-                # Seems like we don't care about where the entity is in the text
-                # entity_positions = entity.positions
-                labels["e"].add(
-                    f"{entity_label}/{entity_text}"
-                )  # Add data_augmentation_task_id as a prefix?
-                relation = Relation()
-                relation.CopyFrom(base_entity_relation)
-                relation.to.value = entity_text
-                relation.to.subtype = entity_label
-                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))
-
-        # Legacy processor entities
-        # TODO: Remove once the processor doesn't use this anymore, and remove the positions and ner fields from the message
-        def _parse_entity(klass_entity: str) -> tuple[str, str]:
-            try:
-                klass, entity = klass_entity.split("/", 1)
-                return klass, entity
-            except ValueError:
-                raise AttributeError(f"Entity is missing a type prefix: {klass_entity}")
-
-        if use_legacy_entities:
-            for klass_entity in metadata.positions.keys():
-                labels["e"].add(klass_entity)
-                klass, entity = _parse_entity(klass_entity)
-                relation = Relation()
-                relation.CopyFrom(base_entity_relation)
-                relation.to.value = entity
-                relation.to.subtype = klass
-                self.brain.field_relations[field_key].relations.append(IndexRelation(relation=relation))
-
-    def apply_field_labels(
-        self,
-        field_key: str,
-        metadata: Optional[FieldComputedMetadata],
-        uuid: str,
-        generated_by: Optional[FieldAuthor],
-        basic_user_metadata: Optional[UserMetadata] = None,
-        basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
-    ):
-        user_canceled_labels: set[str] = set()
-        if basic_user_metadata is not None:
-            user_canceled_labels.update(
-                f"{classification.labelset}/{classification.label}"
-                for classification in basic_user_metadata.classifications
-                if classification.cancelled_by_user
-            )
-        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
-        labels: dict[str, set[str]] = {
-            "l": set(),  # classification labels
-            "e": set(),  # entities
-            "mt": set(),  # mime type
-            "g/da": set(),  # generated by
-        }
-        if metadata is not None:
-            for meta in metadata.split_metadata.values():
-                self.process_field_metadata(
-                    field_key,
-                    meta,
-                    labels,
-                    relation_node_resource,
-                    user_canceled_labels,
-                )
-            self.process_field_metadata(
-                field_key,
-                metadata.metadata,
-                labels,
-                relation_node_resource,
-                user_canceled_labels,
-            )
-
-        if basic_user_fieldmetadata is not None:
-            for paragraph_annotation in basic_user_fieldmetadata.paragraphs:
-                for classification in paragraph_annotation.classifications:
-                    if not classification.cancelled_by_user:
-                        label = f"/l/{classification.labelset}/{classification.label}"
-                        # FIXME: this condition avoids adding duplicate labels
-                        # while importing a kb. We shouldn't add duplicates in
-                        # the first place
-                        if (
-                            label
-                            not in self.brain.paragraphs[field_key]
-                            .paragraphs[paragraph_annotation.key]
-                            .labels
-                        ):
-                            self.brain.paragraphs[field_key].paragraphs[
-                                paragraph_annotation.key
-                            ].labels.append(label)
-
-        if generated_by is not None and generated_by.WhichOneof("author") == "data_augmentation":
-            field_type, field_id = field_key.split("/")
-            da_task_id = ids.extract_data_augmentation_id(field_id)
-            if da_task_id is None:  # pragma: nocover
-                logger.warning(
-                    "Data augmentation field id has an unexpected format! Skipping label",
-                    extra={
-                        "rid": uuid,
-                        "field_id": field_id,
-                    },
-                )
-            else:
-                labels["g/da"].add(da_task_id)
-
-        flat_labels = flatten_resource_labels(labels)
-        if len(flat_labels) > 0:
-            self.brain.texts[field_key].labels.extend(flat_labels)
-
-
-def is_paragraph_repeated_in_field(
-    paragraph: Paragraph,
-    extracted_text: Optional[str],
-    unique_paragraphs: set[str],
-) -> bool:
-    if extracted_text is None:
-        return False
-
-    paragraph_text = extracted_text[paragraph.start : paragraph.end]
-    if len(paragraph_text) == 0:
-        return False
-
-    if paragraph_text in unique_paragraphs:
-        repeated_in_field = True
-    else:
-        repeated_in_field = False
-        unique_paragraphs.add(paragraph_text)
-    return repeated_in_field
-
-
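Because `unique_paragraphs` is threaded through every call, only the first occurrence of a given text counts as unique. A behavioural sketch without the protobuf `Paragraph` type:

```python
def repeated(paragraph_text: str, unique_paragraphs: set[str]) -> bool:
    # Mirrors is_paragraph_repeated_in_field once the text slice is extracted.
    if not paragraph_text:
        return False
    if paragraph_text in unique_paragraphs:
        return True
    unique_paragraphs.add(paragraph_text)
    return False

seen: set[str] = set()
assert repeated("hello world.", seen) is False  # first occurrence: recorded
assert repeated("hello world.", seen) is True   # second occurrence: repeated
assert repeated("", seen) is False              # empty slices never count
```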
-class ParagraphPages:
-    """
-    Class to get the page number for a given paragraph in an optimized way.
-    """
-
-    def __init__(self, positions: FilePagePositions):
-        self.positions = positions
-        self._materialized = self._materialize_page_numbers(positions)
-
-    def _materialize_page_numbers(self, positions: FilePagePositions) -> list[int]:
-        page_numbers_by_index = []
-        for page_number, (page_start, page_end) in positions.items():
-            page_numbers_by_index.extend([page_number] * (page_end - page_start + 1))
-        return page_numbers_by_index
-
-    def get(self, paragraph_start_index: int) -> int:
-        try:
-            return self._materialized[paragraph_start_index]
-        except IndexError:
-            logger.error(
-                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"  # noqa
-            )
-            if len(self._materialized) > 0:
-                return self._materialized[-1]
-            return 0
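`_materialize_page_numbers` trades memory for constant-time lookups: each page's `(start, end)` character range expands to one list entry per character index, so `get()` is a plain list access. For example:

```python
# Two ten-character pages: characters 0-9 on page 0, 10-19 on page 1.
positions = {0: (0, 9), 1: (10, 19)}
materialized: list[int] = []
for page_number, (page_start, page_end) in positions.items():
    materialized.extend([page_number] * (page_end - page_start + 1))

assert len(materialized) == 20
assert materialized[0] == 0   # character 0 -> page 0
assert materialized[15] == 1  # character 15 -> page 1
```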