nucliadb 6.3.5.post3985__py3-none-any.whl → 6.3.5.post3995__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +3 -2
- nucliadb/common/cluster/rollover.py +3 -3
- nucliadb/common/cluster/utils.py +8 -4
- nucliadb/common/external_index_providers/pinecone.py +7 -44
- nucliadb/ingest/fields/exceptions.py +4 -0
- nucliadb/ingest/orm/brain_v2.py +782 -0
- nucliadb/ingest/orm/index_message.py +409 -0
- nucliadb/ingest/orm/metrics.py +1 -1
- nucliadb/ingest/orm/processor/data_augmentation.py +2 -2
- nucliadb/ingest/orm/processor/pgcatalog.py +3 -2
- nucliadb/ingest/orm/processor/processor.py +61 -47
- nucliadb/ingest/orm/resource.py +70 -50
- nucliadb/ingest/orm/utils.py +1 -2
- nucliadb/ingest/processing.py +2 -54
- nucliadb/ingest/service/writer.py +2 -2
- nucliadb/models/internal/__init__.py +19 -0
- nucliadb/models/internal/processing.py +160 -0
- nucliadb/writer/api/v1/field.py +1 -1
- nucliadb/writer/api/v1/resource.py +2 -1
- nucliadb/writer/api/v1/upload.py +1 -1
- nucliadb/writer/resource/basic.py +2 -3
- nucliadb/writer/resource/field.py +13 -14
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/METADATA +6 -6
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/RECORD +27 -23
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/brain_v2.py
@@ -0,0 +1,782 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+import logging
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional
+
+from nucliadb.common import ids
+from nucliadb.ingest import logger
+from nucliadb.ingest.orm.utils import compute_paragraph_key
+from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
+from nucliadb_models.metadata import ResourceProcessingStatus
+from nucliadb_protos import utils_pb2
+from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
+from nucliadb_protos.noderesources_pb2 import (
+    IndexRelation,
+    ParagraphMetadata,
+    Representation,
+    ResourceID,
+)
+from nucliadb_protos.noderesources_pb2 import Position as TextPosition
+from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
+from nucliadb_protos.resources_pb2 import (
+    Basic,
+    ExtractedText,
+    FieldAuthor,
+    FieldComputedMetadata,
+    Metadata,
+    Origin,
+    Paragraph,
+    Relations,
+    UserFieldMetadata,
+    UserMetadata,
+)
+from nucliadb_protos.utils_pb2 import Relation, RelationNode
+
+FilePagePositions = dict[int, tuple[int, int]]
+
+METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
+    Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
+    Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
+    Metadata.Status.PENDING: ResourceProcessingStatus.PENDING.name,
+    Metadata.Status.BLOCKED: ResourceProcessingStatus.BLOCKED.name,
+    Metadata.Status.EXPIRED: ResourceProcessingStatus.EXPIRED.name,
+}
+
+
+@dataclass
+class ParagraphClassifications:
+    valid: dict[str, list[str]]
+    denied: dict[str, list[str]]
+
+
+class ResourceBrainV2:
+    def __init__(self, rid: str):
+        self.rid = rid
+        self.brain: PBBrainResource = PBBrainResource(resource=ResourceID(uuid=rid))
+        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
+
+    def generate_resource_indexing_metadata(
+        self,
+        basic: Basic,
+        user_relations: Relations,
+        origin: Optional[Origin],
+        previous_processing_status: Optional[Metadata.Status.ValueType],
+        security: Optional[utils_pb2.Security],
+    ) -> None:
+        self._set_resource_status(basic, previous_processing_status)
+        self._set_resource_dates(basic, origin)
+        self._set_resource_labels(basic, origin)
+        self._set_resource_relations(basic, origin, user_relations)
+        if security is not None:
+            self._set_resource_security(security)
+
+    def generate_texts_index_message(
+        self,
+        field_key: str,
+        extracted_text: ExtractedText,
+        field_computed_metadata: Optional[FieldComputedMetadata],
+        basic_user_metadata: Optional[UserMetadata],
+        field_author: Optional[FieldAuthor],
+        replace_field: bool,
+        skip_index: bool,
+    ) -> None:
+        self.apply_field_text(
+            field_key,
+            extracted_text,
+            replace_field=replace_field,
+            skip_texts=skip_index,
+        )
+        self.apply_field_labels(
+            field_key,
+            field_computed_metadata,
+            field_author,
+            basic_user_metadata,
+        )
+
+    def apply_field_text(
+        self,
+        field_key: str,
+        extracted_text: ExtractedText,
+        replace_field: bool,
+        skip_texts: Optional[bool],
+    ):
+        if skip_texts is not None:
+            self.brain.skip_texts = skip_texts
+        field_text = extracted_text.text
+        for _, split in extracted_text.split_text.items():
+            field_text += f" {split} "
+        self.brain.texts[field_key].text = field_text
+
+        if replace_field:
+            ftype, fkey = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
+            self.brain.texts_to_delete.append(full_field_id)
+
+    def apply_field_labels(
+        self,
+        field_key: str,
+        field_computed_metadata: Optional[FieldComputedMetadata],
+        field_author: Optional[FieldAuthor],
+        basic_user_metadata: Optional[UserMetadata] = None,
+    ):
+        user_cancelled_labels: set[str] = (
+            set(
+                [
+                    f"{classification.labelset}/{classification.label}"
+                    for classification in basic_user_metadata.classifications
+                    if classification.cancelled_by_user
+                ]
+            )
+            if basic_user_metadata
+            else set()
+        )
+        labels: dict[str, set[str]] = {
+            "l": set(),  # classification labels
+            "e": set(),  # entities
+            "mt": set(),  # mime type
+            "g/da": set(),  # generated by
+        }
+        if field_computed_metadata is not None:
+            metadatas = list(field_computed_metadata.split_metadata.values())
+            metadatas.append(field_computed_metadata.metadata)
+            for metadata in metadatas:
+                if metadata.mime_type != "":
+                    labels["mt"].add(metadata.mime_type)
+                for classification in metadata.classifications:
+                    label = f"{classification.labelset}/{classification.label}"
+                    if label not in user_cancelled_labels:
+                        labels["l"].add(label)
+                use_legacy_entities = True
+                for data_augmentation_task_id, entities in metadata.entities.items():
+                    # If we received the entities from the processor here, we don't want to use the legacy entities
+                    # TODO: Remove this when processor doesn't use this anymore
+                    if data_augmentation_task_id == "processor":
+                        use_legacy_entities = False
+                    for ent in entities.entities:
+                        entity_text = ent.text
+                        entity_label = ent.label
+                        # Seems like we don't care about where the entity is in the text
+                        # entity_positions = entity.positions
+                        labels["e"].add(
+                            f"{entity_label}/{entity_text}"
+                        )  # Add data_augmentation_task_id as a prefix?
+                # Legacy processor entities
+                if use_legacy_entities:
+                    for klass_entity in metadata.positions.keys():
+                        labels["e"].add(klass_entity)
+
+        if field_author is not None and field_author.WhichOneof("author") == "data_augmentation":
+            field_type, field_id = field_key.split("/")
+            da_task_id = ids.extract_data_augmentation_id(field_id)
+            if da_task_id is None:  # pragma: nocover
+                logger.warning(
+                    "Data augmentation field id has an unexpected format! Skipping label",
+                    extra={
+                        "field_id": field_id,
+                        "field_type": field_type,
+                    },
+                )
+            else:
+                labels["g/da"].add(da_task_id)
+
+        self.brain.texts[field_key].labels.extend(flatten_resource_labels(labels))
+
+    def generate_paragraphs_index_message(
+        self,
+        field_key: str,
+        field_computed_metadata: FieldComputedMetadata,
+        extracted_text: ExtractedText,
+        page_positions: Optional[FilePagePositions],
+        user_field_metadata: Optional[UserFieldMetadata],
+        replace_field: bool,
+        skip_index: Optional[bool],
+    ) -> None:
+        # We need to add the extracted text to the texts section of the Resource so that
+        # the paragraphs can be indexed
+        self.apply_field_text(
+            field_key,
+            extracted_text,
+            replace_field=False,
+            skip_texts=None,
+        )
+        self.apply_field_paragraphs(
+            field_key,
+            field_computed_metadata,
+            extracted_text,
+            page_positions,
+            user_field_metadata,
+            replace_field=replace_field,
+            skip_paragraphs=skip_index,
+        )
+
+    def apply_field_paragraphs(
+        self,
+        field_key: str,
+        field_computed_metadata: FieldComputedMetadata,
+        extracted_text: ExtractedText,
+        page_positions: Optional[FilePagePositions],
+        user_field_metadata: Optional[UserFieldMetadata],
+        replace_field: bool,
+        skip_paragraphs: Optional[bool],
+    ) -> None:
+        if skip_paragraphs is not None:
+            self.brain.skip_paragraphs = skip_paragraphs
+        unique_paragraphs: set[str] = set()
+        user_paragraph_classifications = self._get_paragraph_user_classifications(user_field_metadata)
+        paragraph_pages = ParagraphPages(page_positions) if page_positions else None
+        # Splits of the field
+        for subfield, field_metadata in field_computed_metadata.split_metadata.items():
+            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+            for idx, paragraph in enumerate(field_metadata.paragraphs):
+                key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
+                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
+                position = TextPosition(
+                    index=idx,
+                    start=paragraph.start,
+                    end=paragraph.end,
+                    start_seconds=paragraph.start_seconds,
+                    end_seconds=paragraph.end_seconds,
+                )
+                page_with_visual = False
+                if paragraph.HasField("page"):
+                    position.page_number = paragraph.page.page
+                    page_with_visual = paragraph.page.page_with_visual
+                    position.in_page = True
+                elif paragraph_pages:
+                    position.page_number = paragraph_pages.get(paragraph.start)
+                    position.in_page = True
+                else:
+                    position.in_page = False
+                representation = Representation()
+                if paragraph.HasField("representation"):
+                    representation.file = paragraph.representation.reference_file
+                    representation.is_a_table = paragraph.representation.is_a_table
+                p = BrainParagraph(
+                    start=paragraph.start,
+                    end=paragraph.end,
+                    field=field_key,
+                    split=subfield,
+                    index=idx,
+                    repeated_in_field=is_paragraph_repeated_in_field(
+                        paragraph,
+                        extracted_text_str,
+                        unique_paragraphs,
+                    ),
+                    metadata=ParagraphMetadata(
+                        position=position,
+                        page_with_visual=page_with_visual,
+                        representation=representation,
+                    ),
+                )
+                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_labels = {paragraph_kind_label}
+                paragraph_labels.update(
+                    f"/l/{classification.labelset}/{classification.label}"
+                    for classification in paragraph.classifications
+                )
+                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+                paragraph_labels.difference_update(denied_classifications)
+                p.labels.extend(list(paragraph_labels))
+                self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
+
+        # Main field
+        extracted_text_str = extracted_text.text if extracted_text else None
+        for idx, paragraph in enumerate(field_computed_metadata.metadata.paragraphs):
+            key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
+            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
+            position = TextPosition(
+                index=idx,
+                start=paragraph.start,
+                end=paragraph.end,
+                start_seconds=paragraph.start_seconds,
+                end_seconds=paragraph.end_seconds,
+            )
+            page_with_visual = False
+            if paragraph.HasField("page"):
+                position.page_number = paragraph.page.page
+                position.in_page = True
+                page_with_visual = paragraph.page.page_with_visual
+            elif paragraph_pages:
+                position.page_number = paragraph_pages.get(paragraph.start)
+                position.in_page = True
+            else:
+                position.in_page = False
+            representation = Representation()
+            if paragraph.HasField("representation"):
+                representation.file = paragraph.representation.reference_file
+                representation.is_a_table = paragraph.representation.is_a_table
+            p = BrainParagraph(
+                start=paragraph.start,
+                end=paragraph.end,
+                field=field_key,
+                index=idx,
+                repeated_in_field=is_paragraph_repeated_in_field(
+                    paragraph, extracted_text_str, unique_paragraphs
+                ),
+                metadata=ParagraphMetadata(
+                    position=position,
+                    page_with_visual=page_with_visual,
+                    representation=representation,
+                ),
+            )
+            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_labels = {paragraph_kind_label}
+            paragraph_labels.update(
+                f"/l/{classification.labelset}/{classification.label}"
+                for classification in paragraph.classifications
+            )
+            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+            paragraph_labels.difference_update(denied_classifications)
+            p.labels.extend(list(paragraph_labels))
+
+            self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
+
+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
+    def _get_paragraph_user_classifications(
+        self, basic_user_field_metadata: Optional[UserFieldMetadata]
+    ) -> ParagraphClassifications:
+        pc = ParagraphClassifications(valid={}, denied={})
+        if basic_user_field_metadata is None:
+            return pc
+        for annotated_paragraph in basic_user_field_metadata.paragraphs:
+            for classification in annotated_paragraph.classifications:
+                paragraph_key = compute_paragraph_key(self.rid, annotated_paragraph.key)
+                classif_label = f"/l/{classification.labelset}/{classification.label}"
+                if classification.cancelled_by_user:
+                    pc.denied.setdefault(paragraph_key, []).append(classif_label)
+                else:
+                    pc.valid.setdefault(paragraph_key, []).append(classif_label)
+        return pc
+
+    def generate_relations_index_message(
+        self,
+        field_key: str,
+        field_computed_metadata: Optional[FieldComputedMetadata],
+        basic_user_metadata: Optional[UserMetadata],
+        replace_field: bool,
+    ) -> None:
+        user_cancelled_labels: set[str] = (
+            set(
+                [
+                    f"{classification.labelset}/{classification.label}"
+                    for classification in basic_user_metadata.classifications
+                    if classification.cancelled_by_user
+                ]
+            )
+            if basic_user_metadata
+            else set()
+        )
+
+        field_relations = self.brain.field_relations[field_key].relations
+
+        # Index relations that are computed by the processor
+        if field_computed_metadata is not None:
+            relation_node_document = RelationNode(
+                value=self.brain.resource.uuid,
+                ntype=RelationNode.NodeType.RESOURCE,
+            )
+            field_metadatas = list(field_computed_metadata.split_metadata.values())
+            field_metadatas.append(field_computed_metadata.metadata)
+            for field_metadata in field_metadatas:
+                # Relations computed by the processor
+                for relations in field_metadata.relations:
+                    for relation in relations.relations:
+                        index_relation = IndexRelation(relation=relation)
+                        if relation.metadata.HasField("data_augmentation_task_id"):
+                            index_relation.facets.append(
+                                f"/g/da/{relation.metadata.data_augmentation_task_id}"
+                            )
+                        field_relations.append(index_relation)
+                # Entities computed by the processor or ingestion agents
+                base_entity_relation = Relation(
+                    relation=Relation.ENTITY,
+                    source=relation_node_document,
+                    to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+                )
+                use_legacy_entities = True
+                for data_augmentation_task_id, entities in field_metadata.entities.items():
+                    # If we received the entities from the processor here, we don't want to use the legacy entities
+                    # TODO: Remove this when processor doesn't use this anymore
+                    if data_augmentation_task_id == "processor":
+                        use_legacy_entities = False
+
+                    for ent in entities.entities:
+                        entity_text = ent.text
+                        entity_label = ent.label
+                        relation = Relation()
+                        relation.CopyFrom(base_entity_relation)
+                        relation.to.value = entity_text
+                        relation.to.subtype = entity_label
+                        field_relations.append(IndexRelation(relation=relation))
+
+                # Legacy processor entities
+                # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+                def _parse_entity(klass_entity: str) -> tuple[str, str]:
+                    try:
+                        klass, entity = klass_entity.split("/", 1)
+                        return klass, entity
+                    except ValueError:
+                        raise AttributeError(f"Entity should be with type {klass_entity}")
+
+                if use_legacy_entities:
+                    for klass_entity in field_metadata.positions.keys():
+                        klass, entity = _parse_entity(klass_entity)
+                        relation = Relation()
+                        relation.CopyFrom(base_entity_relation)
+                        relation.to.value = entity
+                        relation.to.subtype = klass
+                        field_relations.append(IndexRelation(relation=relation))
+
+                # Relations from field to classifications label
+                base_classification_relation = Relation(
+                    relation=Relation.ABOUT,
+                    source=relation_node_document,
+                    to=RelationNode(
+                        ntype=RelationNode.NodeType.LABEL,
+                    ),
+                )
+                for classification in field_metadata.classifications:
+                    label = f"{classification.labelset}/{classification.label}"
+                    if label in user_cancelled_labels:
+                        continue
+                    relation = Relation()
+                    relation.CopyFrom(base_classification_relation)
+                    relation.to.value = label
+                    field_relations.append(IndexRelation(relation=relation))
+        if replace_field:
+            self.brain.relation_fields_to_delete.append(field_key)
+
+    def delete_field(self, field_key: str):
+        ftype, fkey = field_key.split("/")
+        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
+        self.brain.texts_to_delete.append(full_field_id)
+        self.brain.paragraphs_to_delete.append(full_field_id)
+        self.brain.sentences_to_delete.append(full_field_id)
+        self.brain.relation_fields_to_delete.append(field_key)
+
+    def generate_vectors_index_message(
+        self,
+        field_id: str,
+        vo: utils_pb2.VectorObject,
+        *,
+        vectorset: str,
+        replace_field: bool = False,
+        # cut to specific dimension if specified
+        vector_dimension: Optional[int] = None,
+    ):
+        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
+        for subfield, vectors in vo.split_vectors.items():
+            _field_id = ids.FieldId(
+                rid=fid.rid,
+                type=fid.type,
+                key=fid.key,
+                subfield_id=subfield,
+            )
+            # For each split of this field
+            for index, vector in enumerate(vectors.vectors):
+                paragraph_key = ids.ParagraphId(
+                    field_id=_field_id,
+                    paragraph_start=vector.start_paragraph,
+                    paragraph_end=vector.end_paragraph,
+                )
+                sentence_key = ids.VectorId(
+                    field_id=_field_id,
+                    index=index,
+                    vector_start=vector.start,
+                    vector_end=vector.end,
+                )
+                self._apply_field_vector(
+                    field_id,
+                    paragraph_key,
+                    sentence_key,
+                    vector,
+                    vectorset=vectorset,
+                    vector_dimension=vector_dimension,
+                )
+
+        _field_id = ids.FieldId(
+            rid=fid.rid,
+            type=fid.type,
+            key=fid.key,
+        )
+        for index, vector in enumerate(vo.vectors.vectors):
+            paragraph_key = ids.ParagraphId(
+                field_id=_field_id,
+                paragraph_start=vector.start_paragraph,
+                paragraph_end=vector.end_paragraph,
+            )
+            sentence_key = ids.VectorId(
+                field_id=_field_id,
+                index=index,
+                vector_start=vector.start,
+                vector_end=vector.end,
+            )
+            self._apply_field_vector(
+                field_id,
+                paragraph_key,
+                sentence_key,
+                vector,
+                vectorset=vectorset,
+                vector_dimension=vector_dimension,
+            )
+
+        if replace_field:
+            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
+            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
+
+    def _apply_field_vector(
+        self,
+        field_id: str,
+        paragraph_key: ids.ParagraphId,
+        sentence_key: ids.VectorId,
+        vector: utils_pb2.Vector,
+        *,
+        vectorset: str,
+        # cut vectors if a specific dimension is specified
+        vector_dimension: Optional[int] = None,
+    ):
+        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
+        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
+
+        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
+        sentence_pb.vector.extend(vector.vector[:vector_dimension])
+
+        # we only care about start/stop position of the paragraph for a given sentence here
+        # the key has the sentence position
+        sentence_pb.metadata.position.start = vector.start_paragraph
+        sentence_pb.metadata.position.end = vector.end_paragraph
+
+        # does it make sense to copy forward paragraph values here?
+        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
+        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
+
+        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
+
+        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
+
+        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
+
+        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
+
+    def _set_resource_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
+        """
+        We purposefully overwrite what we index as a status and DO NOT reflect
+        the actual status with what we index.
+
+        This seems to be on purpose so the frontend of the product can operate
+        on 2 statuses only -- PENDING and PROCESSED.
+        """
+        # The value of brain.status will either be PROCESSED or PENDING
+        status = basic.metadata.status
+        if previous_status is not None and previous_status != Metadata.Status.PENDING:
+            # Already processed once, so it stays as PROCESSED
+            self.brain.status = PBBrainResource.PROCESSED
+            return
+        # previous_status is None or PENDING
+        if status == Metadata.Status.PENDING:
+            # Stays in pending
+            self.brain.status = PBBrainResource.PENDING
+        else:
+            # Means it has just been processed
+            self.brain.status = PBBrainResource.PROCESSED
+
+    def _set_resource_security(self, security: utils_pb2.Security):
+        self.brain.security.CopyFrom(security)
+
+    def get_processing_status_tag(self, metadata: Metadata) -> str:
+        if not metadata.useful:
+            return "EMPTY"
+        return METADATA_STATUS_PB_TYPE_TO_NAME_MAP[metadata.status]
+
+    def _set_resource_dates(self, basic: Basic, origin: Optional[Origin]):
+        """
+        Adds the user-defined dates to the brain object. This is at resource level and applies to
+        all fields of the resource.
+        """
+        if basic.created.seconds > 0:
+            self.brain.metadata.created.CopyFrom(basic.created)
+        else:
+            logging.warning(f"Basic metadata has no created field for {self.rid}")
+            self.brain.metadata.created.GetCurrentTime()
+        if basic.modified.seconds > 0:
+            self.brain.metadata.modified.CopyFrom(basic.modified)
+        else:
+            if basic.created.seconds > 0:
+                self.brain.metadata.modified.CopyFrom(basic.created)
+            else:
+                self.brain.metadata.modified.GetCurrentTime()
+
+        if origin is not None:
+            # overwrite created/modified if provided on origin
+            if origin.HasField("created") and origin.created.seconds > 0:
+                self.brain.metadata.created.CopyFrom(origin.created)
+            if origin.HasField("modified") and origin.modified.seconds > 0:
+                self.brain.metadata.modified.CopyFrom(origin.modified)
+
+    def _set_resource_relations(self, basic: Basic, origin: Optional[Origin], user_relations: Relations):
+        """
+        Adds the relations to the brain object corresponding to the user-defined metadata at the resource level:
+        - Contributors of the document
+        - Classification labels
+        - Relations
+        """
+        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
+        if origin is not None:
+            # origin contributors
+            for contrib in origin.colaborators:
+                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
+                relation = Relation(
+                    relation=Relation.COLAB,
+                    source=relationnodedocument,
+                    to=relationnodeuser,
+                )
+                self.brain.field_relations["a/metadata"].relations.append(
+                    IndexRelation(relation=relation)
+                )
+
+        # labels
+        for classification in basic.usermetadata.classifications:
+            if classification.cancelled_by_user:
+                continue
+            relation_node_label = RelationNode(
+                value=f"{classification.labelset}/{classification.label}",
+                ntype=RelationNode.NodeType.LABEL,
+            )
+            relation = Relation(
+                relation=Relation.ABOUT,
+                source=relationnodedocument,
+                to=relation_node_label,
+            )
+            self.brain.field_relations["a/metadata"].relations.append(IndexRelation(relation=relation))
+
+        # relations
+        for relation in user_relations.relations:
+            self.brain.field_relations["a/metadata"].relations.append(
+                IndexRelation(relation=relation, facets=["/g/u"])
+            )
+
+        self.brain.relation_fields_to_delete.append("a/metadata")
+
+    def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
+        """
+        Adds the resource-level labels to the brain object.
+        These labels are user-defined in basic or origin metadata.
+        """
+        if origin is not None:
+            if origin.source_id:
+                self.labels["o"] = {origin.source_id}
+            # origin tags
+            for tag in origin.tags:
+                self.labels["t"].add(tag)
+            # origin source
+            if origin.source_id != "":
+                self.labels["u"].add(f"s/{origin.source_id}")
+
+            if origin.path:
+                self.labels["p"].add(origin.path.lstrip("/"))
+
+            # origin contributors
+            for contrib in origin.colaborators:
+                self.labels["u"].add(f"o/{contrib}")
+
+            for key, value in origin.metadata.items():
+                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
+
+        # icon
+        self.labels["n"].add(f"i/{basic.icon}")
+
+        # processing status
+        status_tag = self.get_processing_status_tag(basic.metadata)
+        self.labels["n"].add(f"s/{status_tag}")
+
+        # main language
+        if basic.metadata.language:
+            self.labels["s"].add(f"p/{basic.metadata.language}")
+
+        # all languages
+        for lang in basic.metadata.languages:
+            self.labels["s"].add(f"s/{lang}")
+
+        # labels
+        for classification in basic.usermetadata.classifications:
+            if classification.cancelled_by_user:
+                continue
+            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
+
+        # hidden
+        if basic.hidden:
+            _, p1, p2 = LABEL_HIDDEN.split("/")
+            self.labels[p1].add(p2)
+
+        self.brain.ClearField("labels")
+        self.brain.labels.extend(flatten_resource_labels(self.labels))
+
+
+def is_paragraph_repeated_in_field(
+    paragraph: Paragraph,
+    extracted_text: Optional[str],
+    unique_paragraphs: set[str],
+) -> bool:
+    if extracted_text is None:
+        return False
+
+    paragraph_text = extracted_text[paragraph.start : paragraph.end]
+    if len(paragraph_text) == 0:
+        return False
+
+    if paragraph_text in unique_paragraphs:
+        repeated_in_field = True
+    else:
+        repeated_in_field = False
+        unique_paragraphs.add(paragraph_text)
+    return repeated_in_field
+
+
+class ParagraphPages:
+    """
+    Class to get the page number for a given paragraph in an optimized way.
+    """
+
+    def __init__(self, positions: FilePagePositions):
+        self.positions = positions
+        self._materialized = self._materialize_page_numbers(positions)
+
+    def _materialize_page_numbers(self, positions: FilePagePositions) -> list[int]:
+        page_numbers_by_index = []
+        for page_number, (page_start, page_end) in positions.items():
+            page_numbers_by_index.extend([page_number] * (page_end - page_start + 1))
+        return page_numbers_by_index
+
+    def get(self, paragraph_start_index: int) -> int:
+        try:
+            return self._materialized[paragraph_start_index]
+        except IndexError:
+            logger.error(
+                f"Could not find a page for the given index: {paragraph_start_index}. Page positions: {self.positions}"  # noqa
+            )
+            if len(self._materialized) > 0:
+                return self._materialized[-1]
+            return 0