nucliadb 6.2.1.post2746__py3-none-any.whl → 6.2.1.post2755__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nucliadb/common/external_index_providers/pinecone.py +1 -0
- nucliadb/common/models_utils/__init__.py +19 -0
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/ingest/orm/brain.py +12 -2
- nucliadb/ingest/orm/processor/processor.py +5 -2
- nucliadb/ingest/orm/resource.py +4 -0
- nucliadb/ingest/serialize.py +21 -26
- nucliadb/reader/api/models.py +1 -3
- nucliadb/reader/api/v1/download.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +3 -3
- nucliadb/reader/api/v1/resource.py +6 -9
- nucliadb/reader/api/v1/services.py +4 -4
- nucliadb/search/api/v1/feedback.py +3 -3
- nucliadb/search/api/v1/knowledgebox.py +2 -1
- nucliadb/search/api/v1/search.py +2 -1
- nucliadb/search/search/chat/prompt.py +3 -2
- nucliadb/search/search/chat/query.py +2 -1
- nucliadb/search/search/find.py +2 -1
- nucliadb/search/search/merge.py +14 -4
- nucliadb/search/search/query.py +10 -2
- nucliadb/writer/api/v1/knowledgebox.py +1 -0
- nucliadb/writer/api/v1/services.py +2 -1
- nucliadb/writer/resource/basic.py +7 -6
- nucliadb/writer/resource/field.py +4 -7
- {nucliadb-6.2.1.post2746.dist-info → nucliadb-6.2.1.post2755.dist-info}/METADATA +5 -5
- {nucliadb-6.2.1.post2746.dist-info → nucliadb-6.2.1.post2755.dist-info}/RECORD +31 -28
- {nucliadb-6.2.1.post2746.dist-info → nucliadb-6.2.1.post2755.dist-info}/WHEEL +0 -0
- {nucliadb-6.2.1.post2746.dist-info → nucliadb-6.2.1.post2755.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.2.1.post2746.dist-info → nucliadb-6.2.1.post2755.dist-info}/top_level.txt +0 -0
- {nucliadb-6.2.1.post2746.dist-info → nucliadb-6.2.1.post2755.dist-info}/zip-safe +0 -0
@@ -441,6 +441,7 @@ class PineconeIndexManager(ExternalIndexManager):
|
|
441
441
|
|
442
442
|
def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
|
443
443
|
prefixes_to_delete = set()
|
444
|
+
# TODO: migrate to vector_prefixes_to_delete
|
444
445
|
for field_id in index_data.sentences_to_delete:
|
445
446
|
try:
|
446
447
|
delete_vid = VectorId.from_string(field_id)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
@@ -0,0 +1,479 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
|
22
|
+
from typing import Any
|
23
|
+
|
24
|
+
from google.protobuf.json_format import MessageToDict
|
25
|
+
|
26
|
+
from nucliadb_models.common import Classification, FieldID, FieldTypeName
|
27
|
+
from nucliadb_models.conversation import Conversation, FieldConversation
|
28
|
+
from nucliadb_models.entities import EntitiesGroup, EntitiesGroupSummary, Entity
|
29
|
+
from nucliadb_models.extracted import (
|
30
|
+
ExtractedText,
|
31
|
+
FieldComputedMetadata,
|
32
|
+
FieldQuestionAnswers,
|
33
|
+
FileExtractedData,
|
34
|
+
LargeComputedMetadata,
|
35
|
+
LinkExtractedData,
|
36
|
+
VectorObject,
|
37
|
+
)
|
38
|
+
from nucliadb_models.file import FieldFile
|
39
|
+
from nucliadb_models.internal.shards import KnowledgeboxShards
|
40
|
+
from nucliadb_models.link import FieldLink
|
41
|
+
from nucliadb_models.metadata import (
|
42
|
+
ComputedMetadata,
|
43
|
+
Extra,
|
44
|
+
FieldClassification,
|
45
|
+
Metadata,
|
46
|
+
Origin,
|
47
|
+
Relation,
|
48
|
+
RelationMetadata,
|
49
|
+
RelationNodeType,
|
50
|
+
RelationType,
|
51
|
+
UserFieldMetadata,
|
52
|
+
UserMetadata,
|
53
|
+
)
|
54
|
+
from nucliadb_models.resource import KnowledgeBoxConfig
|
55
|
+
from nucliadb_models.synonyms import KnowledgeBoxSynonyms
|
56
|
+
from nucliadb_models.text import FieldText
|
57
|
+
from nucliadb_protos import knowledgebox_pb2, resources_pb2, utils_pb2, writer_pb2
|
58
|
+
|
59
|
+
|
60
|
+
def field_type_name(field_type: resources_pb2.FieldType.ValueType) -> FieldTypeName:
    """Translate a protobuf ``FieldType`` enum value into a ``FieldTypeName``.

    Raises ``KeyError`` if ``field_type`` is not one of the values mapped below.
    """
    names_by_pb_type = {
        resources_pb2.FieldType.LINK: FieldTypeName.LINK,
        resources_pb2.FieldType.FILE: FieldTypeName.FILE,
        resources_pb2.FieldType.TEXT: FieldTypeName.TEXT,
        resources_pb2.FieldType.GENERIC: FieldTypeName.GENERIC,
        resources_pb2.FieldType.CONVERSATION: FieldTypeName.CONVERSATION,
    }
    return names_by_pb_type[field_type]
|
68
|
+
|
69
|
+
|
70
|
+
def field_type(field_type: resources_pb2.FieldType.ValueType) -> FieldID.FieldType:
    """Translate a protobuf ``FieldType`` enum value into a ``FieldID.FieldType``.

    Raises ``KeyError`` if ``field_type`` is not one of the values mapped below.
    """
    api_types_by_pb_type = {
        resources_pb2.FieldType.LINK: FieldID.FieldType.LINK,
        resources_pb2.FieldType.FILE: FieldID.FieldType.FILE,
        resources_pb2.FieldType.TEXT: FieldID.FieldType.TEXT,
        resources_pb2.FieldType.GENERIC: FieldID.FieldType.GENERIC,
        resources_pb2.FieldType.CONVERSATION: FieldID.FieldType.CONVERSATION,
    }
    return api_types_by_pb_type[field_type]
|
78
|
+
|
79
|
+
|
80
|
+
def user_field_metadata(message: resources_pb2.UserFieldMetadata) -> UserFieldMetadata:
    """Build the API ``UserFieldMetadata`` model from its protobuf message.

    The message's ``page_selections`` are exposed under the ``selections`` key,
    and the integer field type is replaced by its ``FieldTypeName`` string value.
    """
    # Same serialization options for the message and each page selection:
    # proto field names, defaults included, enums rendered as integers.
    to_dict_options = {
        "preserving_proto_field_name": True,
        "including_default_value_fields": True,
        "use_integers_for_enums": True,
    }
    as_dict = MessageToDict(message, **to_dict_options)
    as_dict["selections"] = [
        MessageToDict(page_selection, **to_dict_options) for page_selection in message.page_selections
    ]
    # Enums were serialized as integers, so the value can be looked up directly.
    field_id = as_dict["field"]
    field_id["field_type"] = field_type_name(field_id["field_type"]).value
    return UserFieldMetadata(**as_dict)
|
98
|
+
|
99
|
+
|
100
|
+
def computed_metadata(message: resources_pb2.ComputedMetadata) -> ComputedMetadata:
    """Build the API ``ComputedMetadata`` model from its protobuf message.

    Each protobuf field classification becomes a ``FieldClassification`` whose
    field type is translated with ``field_type``.
    """
    field_classifications: list[FieldClassification] = [
        FieldClassification(
            field=FieldID(
                field=pb_classification.field.field,
                field_type=field_type(pb_classification.field.field_type),
            ),
            classifications=[
                Classification(label=item.label, labelset=item.labelset)
                for item in pb_classification.classifications
            ],
        )
        for pb_classification in message.field_classifications
    ]
    return ComputedMetadata(field_classifications=field_classifications)
|
115
|
+
|
116
|
+
|
117
|
+
def user_metadata(message: resources_pb2.UserMetadata) -> UserMetadata:
    """Build the API ``UserMetadata`` model from its protobuf message.

    Relations are not taken from the generic dict conversion; they are
    rebuilt with ``convert_pb_relation_to_api``.
    """
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    api_relations = [convert_pb_relation_to_api(pb_relation) for pb_relation in message.relations]
    as_dict["relations"] = api_relations
    return UserMetadata(**as_dict)
|
125
|
+
|
126
|
+
|
127
|
+
# API relation node type -> protobuf relation node type.
RelationNodeTypeMap: dict[RelationNodeType, utils_pb2.RelationNode.NodeType.ValueType] = {
    RelationNodeType.ENTITY: utils_pb2.RelationNode.NodeType.ENTITY,
    RelationNodeType.LABEL: utils_pb2.RelationNode.NodeType.LABEL,
    RelationNodeType.RESOURCE: utils_pb2.RelationNode.NodeType.RESOURCE,
    RelationNodeType.USER: utils_pb2.RelationNode.NodeType.USER,
}

# Protobuf relation node type -> API relation node type.
# Derived as the exact inverse of RelationNodeTypeMap (the mapping is one-to-one).
RelationNodeTypePbMap: dict[utils_pb2.RelationNode.NodeType.ValueType, RelationNodeType] = {
    pb_node_type: api_node_type for api_node_type, pb_node_type in RelationNodeTypeMap.items()
}


# Protobuf relation type -> API relation type.
RelationTypePbMap: dict[utils_pb2.Relation.RelationType.ValueType, RelationType] = {
    utils_pb2.Relation.RelationType.ABOUT: RelationType.ABOUT,
    utils_pb2.Relation.RelationType.CHILD: RelationType.CHILD,
    utils_pb2.Relation.RelationType.COLAB: RelationType.COLAB,
    utils_pb2.Relation.RelationType.ENTITY: RelationType.ENTITY,
    utils_pb2.Relation.RelationType.OTHER: RelationType.OTHER,
    utils_pb2.Relation.RelationType.SYNONYM: RelationType.SYNONYM,
}

# API relation type -> protobuf relation type.
# Derived as the exact inverse of RelationTypePbMap (the mapping is one-to-one).
RelationTypeMap: dict[RelationType, utils_pb2.Relation.RelationType.ValueType] = {
    api_relation_type: pb_relation_type for pb_relation_type, api_relation_type in RelationTypePbMap.items()
}
|
159
|
+
|
160
|
+
|
161
|
+
def convert_pb_relation_to_api(rel: utils_pb2.Relation) -> dict[str, Any]:
    """Convert a protobuf ``Relation`` into the dict layout used by the API models.

    Raises ``KeyError`` if the relation type is missing from ``RelationTypePbMap``.
    """
    # Look the relation type up first, so an unmapped type fails before
    # any node conversion is attempted.
    relation_type = RelationTypePbMap[rel.relation]
    source_node = convert_pb_relation_node_to_api(rel.source)
    target_node = convert_pb_relation_node_to_api(rel.to)
    api_relation: dict[str, Any] = {
        "relation": relation_type,
        "from": source_node,
        "to": target_node,
        "label": rel.relation_label,
        "metadata": relation_metadata(rel.metadata),
    }
    return api_relation
|
169
|
+
|
170
|
+
|
171
|
+
def convert_pb_relation_node_to_api(
    relation_node: utils_pb2.RelationNode,
) -> dict[str, Any]:
    """Convert a protobuf ``RelationNode`` into the dict layout used by the API models.

    The protobuf ``subtype`` is exposed under the ``group`` key. Raises
    ``KeyError`` if the node type is missing from ``RelationNodeTypePbMap``.
    """
    node_type = RelationNodeTypePbMap[relation_node.ntype]
    api_node: dict[str, Any] = {
        "type": node_type,
        "value": relation_node.value,
        "group": relation_node.subtype,
    }
    return api_node
|
179
|
+
|
180
|
+
|
181
|
+
def relation_metadata(message: utils_pb2.RelationMetadata) -> RelationMetadata:
    """Build the API ``RelationMetadata`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return RelationMetadata(**as_dict)
|
189
|
+
|
190
|
+
|
191
|
+
def relation(message: utils_pb2.Relation) -> Relation:
    """Build the API ``Relation`` model from its protobuf message."""
    return Relation(**convert_pb_relation_to_api(message))
|
194
|
+
|
195
|
+
|
196
|
+
def origin(message: resources_pb2.Origin) -> Origin:
    """Build the API ``Origin`` model from its protobuf message."""
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    # The protobuf field keeps the old spelling "colaborators" to avoid a
    # migration; the API model expects it under "collaborators".
    legacy_collaborators = as_dict.pop("colaborators", [])
    as_dict["collaborators"] = legacy_collaborators
    return Origin(**as_dict)
|
206
|
+
|
207
|
+
|
208
|
+
def extra(message: resources_pb2.Extra) -> Extra:
    """Build the API ``Extra`` model from its protobuf message.

    Unlike the sibling converters in this module, default-valued fields are
    left out of the serialized dict.
    """
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=False,
    )
    return Extra(**as_dict)
|
216
|
+
|
217
|
+
|
218
|
+
def metadata(message: resources_pb2.Metadata) -> Metadata:
    """Build the API ``Metadata`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return Metadata(**as_dict)
|
226
|
+
|
227
|
+
|
228
|
+
def field_question_answers(
    message: resources_pb2.FieldQuestionAnswers,
) -> FieldQuestionAnswers:
    """Build the API ``FieldQuestionAnswers`` model from its protobuf message."""
    return FieldQuestionAnswers(
        **MessageToDict(
            message,
            preserving_proto_field_name=True,
            including_default_value_fields=True,
        )
    )
|
237
|
+
|
238
|
+
|
239
|
+
def extracted_text(message: resources_pb2.ExtractedText) -> ExtractedText:
    """Build the API ``ExtractedText`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return ExtractedText(**as_dict)
|
247
|
+
|
248
|
+
|
249
|
+
def vector_object(message: resources_pb2.VectorObject) -> VectorObject:
    """Build the API ``VectorObject`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return VectorObject(**as_dict)
|
257
|
+
|
258
|
+
|
259
|
+
def large_computed_metadata(
    message: resources_pb2.LargeComputedMetadata,
) -> LargeComputedMetadata:
    """Build the API ``LargeComputedMetadata`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return LargeComputedMetadata(**as_dict)
|
269
|
+
|
270
|
+
|
271
|
+
def link_extracted_data(message: resources_pb2.LinkExtractedData) -> LinkExtractedData:
    """Build the API ``LinkExtractedData`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return LinkExtractedData(**as_dict)
|
279
|
+
|
280
|
+
|
281
|
+
def file_extracted_data(message: resources_pb2.FileExtractedData) -> FileExtractedData:
    """Build the API ``FileExtractedData`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return FileExtractedData(**as_dict)
|
289
|
+
|
290
|
+
|
291
|
+
def shorten_fieldmetadata(
    message: resources_pb2.FieldComputedMetadata,
) -> None:
    """Clear the large sub-fields of a ``FieldComputedMetadata`` message in place.

    The main metadata is cleared first, then every split metadata entry.
    The message passed in is modified; nothing is returned.
    """
    large_fields = ("ner", "relations", "positions", "classifications", "entities")
    targets = [message.metadata, *message.split_metadata.values()]
    for field_metadata in targets:
        for field_name in large_fields:
            field_metadata.ClearField(field_name)  # type: ignore
|
300
|
+
|
301
|
+
|
302
|
+
def field_computed_metadata(
    message: resources_pb2.FieldComputedMetadata, shortened: bool = False
) -> FieldComputedMetadata:
    """Build the API ``FieldComputedMetadata`` model from its protobuf message.

    When ``shortened`` is true the large sub-fields are cleared first via
    ``shorten_fieldmetadata``. That clearing happens on ``message`` itself,
    so the caller's object is modified.
    """
    if shortened:
        shorten_fieldmetadata(message)

    # Convert the metadata entries with the dedicated helper before the
    # generic serialization, then substitute them into the result.
    main_metadata = convert_fieldmetadata_pb_to_dict(message.metadata)
    metadata_by_split = {
        split_id: convert_fieldmetadata_pb_to_dict(split_message)
        for split_id, split_message in message.split_metadata.items()
    }
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    as_dict.update(metadata=main_metadata, split_metadata=metadata_by_split)
    return FieldComputedMetadata(**as_dict)
|
320
|
+
|
321
|
+
|
322
|
+
def convert_fieldmetadata_pb_to_dict(
    message: resources_pb2.FieldMetadata,
) -> dict[str, Any]:
    """Serialize a protobuf ``FieldMetadata`` into a dict for the API models.

    Relations are flattened into a single list of API-shaped relation dicts.
    The backwards-compatibility step below may write into ``message.ner`` and
    ``message.positions``, so the message passed in can be modified.
    """
    # Backwards compatibility with old entities format
    # TODO: Remove once deprecated fields are removed
    # If we received processor entities in the new field and the old fields are empty,
    # we copy them to the old fields
    old_fields_empty = len(message.positions) == 0 and len(message.ner) == 0
    if "processor" in message.entities and old_fields_empty:
        processor_entities = message.entities["processor"].entities
        for ent in processor_entities:
            message.ner[ent.text] = ent.label
        for ent in processor_entities:
            # Indexing the map creates the entry if it does not exist yet.
            entity_positions = message.positions[ent.label + "/" + ent.text]
            entity_positions.entity = ent.text
            for position in ent.positions:
                entity_positions.position.add(start=position.start, end=position.end)

    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    flattened_relations = []
    for relations_group in message.relations:
        for pb_relation in relations_group.relations:
            flattened_relations.append(convert_pb_relation_to_api(pb_relation))
    as_dict["relations"] = flattened_relations
    return as_dict
|
351
|
+
|
352
|
+
|
353
|
+
def conversation(message: resources_pb2.Conversation) -> Conversation:
    """Build the API ``Conversation`` model from its protobuf message.

    The ``field_type`` of every attachment field is lower-cased before the
    model is constructed.
    """
    payload = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    for conversation_message in payload.get("messages", []):
        content = conversation_message.get("content", {})
        for attachment in content.get("attachments_fields", []):
            attachment["field_type"] = attachment["field_type"].lower()
    return Conversation(**payload)
|
363
|
+
|
364
|
+
|
365
|
+
def field_conversation(message: resources_pb2.FieldConversation) -> FieldConversation:
    """Build the API ``FieldConversation`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return FieldConversation(**as_dict)
|
373
|
+
|
374
|
+
|
375
|
+
def entity(message: knowledgebox_pb2.Entity) -> Entity:
    """Build the API ``Entity`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return Entity(**as_dict)
|
383
|
+
|
384
|
+
|
385
|
+
def entities_group(
    message: knowledgebox_pb2.EntitiesGroup,
) -> EntitiesGroup:
    """Build the API ``EntitiesGroup`` model from its protobuf message.

    Entities flagged as ``deleted`` are left out of the result.
    """
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    # Replace the serialized entities with converted models, skipping deleted ones.
    as_dict["entities"] = {
        name: entity(pb_entity) for name, pb_entity in message.entities.items() if not pb_entity.deleted
    }
    return EntitiesGroup(**as_dict)
|
400
|
+
|
401
|
+
|
402
|
+
def entities_group_summary(
    message: knowledgebox_pb2.EntitiesGroupSummary,
) -> EntitiesGroupSummary:
    """Build the API ``EntitiesGroupSummary`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return EntitiesGroupSummary(**as_dict)
|
412
|
+
|
413
|
+
|
414
|
+
def field_file(message: resources_pb2.FieldFile) -> FieldFile:
    """Build the API ``FieldFile`` model from its protobuf message.

    After construction, ``external`` is set to whether the file's source is
    ``CloudFile.Source.EXTERNAL``.
    """
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    api_file = FieldFile(**as_dict)
    is_external = message.file.source == resources_pb2.CloudFile.Source.EXTERNAL
    api_file.external = is_external  # type: ignore
    return api_file
|
426
|
+
|
427
|
+
|
428
|
+
def field_link(message: resources_pb2.FieldLink) -> FieldLink:
    """Build the API ``FieldLink`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return FieldLink(**as_dict)
|
436
|
+
|
437
|
+
|
438
|
+
def field_text(message: resources_pb2.FieldText) -> FieldText:
    """Build the API ``FieldText`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return FieldText(**as_dict)
|
446
|
+
|
447
|
+
|
448
|
+
def knowledgebox_config(message: knowledgebox_pb2.KnowledgeBoxConfig) -> KnowledgeBoxConfig:
    """Build the API ``KnowledgeBoxConfig`` model from its protobuf message.

    The raw ``external_index_provider`` entry is always removed from the
    result. When it is non-empty, only its lower-cased type is exposed, under
    ``configured_external_index_provider``.
    """
    config = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    # Calculate external index provider metadata
    # that is shown on read requests
    external_index_provider = config.pop("external_index_provider", None)
    if external_index_provider:
        provider_type = external_index_provider["type"].lower()
        config["configured_external_index_provider"] = {"type": provider_type}
    return KnowledgeBoxConfig(**config)
|
460
|
+
|
461
|
+
|
462
|
+
def kb_synonyms(message: knowledgebox_pb2.Synonyms) -> KnowledgeBoxSynonyms:
    """Build the API ``KnowledgeBoxSynonyms`` model from its protobuf message.

    Each term maps to a plain list copy of its synonyms.
    """
    synonyms_by_term = {}
    for term, term_synonyms in message.terms.items():
        synonyms_by_term[term] = list(term_synonyms.synonyms)
    return KnowledgeBoxSynonyms(synonyms=synonyms_by_term)
|
470
|
+
|
471
|
+
|
472
|
+
def kb_shards(message: writer_pb2.Shards) -> KnowledgeboxShards:
    """Build the ``KnowledgeboxShards`` model from its protobuf message."""
    # Keep proto field names and include defaults in the serialized dict.
    as_dict = MessageToDict(
        message,
        preserving_proto_field_name=True,
        including_default_value_fields=True,
    )
    return KnowledgeboxShards(**as_dict)
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from nucliadb_models.common import FieldID, FieldTypeName
|
22
|
+
from nucliadb_models.search import FeedbackTasks, NucliaDBClientType
|
23
|
+
from nucliadb_models.synonyms import KnowledgeBoxSynonyms
|
24
|
+
from nucliadb_protos import knowledgebox_pb2, resources_pb2
|
25
|
+
from nucliadb_protos.audit_pb2 import ClientType, TaskType
|
26
|
+
|
27
|
+
|
28
|
+
def client_type(obj: NucliaDBClientType) -> ClientType.ValueType:
    """Return the protobuf ``ClientType`` value whose name matches ``obj.name``."""
    enum_name = obj.name
    return ClientType.Value(enum_name)
|
30
|
+
|
31
|
+
|
32
|
+
def feedback_task(obj: FeedbackTasks) -> TaskType.ValueType:
    """Return the protobuf ``TaskType`` value whose name matches ``obj.name``."""
    enum_name = obj.name
    return TaskType.Value(enum_name)
|
34
|
+
|
35
|
+
|
36
|
+
def field_type_name(obj: FieldTypeName) -> resources_pb2.FieldType.ValueType:
    """Translate a ``FieldTypeName`` into the protobuf ``FieldType`` enum value.

    Raises ``KeyError`` if ``obj`` is not one of the values mapped below.
    """
    pb_types_by_name = {
        FieldTypeName.LINK: resources_pb2.FieldType.LINK,
        FieldTypeName.FILE: resources_pb2.FieldType.FILE,
        FieldTypeName.TEXT: resources_pb2.FieldType.TEXT,
        FieldTypeName.GENERIC: resources_pb2.FieldType.GENERIC,
        FieldTypeName.CONVERSATION: resources_pb2.FieldType.CONVERSATION,
    }
    return pb_types_by_name[obj]
|
44
|
+
|
45
|
+
|
46
|
+
def field_type(obj: FieldID.FieldType) -> resources_pb2.FieldType.ValueType:
    """Translate a ``FieldID.FieldType`` into the protobuf ``FieldType`` enum value.

    Raises ``KeyError`` if ``obj`` is not one of the values mapped below.
    """
    pb_types_by_api_type = {
        FieldID.FieldType.LINK: resources_pb2.FieldType.LINK,
        FieldID.FieldType.FILE: resources_pb2.FieldType.FILE,
        FieldID.FieldType.TEXT: resources_pb2.FieldType.TEXT,
        FieldID.FieldType.GENERIC: resources_pb2.FieldType.GENERIC,
        FieldID.FieldType.CONVERSATION: resources_pb2.FieldType.CONVERSATION,
    }
    return pb_types_by_api_type[obj]
|
54
|
+
|
55
|
+
|
56
|
+
def kb_synonyms(obj: KnowledgeBoxSynonyms) -> knowledgebox_pb2.Synonyms:
    """Build a protobuf ``Synonyms`` message from the API ``KnowledgeBoxSynonyms`` model."""
    pb_synonyms = knowledgebox_pb2.Synonyms()
    for term, synonyms_for_term in obj.synonyms.items():
        # Indexing the map field yields the entry for this term, which is
        # then filled with the term's synonyms.
        term_entry = pb_synonyms.terms[term]
        term_entry.synonyms.extend(synonyms_for_term)
    return pb_synonyms
|
nucliadb/ingest/orm/brain.py
CHANGED
@@ -100,6 +100,8 @@ class ResourceBrain:
|
|
100
100
|
page_positions: Optional[FilePagePositions],
|
101
101
|
extracted_text: Optional[ExtractedText],
|
102
102
|
basic_user_field_metadata: Optional[UserFieldMetadata] = None,
|
103
|
+
*,
|
104
|
+
replace_field: bool = False,
|
103
105
|
):
|
104
106
|
# To check for duplicate paragraphs
|
105
107
|
unique_paragraphs: set[str] = set()
|
@@ -224,6 +226,11 @@ class ResourceBrain:
|
|
224
226
|
|
225
227
|
self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
|
226
228
|
|
229
|
+
if replace_field:
|
230
|
+
field_type, field_name = field_key.split("/")
|
231
|
+
full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
|
232
|
+
self.brain.paragraphs_to_delete.append(full_field_id)
|
233
|
+
|
227
234
|
for relations in metadata.metadata.relations:
|
228
235
|
for relation in relations.relations:
|
229
236
|
self.brain.relations.append(relation)
|
@@ -301,8 +308,11 @@ class ResourceBrain:
|
|
301
308
|
|
302
309
|
if replace_field:
|
303
310
|
full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
|
304
|
-
|
305
|
-
|
311
|
+
if vectorset is None:
|
312
|
+
# DEPRECATED
|
313
|
+
self.brain.sentences_to_delete.append(full_field_id)
|
314
|
+
else:
|
315
|
+
self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
|
306
316
|
|
307
317
|
def _apply_field_vector(
|
308
318
|
self,
|
@@ -275,7 +275,6 @@ class Processor:
|
|
275
275
|
|
276
276
|
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
277
277
|
resource = await kb.get(uuid)
|
278
|
-
|
279
278
|
if resource is None:
|
280
279
|
# It's a new resource
|
281
280
|
resource = await kb.add_resource(uuid, message.slug, message.basic)
|
@@ -737,7 +736,11 @@ def has_vectors_operation(index_message: PBBrainResource) -> bool:
|
|
737
736
|
"""
|
738
737
|
Returns True if the index message has any vectors to index or to delete.
|
739
738
|
"""
|
740
|
-
if
|
739
|
+
if (
|
740
|
+
len(index_message.sentences_to_delete) > 0
|
741
|
+
or len(index_message.paragraphs_to_delete) > 0
|
742
|
+
or any([len(deletions.items) for deletions in index_message.vector_prefixes_to_delete.values()])
|
743
|
+
):
|
741
744
|
return True
|
742
745
|
for field_paragraphs in index_message.paragraphs.values():
|
743
746
|
for paragraph in field_paragraphs.paragraphs.values():
|
nucliadb/ingest/orm/resource.py
CHANGED
@@ -226,6 +226,7 @@ class Resource:
|
|
226
226
|
page_positions=page_positions,
|
227
227
|
extracted_text=await field_obj.get_extracted_text(),
|
228
228
|
basic_user_field_metadata=user_field_metadata,
|
229
|
+
replace_field=True,
|
229
230
|
)
|
230
231
|
|
231
232
|
# Some basic fields are computed off field metadata.
|
@@ -336,6 +337,7 @@ class Resource:
|
|
336
337
|
page_positions=page_positions,
|
337
338
|
extracted_text=await field.get_extracted_text(),
|
338
339
|
basic_user_field_metadata=user_field_metadata,
|
340
|
+
replace_field=reindex,
|
339
341
|
)
|
340
342
|
|
341
343
|
if self.disable_vectors is False:
|
@@ -584,6 +586,7 @@ class Resource:
|
|
584
586
|
# Upload to binary storage
|
585
587
|
# Vector indexing
|
586
588
|
if self.disable_vectors is False:
|
589
|
+
await self.get_fields(force=True)
|
587
590
|
for field_vectors in message.field_vectors:
|
588
591
|
await self._apply_extracted_vectors(field_vectors)
|
589
592
|
|
@@ -723,6 +726,7 @@ class Resource:
|
|
723
726
|
page_positions=page_positions,
|
724
727
|
extracted_text=extracted_text,
|
725
728
|
basic_user_field_metadata=user_field_metadata,
|
729
|
+
replace_field=True,
|
726
730
|
)
|
727
731
|
loop = asyncio.get_running_loop()
|
728
732
|
await loop.run_in_executor(_executor, apply_field_metadata)
|