nucliadb 6.3.5.post3985__py3-none-any.whl → 6.3.5.post3995__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +3 -2
- nucliadb/common/cluster/rollover.py +3 -3
- nucliadb/common/cluster/utils.py +8 -4
- nucliadb/common/external_index_providers/pinecone.py +7 -44
- nucliadb/ingest/fields/exceptions.py +4 -0
- nucliadb/ingest/orm/brain_v2.py +782 -0
- nucliadb/ingest/orm/index_message.py +409 -0
- nucliadb/ingest/orm/metrics.py +1 -1
- nucliadb/ingest/orm/processor/data_augmentation.py +2 -2
- nucliadb/ingest/orm/processor/pgcatalog.py +3 -2
- nucliadb/ingest/orm/processor/processor.py +61 -47
- nucliadb/ingest/orm/resource.py +70 -50
- nucliadb/ingest/orm/utils.py +1 -2
- nucliadb/ingest/processing.py +2 -54
- nucliadb/ingest/service/writer.py +2 -2
- nucliadb/models/internal/__init__.py +19 -0
- nucliadb/models/internal/processing.py +160 -0
- nucliadb/writer/api/v1/field.py +1 -1
- nucliadb/writer/api/v1/resource.py +2 -1
- nucliadb/writer/api/v1/upload.py +1 -1
- nucliadb/writer/resource/basic.py +2 -3
- nucliadb/writer/resource/field.py +13 -14
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/METADATA +6 -6
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/RECORD +27 -23
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/WHEEL +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.3.5.post3985.dist-info → nucliadb-6.3.5.post3995.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,409 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
|
22
|
+
import asyncio
|
23
|
+
from typing import Optional
|
24
|
+
|
25
|
+
from nucliadb.common import datamanagers
|
26
|
+
from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
|
27
|
+
from nucliadb.ingest.fields.file import File
|
28
|
+
from nucliadb.ingest.orm.brain_v2 import ResourceBrainV2 as ResourceBrain
|
29
|
+
from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
|
30
|
+
from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
|
31
|
+
from nucliadb_protos.noderesources_pb2 import Resource as IndexMessage
|
32
|
+
from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
|
33
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage
|
34
|
+
from nucliadb_utils import const
|
35
|
+
from nucliadb_utils.utilities import has_feature
|
36
|
+
|
37
|
+
|
38
|
+
class IndexMessageBuilder:
    """
    Builds the index message (``IndexMessage``) of a single resource.

    The builder owns a ``ResourceBrain`` created from the resource uuid. The three
    public entry points (``for_writer_bm``, ``for_processor_bm`` and ``full``) fill
    that brain and all return ``self.brain.brain``.
    """

    def __init__(self, resource: Resource):
        self.resource = resource
        # Every entry point of this builder writes into this same brain object.
        self.brain = ResourceBrain(resource.uuid)

    async def _apply_resource_index_data(self, brain: ResourceBrain) -> None:
        """
        Load the resource-level data (basic, user relations, origin and security)
        and pass it to ``brain.generate_resource_indexing_metadata`` together with
        the resource's ``_previous_status``.
        """
        # Set the metadata at the resource level
        basic = await self.resource.get_basic()
        assert basic is not None
        user_relations = await self.resource.get_user_relations()
        origin = await self.resource.get_origin()
        security = await self.resource.get_security()
        # The brain method is executed in a separate thread through asyncio.to_thread
        await asyncio.to_thread(
            brain.generate_resource_indexing_metadata,
            basic,
            user_relations,
            origin,
            self.resource._previous_status,
            security,
        )

    async def _apply_field_index_data(
        self,
        brain: ResourceBrain,
        fieldid: FieldID,
        basic: Basic,
        texts: bool = True,
        paragraphs: bool = True,
        vectors: bool = True,
        relations: bool = True,
        replace: bool = True,
        vectorset_configs: Optional[list[VectorSetConfig]] = None,
    ) -> None:
        """
        Generate the index data of one field into ``brain``.

        ``texts``, ``paragraphs``, ``vectors`` and ``relations`` select which parts
        are generated. ``replace`` is forwarded as the ``replace_field`` argument of
        the brain calls (for texts and paragraphs only when that part is not being
        skipped). ``vectorset_configs`` must be provided when ``vectors`` is True.
        """
        field = await self.resource.get_field(fieldid.field, fieldid.field_type)
        extracted_text = await field.get_extracted_text()
        field_computed_metadata = await field.get_field_metadata()
        # User-provided metadata for this field, taken from basic (None if there is no entry)
        user_field_metadata = next(
            (fm for fm in basic.fieldmetadata if fm.field == fieldid),
            None,
        )
        if texts or paragraphs:
            # We need to compute the texts when we're going to generate the paragraphs too, but we may not
            # want to index them always.
            skip_index_texts = not texts
            replace_texts = replace and not skip_index_texts

            if extracted_text is not None:
                try:
                    field_author = await field.generated_by()
                except FieldAuthorNotFound:
                    # A missing author is not an error here: it is simply passed as None
                    field_author = None
                await asyncio.to_thread(
                    brain.generate_texts_index_message,
                    self.resource.generate_field_id(fieldid),
                    extracted_text,
                    field_computed_metadata,
                    basic.usermetadata,
                    field_author,
                    replace_field=replace_texts,
                    skip_index=skip_index_texts,
                )
        if paragraphs or vectors:
            # The paragraphs are needed to generate the vectors. However, we don't need to index them
            # in all cases.
            skip_index_paragraphs = not paragraphs
            replace_paragraphs = replace and not skip_index_paragraphs

            # We need to compute the paragraphs when we're going to generate the vectors too.
            if extracted_text is not None and field_computed_metadata is not None:
                # Page positions are only looked up for File fields
                page_positions = (
                    await get_file_page_positions(field) if isinstance(field, File) else None
                )
                await asyncio.to_thread(
                    brain.generate_paragraphs_index_message,
                    self.resource.generate_field_id(fieldid),
                    field_computed_metadata,
                    extracted_text,
                    page_positions,
                    user_field_metadata,
                    replace_field=replace_paragraphs,
                    skip_index=skip_index_paragraphs,
                )
        if vectors:
            assert vectorset_configs is not None
            # One lookup (and, if vectors exist, one brain call) per configured vectorset
            for vectorset_config in vectorset_configs:
                vo = await field.get_vectors(
                    vectorset=vectorset_config.vectorset_id,
                    storage_key_kind=vectorset_config.storage_key_kind,
                )
                if vo is not None:
                    dimension = vectorset_config.vectorset_index_config.vector_dimension
                    await asyncio.to_thread(
                        brain.generate_vectors_index_message,
                        self.resource.generate_field_id(fieldid),
                        vo,
                        vectorset=vectorset_config.vectorset_id,
                        replace_field=replace,
                        vector_dimension=dimension,
                    )
        if relations:
            await asyncio.to_thread(
                brain.generate_relations_index_message,
                self.resource.generate_field_id(fieldid),
                field_computed_metadata,
                basic.usermetadata,
                replace_field=replace,
            )

    def _apply_field_deletions(
        self,
        brain: ResourceBrain,
        field_ids: list[FieldID],
    ) -> None:
        """
        Call ``brain.delete_field`` for each of the given field ids.
        """
        for field_id in field_ids:
            brain.delete_field(self.resource.generate_field_id(field_id))

    async def for_writer_bm(
        self,
        messages: list[BrokerMessage],
        resource_created: bool,
    ) -> IndexMessage:
        """
        Builds the index message for the broker messages coming from the writer.
        The writer messages are not adding new vectors to the index.

        All messages must have WRITER as source. ``resource_created`` is used to
        decide the ``replace`` flag: fields are replaced only when the resource was
        not just created.
        """
        assert all(message.source == BrokerMessage.MessageSource.WRITER for message in messages)

        deleted_fields = get_bm_deleted_fields(messages)
        self._apply_field_deletions(self.brain, deleted_fields)
        await self._apply_resource_index_data(self.brain)
        basic = await self.get_basic()
        prefilter_update = needs_prefilter_update(messages)
        if prefilter_update:
            # Changes on some metadata at the resource level that is used for filtering require that we reindex all the fields
            # in the texts index (as it is the one used for prefiltering).
            fields_to_index = [
                FieldID(field=field_id, field_type=field_type)
                for field_type, field_id in await self.resource.get_fields(force=True)
            ]
        else:
            # Simply process the fields that are in the message
            fields_to_index = get_bm_modified_fields(messages)
        for fieldid in fields_to_index:
            # Fields deleted by these messages are not indexed again
            if fieldid in deleted_fields:
                continue
            await self._apply_field_index_data(
                self.brain,
                fieldid,
                basic,
                texts=prefilter_update or needs_texts_update(fieldid, messages),
                paragraphs=needs_paragraphs_update(fieldid, messages),
                relations=False,  # Relations at the field level are not modified by the writer
                vectors=False,  # Vectors are never added by the writer
                replace=not resource_created,
            )
        return self.brain.brain

    async def for_processor_bm(
        self,
        messages: list[BrokerMessage],
    ) -> IndexMessage:
        """
        Builds the index message for the broker messages coming from the processor.
        The processor can index new data to any index.

        All messages must have PROCESSOR as source. Fields are always generated
        with ``replace=True``.
        """
        assert all(message.source == BrokerMessage.MessageSource.PROCESSOR for message in messages)
        deleted_fields = get_bm_deleted_fields(messages)
        self._apply_field_deletions(self.brain, deleted_fields)
        await self._apply_resource_index_data(self.brain)
        basic = await self.get_basic()
        fields_to_index = get_bm_modified_fields(messages)
        vectorsets_configs = await self.get_vectorsets_configs()
        for fieldid in fields_to_index:
            # Fields deleted by these messages are not indexed again
            if fieldid in deleted_fields:
                continue
            await self._apply_field_index_data(
                self.brain,
                fieldid,
                basic,
                texts=needs_texts_update(fieldid, messages),
                paragraphs=needs_paragraphs_update(fieldid, messages),
                relations=needs_relations_update(fieldid, messages),
                vectors=needs_vectors_update(fieldid, messages),
                replace=True,
                vectorset_configs=vectorsets_configs,
            )
        return self.brain.brain

    async def full(self, reindex: bool) -> IndexMessage:
        """
        Builds the index message with the resource-level data and every field of
        the resource (texts, paragraphs, relations and vectors).

        ``reindex`` is forwarded as the ``replace`` flag of each field.
        """
        await self._apply_resource_index_data(self.brain)
        basic = await self.get_basic()
        fields_to_index = [
            FieldID(field=field_id, field_type=field_type)
            for field_type, field_id in await self.resource.get_fields(force=True)
        ]
        vectorsets_configs = await self.get_vectorsets_configs()
        for fieldid in fields_to_index:
            await self._apply_field_index_data(
                self.brain,
                fieldid,
                basic,
                texts=True,
                paragraphs=True,
                relations=True,
                vectors=True,
                replace=reindex,
                vectorset_configs=vectorsets_configs,
            )
        return self.brain.brain

    async def get_basic(self) -> Basic:
        """
        Get the basic of the resource, asserting that it exists.
        """
        basic = await self.resource.get_basic()
        assert basic is not None
        return basic

    async def get_vectorsets_configs(self) -> list[VectorSetConfig]:
        """
        Get the vectorsets config for the resource.

        The configs are read with ``datamanagers.vectorsets.iter`` using the
        resource transaction and the kbid of the resource's knowledge box.
        """
        vectorset_configs = [
            vectorset_config
            async for _, vectorset_config in datamanagers.vectorsets.iter(
                self.resource.txn, kbid=self.resource.kb.kbid
            )
        ]
        return vectorset_configs
|
264
|
+
|
265
|
+
|
266
|
+
def get_bm_deleted_fields(messages: list[BrokerMessage]) -> list[FieldID]:
    """
    Collect the fields listed in ``delete_fields`` across all the messages.

    Each field appears once in the result, in order of first appearance.
    """
    unique_fields: list[FieldID] = []
    every_deleted = (field_id for bm in messages for field_id in bm.delete_fields)
    for field_id in every_deleted:
        # Membership is checked on the list itself, exactly like the result is built
        if field_id in unique_fields:
            continue
        unique_fields.append(field_id)
    return unique_fields
|
275
|
+
|
276
|
+
|
277
|
+
def get_bm_modified_fields(messages: list[BrokerMessage]) -> list[FieldID]:
    """
    Compute the fields that the given broker messages add or modify.

    All messages must share the same source. Returns one FieldID per distinct
    (field, field_type) pair found.
    """
    source = get_messages_source(messages)
    from_processor = source == BrokerMessage.MessageSource.PROCESSOR
    from_writer = source == BrokerMessage.MessageSource.WRITER
    touched = set()
    for message in messages:
        # Added or modified fields need indexing
        for field_ids, field_type in (
            (message.links, FieldType.LINK),
            (message.files, FieldType.FILE),
            (message.conversations, FieldType.CONVERSATION),
            (message.texts, FieldType.TEXT),
        ):
            for field_id in field_ids:
                touched.add((field_id, field_type))

        has_basic = message.HasField("basic")
        if has_basic:
            # Add title and summary only if they have changed
            if message.basic.title != "":
                touched.add(("title", FieldType.GENERIC))
            if message.basic.summary != "":
                touched.add(("summary", FieldType.GENERIC))

        if from_processor:
            # Messages with field metadata, extracted text or field vectors need indexing
            for extracted in (message.field_metadata, message.extracted_text, message.field_vectors):
                for item in extracted:
                    touched.add((item.field.field, item.field.field_type))

        if from_writer and has_basic:
            # Any field that has fieldmetadata annotations should be considered as modified
            # and needs to be reindexed
            for ufm in message.basic.fieldmetadata:
                touched.add((ufm.field.field, ufm.field.field_type))
    return [FieldID(field=name, field_type=ftype) for name, ftype in touched]
|
313
|
+
|
314
|
+
|
315
|
+
def get_messages_source(messages: list[BrokerMessage]) -> BrokerMessage.MessageSource.ValueType:
    """
    Return the source shared by all the messages.

    Asserts that the messages have exactly one distinct source.
    """
    distinct_sources = {bm.source for bm in messages}
    assert len(distinct_sources) == 1
    first = messages[0]
    return first.source
|
318
|
+
|
319
|
+
|
320
|
+
def needs_prefilter_update(messages: list[BrokerMessage]) -> bool:
    """
    Tell whether at least one of the messages has its ``reindex`` flag set.
    """
    for bm in messages:
        if bm.reindex:
            return True
    return False
|
322
|
+
|
323
|
+
|
324
|
+
def needs_paragraphs_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether the paragraphs of the field need to be generated: the messages
    carry paragraph annotations, new extracted text or new field metadata for it.
    """
    if has_paragraph_annotations(field_id, messages):
        return True
    if has_new_extracted_text(field_id, messages):
        return True
    return has_new_field_metadata(field_id, messages)
|
330
|
+
|
331
|
+
|
332
|
+
def has_paragraph_annotations(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether any message has a non-empty ``paragraphs`` list in the first
    ``basic.fieldmetadata`` entry that matches the field.
    """
    for bm in messages:
        for user_fm in bm.basic.fieldmetadata:
            if user_fm.field != field_id:
                continue
            # Only the first matching entry of each message is inspected
            if len(user_fm.paragraphs) > 0:
                return True
            break
    return False
|
343
|
+
|
344
|
+
|
345
|
+
def has_new_field_metadata(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether any message carries a ``field_metadata`` entry for the field.
    """
    return any(
        entry.field == field_id
        for bm in messages
        for entry in bm.field_metadata
    )
|
354
|
+
|
355
|
+
|
356
|
+
def has_new_extracted_text(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether any message carries an ``extracted_text`` entry for the field.
    """
    return any(
        entry.field == field_id
        for bm in messages
        for entry in bm.extracted_text
    )
|
365
|
+
|
366
|
+
|
367
|
+
def needs_texts_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether the texts of the field need to be generated: the messages carry
    new extracted text or new field metadata for it.
    """
    if has_new_extracted_text(field_id, messages):
        return True
    return has_new_field_metadata(field_id, messages)
|
372
|
+
|
373
|
+
|
374
|
+
def needs_vectors_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether any message carries a ``field_vectors`` entry for the field.
    """
    return any(
        entry.field == field_id
        for bm in messages
        for entry in bm.field_vectors
    )
|
383
|
+
|
384
|
+
|
385
|
+
def needs_relations_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
    """
    Tell whether the relations of the field need to be generated: the messages
    carry new field metadata or new extracted text for it.
    """
    if has_new_field_metadata(field_id, messages):
        return True
    return has_new_extracted_text(field_id, messages)
|
390
|
+
|
391
|
+
|
392
|
+
async def get_resource_index_message(resource: Resource, reindex: bool = False) -> IndexMessage:
    """
    Get the full index message for a resource.

    The INDEX_MESSAGE_GENERATION_V2 feature flag (evaluated for the resource's kbid)
    selects between IndexMessageBuilder and the resource's own generation method.
    """
    v2_enabled = has_feature(
        const.Features.INDEX_MESSAGE_GENERATION_V2,
        context={
            "kbid": resource.kb.kbid,
        },
    )
    if not v2_enabled:
        # TODO: remove this code when we remove the old index message generation
        legacy_brain = await resource.generate_index_message(reindex=reindex)
        return legacy_brain.brain
    return await IndexMessageBuilder(resource).full(reindex=reindex)
|
nucliadb/ingest/orm/metrics.py
CHANGED
@@ -23,8 +23,8 @@ from dataclasses import dataclass, field
|
|
23
23
|
from typing import Optional
|
24
24
|
|
25
25
|
from nucliadb.ingest.orm.resource import Resource
|
26
|
-
from nucliadb.ingest.processing import ProcessingEngine
|
27
|
-
from
|
26
|
+
from nucliadb.ingest.processing import ProcessingEngine
|
27
|
+
from nucliadb.models.internal.processing import PushPayload, PushTextFormat, Source, Text
|
28
28
|
from nucliadb_protos import resources_pb2, writer_pb2
|
29
29
|
from nucliadb_protos.resources_pb2 import FieldType
|
30
30
|
from nucliadb_utils.utilities import Utility, get_partitioning, get_utility
|
@@ -23,6 +23,7 @@ from typing import cast
|
|
23
23
|
from nucliadb.common.maindb.driver import Transaction
|
24
24
|
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
25
25
|
from nucliadb.common.maindb.utils import get_driver
|
26
|
+
from nucliadb_protos.noderesources_pb2 import Resource as IndexMessage
|
26
27
|
from nucliadb_telemetry import metrics
|
27
28
|
|
28
29
|
from ..resource import Resource
|
@@ -39,7 +40,7 @@ def pgcatalog_enabled(kbid):
|
|
39
40
|
|
40
41
|
|
41
42
|
@observer.wrap({"type": "update"})
|
42
|
-
async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource):
|
43
|
+
async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
|
43
44
|
if not pgcatalog_enabled(kbid):
|
44
45
|
return
|
45
46
|
|
@@ -69,7 +70,7 @@ async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource):
|
|
69
70
|
"title": resource.basic.title,
|
70
71
|
"created_at": created_at,
|
71
72
|
"modified_at": modified_at,
|
72
|
-
"labels": list(
|
73
|
+
"labels": list(index_message.labels),
|
73
74
|
},
|
74
75
|
)
|
75
76
|
|
@@ -38,6 +38,7 @@ from nucliadb.ingest.orm.exceptions import (
|
|
38
38
|
ResourceNotIndexable,
|
39
39
|
SequenceOrderViolation,
|
40
40
|
)
|
41
|
+
from nucliadb.ingest.orm.index_message import IndexMessageBuilder
|
41
42
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
42
43
|
from nucliadb.ingest.orm.metrics import processor_observer
|
43
44
|
from nucliadb.ingest.orm.processor import sequence_manager
|
@@ -312,25 +313,11 @@ class Processor:
|
|
312
313
|
await self.apply_resource(message, resource, update=(not created))
|
313
314
|
|
314
315
|
# index message
|
315
|
-
|
316
|
-
if resource:
|
317
|
-
if any(needs_reindex(m) for m in messages):
|
318
|
-
# when reindexing, let's just generate full new index message
|
319
|
-
# TODO - This should be improved in the future as it's not optimal for very large resources:
|
320
|
-
# As of now, there are some API operations that require fully reindexing all the fields of a resource.
|
321
|
-
# An example of this is classification label changes - we need to reindex all the fields of a resource to
|
322
|
-
# propagate the label changes to the index.
|
323
|
-
resource.replace_indexer(await resource.generate_index_message(reindex=True))
|
324
|
-
else:
|
325
|
-
# TODO - Ideally we should only update the fields that have been changed in the current transaction.
|
326
|
-
await resource.compute_global_text()
|
327
|
-
await resource.compute_global_tags(resource.indexer)
|
328
|
-
await resource.compute_security(resource.indexer)
|
329
|
-
|
330
316
|
if resource and resource.modified:
|
331
|
-
await
|
317
|
+
index_message = await self.generate_index_message(resource, messages, created)
|
318
|
+
await pgcatalog_update(txn, kbid, resource, index_message)
|
332
319
|
await self.index_resource( # noqa
|
333
|
-
|
320
|
+
index_message=index_message,
|
334
321
|
txn=txn,
|
335
322
|
uuid=uuid,
|
336
323
|
kbid=kbid,
|
@@ -451,7 +438,7 @@ class Processor:
|
|
451
438
|
@processor_observer.wrap({"type": "index_resource"})
|
452
439
|
async def index_resource(
|
453
440
|
self,
|
454
|
-
|
441
|
+
index_message: PBBrainResource,
|
455
442
|
txn: Transaction,
|
456
443
|
uuid: str,
|
457
444
|
kbid: str,
|
@@ -460,9 +447,8 @@ class Processor:
|
|
460
447
|
kb: KnowledgeBox,
|
461
448
|
source: nodewriter_pb2.IndexMessageSource.ValueType,
|
462
449
|
) -> None:
|
463
|
-
validate_indexable_resource(
|
450
|
+
validate_indexable_resource(index_message)
|
464
451
|
shard = await self.get_or_assign_resource_shard(txn, kb, uuid)
|
465
|
-
index_message = resource.indexer.brain
|
466
452
|
external_index_manager = await get_external_index_manager(kbid=kbid)
|
467
453
|
if external_index_manager is not None:
|
468
454
|
await self.external_index_add_resource(external_index_manager, uuid, index_message)
|
@@ -476,6 +462,56 @@ class Processor:
|
|
476
462
|
source=source,
|
477
463
|
)
|
478
464
|
|
465
|
+
async def generate_index_message_v2(
    self,
    resource: Resource,
    messages: list[writer_pb2.BrokerMessage],
    resource_created: bool,
) -> PBBrainResource:
    """
    Generate the index message of the resource with IndexMessageBuilder.

    The builder entry point is chosen from the source of the messages: writer
    messages go through ``for_writer_bm`` (which also receives ``resource_created``)
    and processor messages through ``for_processor_bm``. Any other source raises
    InvalidBrokerMessage.
    """
    builder = IndexMessageBuilder(resource)
    message_source = messages_source(messages)
    if message_source == nodewriter_pb2.IndexMessageSource.WRITER:
        # Each branch is wrapped in processor_observer, labelled with its source
        with processor_observer({"type": "generate_index_message", "source": "writer"}):
            return await builder.for_writer_bm(messages, resource_created)
    elif message_source == nodewriter_pb2.IndexMessageSource.PROCESSOR:
        with processor_observer({"type": "generate_index_message", "source": "processor"}):
            return await builder.for_processor_bm(messages)
    else:  # pragma: no cover
        raise InvalidBrokerMessage(f"Unknown broker message source: {message_source}")
|
481
|
+
|
482
|
+
async def generate_index_message_v1(
    self,
    resource: Resource,
    messages: list[writer_pb2.BrokerMessage],
) -> PBBrainResource:
    """
    Generate the index message of the resource through the resource's own indexer.

    If any message needs a reindex, the indexer is replaced by a freshly generated
    full index message; otherwise the global text, global tags and security are
    computed on the current indexer. Returns ``resource.indexer.brain``.
    """
    if any(needs_reindex(m) for m in messages):
        # when reindexing, let's just generate full new index message
        # TODO - This should be improved in the future as it's not optimal for very large resources:
        # As of now, there are some API operations that require fully reindexing all the fields of a resource.
        # An example of this is classification label changes - we need to reindex all the fields of a resource to
        # propagate the label changes to the index.
        resource.replace_indexer(await resource.generate_index_message(reindex=True))
    else:
        # TODO - Ideally we should only update the fields that have been changed in the current transaction.
        await resource.compute_global_text()
        await resource.compute_global_tags(resource.indexer)
        await resource.compute_security(resource.indexer)
    return resource.indexer.brain
|
500
|
+
|
501
|
+
async def generate_index_message(
    self,
    resource: Resource,
    messages: list[writer_pb2.BrokerMessage],
    resource_created: bool = False,
) -> PBBrainResource:
    """
    Generate the index message of the resource for the given broker messages.

    The INDEX_MESSAGE_GENERATION_V2 feature flag (evaluated for the resource's kbid)
    selects between the v2 and the v1 generation.
    """
    use_v2 = has_feature(
        const.Features.INDEX_MESSAGE_GENERATION_V2,
        context={"kbid": resource.kb.kbid},
    )
    if not use_v2:
        return await self.generate_index_message_v1(resource, messages)
    return await self.generate_index_message_v2(resource, messages, resource_created)
|
514
|
+
|
479
515
|
async def external_index_delete_resource(
|
480
516
|
self, external_index_manager: ExternalIndexManager, resource_uuid: str
|
481
517
|
):
|
@@ -564,7 +600,10 @@ class Processor:
|
|
564
600
|
resource: Resource,
|
565
601
|
update: bool = False,
|
566
602
|
):
|
567
|
-
"""
|
603
|
+
"""
|
604
|
+
Apply broker message to resource object in the persistence layers (maindb and storage).
|
605
|
+
DO NOT add any indexing logic here.
|
606
|
+
"""
|
568
607
|
if update:
|
569
608
|
await self.maybe_update_resource_basic(resource, message)
|
570
609
|
|
@@ -675,30 +714,9 @@ class Processor:
|
|
675
714
|
try:
|
676
715
|
async with self.driver.transaction() as txn:
|
677
716
|
kb.txn = resource.txn = txn
|
678
|
-
|
679
|
-
shard_id = await datamanagers.resources.get_resource_shard_id(
|
680
|
-
txn, kbid=kb.kbid, rid=resource.uuid
|
681
|
-
)
|
682
|
-
shard = None
|
683
|
-
if shard_id is not None:
|
684
|
-
shard = await kb.get_resource_shard(shard_id)
|
685
|
-
if shard is None:
|
686
|
-
logger.warning(
|
687
|
-
"Unable to mark resource as error, shard is None. "
|
688
|
-
"This should not happen so you did something special to get here."
|
689
|
-
)
|
690
|
-
return
|
691
|
-
|
692
717
|
resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
|
693
718
|
await resource.set_basic(resource.basic)
|
694
719
|
await txn.commit()
|
695
|
-
|
696
|
-
resource.indexer.set_processing_status(
|
697
|
-
basic=resource.basic, previous_status=resource._previous_status
|
698
|
-
)
|
699
|
-
await self.index_node_shard_manager.add_resource(
|
700
|
-
shard, resource.indexer.brain, seqid, partition=partition, kb=kb.kbid
|
701
|
-
)
|
702
720
|
except Exception:
|
703
721
|
logger.warning("Error while marking resource as error", exc_info=True)
|
704
722
|
|
@@ -745,11 +763,7 @@ def has_vectors_operation(index_message: PBBrainResource) -> bool:
|
|
745
763
|
"""
|
746
764
|
Returns True if the index message has any vectors to index or to delete.
|
747
765
|
"""
|
748
|
-
if (
|
749
|
-
len(index_message.sentences_to_delete) > 0
|
750
|
-
or len(index_message.paragraphs_to_delete) > 0
|
751
|
-
or any([len(deletions.items) for deletions in index_message.vector_prefixes_to_delete.values()])
|
752
|
-
):
|
766
|
+
if any([len(deletions.items) for deletions in index_message.vector_prefixes_to_delete.values()]):
|
753
767
|
return True
|
754
768
|
for field_paragraphs in index_message.paragraphs.values():
|
755
769
|
for paragraph in field_paragraphs.paragraphs.values():
|