nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic. Click here for more details.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
|
@@ -20,11 +20,12 @@
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
import asyncio
|
|
23
|
-
from typing import Optional
|
|
23
|
+
from typing import Optional, Sequence
|
|
24
24
|
|
|
25
25
|
from nidx_protos.noderesources_pb2 import Resource as IndexMessage
|
|
26
26
|
|
|
27
27
|
from nucliadb.common import datamanagers
|
|
28
|
+
from nucliadb.ingest.fields.conversation import Conversation
|
|
28
29
|
from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
|
|
29
30
|
from nucliadb.ingest.fields.file import File
|
|
30
31
|
from nucliadb.ingest.orm.brain_v2 import ResourceBrain
|
|
@@ -32,6 +33,7 @@ from nucliadb.ingest.orm.metrics import index_message_observer as observer
|
|
|
32
33
|
from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
|
|
33
34
|
from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
|
|
34
35
|
from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
|
|
36
|
+
from nucliadb_protos.utils_pb2 import ExtractedText
|
|
35
37
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
|
36
38
|
|
|
37
39
|
|
|
@@ -69,6 +71,7 @@ class IndexMessageBuilder:
|
|
|
69
71
|
relations: bool = True,
|
|
70
72
|
replace: bool = True,
|
|
71
73
|
vectorset_configs: Optional[list[VectorSetConfig]] = None,
|
|
74
|
+
append_splits: Optional[set[str]] = None,
|
|
72
75
|
):
|
|
73
76
|
field = await self.resource.get_field(fieldid.field, fieldid.field_type)
|
|
74
77
|
extracted_text = await field.get_extracted_text()
|
|
@@ -120,6 +123,7 @@ class IndexMessageBuilder:
|
|
|
120
123
|
replace_field=replace_paragraphs,
|
|
121
124
|
skip_paragraphs_index=skip_paragraphs_index,
|
|
122
125
|
skip_texts_index=skip_texts_index,
|
|
126
|
+
append_splits=append_splits,
|
|
123
127
|
)
|
|
124
128
|
if vectors:
|
|
125
129
|
assert vectorset_configs is not None
|
|
@@ -137,6 +141,7 @@ class IndexMessageBuilder:
|
|
|
137
141
|
vectorset=vectorset_config.vectorset_id,
|
|
138
142
|
replace_field=replace,
|
|
139
143
|
vector_dimension=dimension,
|
|
144
|
+
append_splits=append_splits,
|
|
140
145
|
)
|
|
141
146
|
if relations:
|
|
142
147
|
await asyncio.to_thread(
|
|
@@ -150,7 +155,7 @@ class IndexMessageBuilder:
|
|
|
150
155
|
def _apply_field_deletions(
|
|
151
156
|
self,
|
|
152
157
|
brain: ResourceBrain,
|
|
153
|
-
field_ids:
|
|
158
|
+
field_ids: Sequence[FieldID],
|
|
154
159
|
) -> None:
|
|
155
160
|
for field_id in field_ids:
|
|
156
161
|
brain.delete_field(self.resource.generate_field_id(field_id))
|
|
@@ -158,20 +163,19 @@ class IndexMessageBuilder:
|
|
|
158
163
|
@observer.wrap({"type": "writer_bm"})
|
|
159
164
|
async def for_writer_bm(
|
|
160
165
|
self,
|
|
161
|
-
|
|
166
|
+
message: BrokerMessage,
|
|
162
167
|
resource_created: bool,
|
|
163
168
|
) -> IndexMessage:
|
|
164
169
|
"""
|
|
165
|
-
Builds the index message for the broker
|
|
170
|
+
Builds the index message for the broker message coming from the writer.
|
|
166
171
|
The writer messages are not adding new vectors to the index.
|
|
167
172
|
"""
|
|
168
|
-
assert
|
|
173
|
+
assert message.source == BrokerMessage.MessageSource.WRITER
|
|
169
174
|
|
|
170
|
-
|
|
171
|
-
self._apply_field_deletions(self.brain, deleted_fields)
|
|
175
|
+
self._apply_field_deletions(self.brain, message.delete_fields)
|
|
172
176
|
await self._apply_resource_index_data(self.brain)
|
|
173
177
|
basic = await self.get_basic()
|
|
174
|
-
prefilter_update = needs_prefilter_update(
|
|
178
|
+
prefilter_update = needs_prefilter_update(message)
|
|
175
179
|
if prefilter_update:
|
|
176
180
|
# Changes on some metadata at the resource level that is used for filtering require that we reindex all the fields
|
|
177
181
|
# in the texts index (as it is the one used for prefiltering).
|
|
@@ -181,16 +185,16 @@ class IndexMessageBuilder:
|
|
|
181
185
|
]
|
|
182
186
|
else:
|
|
183
187
|
# Simply process the fields that are in the message
|
|
184
|
-
fields_to_index = get_bm_modified_fields(
|
|
188
|
+
fields_to_index = get_bm_modified_fields(message)
|
|
185
189
|
for fieldid in fields_to_index:
|
|
186
|
-
if fieldid in
|
|
190
|
+
if fieldid in message.delete_fields:
|
|
187
191
|
continue
|
|
188
192
|
await self._apply_field_index_data(
|
|
189
193
|
self.brain,
|
|
190
194
|
fieldid,
|
|
191
195
|
basic,
|
|
192
|
-
texts=prefilter_update or needs_texts_update(fieldid,
|
|
193
|
-
paragraphs=needs_paragraphs_update(fieldid,
|
|
196
|
+
texts=prefilter_update or needs_texts_update(fieldid, message),
|
|
197
|
+
paragraphs=needs_paragraphs_update(fieldid, message),
|
|
194
198
|
relations=False, # Relations at the field level are not modified by the writer
|
|
195
199
|
vectors=False, # Vectors are never added by the writer
|
|
196
200
|
replace=not resource_created,
|
|
@@ -200,32 +204,45 @@ class IndexMessageBuilder:
|
|
|
200
204
|
@observer.wrap({"type": "processor_bm"})
|
|
201
205
|
async def for_processor_bm(
|
|
202
206
|
self,
|
|
203
|
-
|
|
207
|
+
message: BrokerMessage,
|
|
204
208
|
) -> IndexMessage:
|
|
205
209
|
"""
|
|
206
210
|
Builds the index message for the broker messages coming from the processor.
|
|
207
211
|
The processor can index new data to any index.
|
|
208
212
|
"""
|
|
209
|
-
assert
|
|
210
|
-
|
|
211
|
-
self._apply_field_deletions(self.brain, deleted_fields)
|
|
213
|
+
assert message.source == BrokerMessage.MessageSource.PROCESSOR
|
|
214
|
+
self._apply_field_deletions(self.brain, message.delete_fields)
|
|
212
215
|
await self._apply_resource_index_data(self.brain)
|
|
213
216
|
basic = await self.get_basic()
|
|
214
|
-
fields_to_index = get_bm_modified_fields(
|
|
217
|
+
fields_to_index = get_bm_modified_fields(message)
|
|
215
218
|
vectorsets_configs = await self.get_vectorsets_configs()
|
|
216
219
|
for fieldid in fields_to_index:
|
|
217
|
-
if fieldid in
|
|
220
|
+
if fieldid in message.delete_fields:
|
|
218
221
|
continue
|
|
222
|
+
|
|
223
|
+
# For conversation fields, we only replace the full field if it is not an append messages operation.
|
|
224
|
+
# All other fields are always replaced upon modification.
|
|
225
|
+
replace_field = True
|
|
226
|
+
modified_splits = None
|
|
227
|
+
if fieldid.field_type == FieldType.CONVERSATION:
|
|
228
|
+
modified_splits = await get_bm_modified_split_ids(fieldid, message, self.resource)
|
|
229
|
+
stored_splits = await get_stored_split_ids(fieldid, self.resource)
|
|
230
|
+
is_append_messages_op = modified_splits.issubset(stored_splits) and 0 < len(
|
|
231
|
+
modified_splits
|
|
232
|
+
) < len(stored_splits)
|
|
233
|
+
replace_field = not is_append_messages_op
|
|
234
|
+
|
|
219
235
|
await self._apply_field_index_data(
|
|
220
236
|
self.brain,
|
|
221
237
|
fieldid,
|
|
222
238
|
basic,
|
|
223
|
-
texts=needs_texts_update(fieldid,
|
|
224
|
-
paragraphs=needs_paragraphs_update(fieldid,
|
|
225
|
-
relations=needs_relations_update(fieldid,
|
|
226
|
-
vectors=needs_vectors_update(fieldid,
|
|
227
|
-
replace=
|
|
239
|
+
texts=needs_texts_update(fieldid, message),
|
|
240
|
+
paragraphs=needs_paragraphs_update(fieldid, message),
|
|
241
|
+
relations=needs_relations_update(fieldid, message),
|
|
242
|
+
vectors=needs_vectors_update(fieldid, message),
|
|
243
|
+
replace=replace_field,
|
|
228
244
|
vectorset_configs=vectorsets_configs,
|
|
245
|
+
append_splits=modified_splits,
|
|
229
246
|
)
|
|
230
247
|
return self.brain.brain
|
|
231
248
|
|
|
@@ -270,130 +287,128 @@ class IndexMessageBuilder:
|
|
|
270
287
|
return vectorset_configs
|
|
271
288
|
|
|
272
289
|
|
|
273
|
-
def
|
|
274
|
-
messages: list[BrokerMessage],
|
|
275
|
-
) -> list[FieldID]:
|
|
276
|
-
deleted = []
|
|
277
|
-
for message in messages:
|
|
278
|
-
for field in message.delete_fields:
|
|
279
|
-
if field not in deleted:
|
|
280
|
-
deleted.append(field)
|
|
281
|
-
return deleted
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
def get_bm_modified_fields(messages: list[BrokerMessage]) -> list[FieldID]:
|
|
285
|
-
message_source = get_messages_source(messages)
|
|
290
|
+
def get_bm_modified_fields(message: BrokerMessage) -> list[FieldID]:
|
|
286
291
|
modified = set()
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
292
|
+
# Added or modified fields need indexing
|
|
293
|
+
for link in message.links:
|
|
294
|
+
modified.add((link, FieldType.LINK))
|
|
295
|
+
for file in message.files:
|
|
296
|
+
modified.add((file, FieldType.FILE))
|
|
297
|
+
for conv in message.conversations:
|
|
298
|
+
modified.add((conv, FieldType.CONVERSATION))
|
|
299
|
+
for text in message.texts:
|
|
300
|
+
modified.add((text, FieldType.TEXT))
|
|
301
|
+
if message.HasField("basic"):
|
|
302
|
+
# Add title and summary only if they have changed
|
|
303
|
+
if message.basic.title != "":
|
|
304
|
+
modified.add(("title", FieldType.GENERIC))
|
|
305
|
+
if message.basic.summary != "":
|
|
306
|
+
modified.add(("summary", FieldType.GENERIC))
|
|
307
|
+
|
|
308
|
+
if message.source == BrokerMessage.MessageSource.PROCESSOR:
|
|
309
|
+
# Messages with field metadata, extracted text or field vectors need indexing
|
|
310
|
+
for fm in message.field_metadata:
|
|
311
|
+
modified.add((fm.field.field, fm.field.field_type))
|
|
312
|
+
for et in message.extracted_text:
|
|
313
|
+
modified.add((et.field.field, et.field.field_type))
|
|
314
|
+
for fv in message.field_vectors:
|
|
315
|
+
modified.add((fv.field.field, fv.field.field_type))
|
|
316
|
+
|
|
317
|
+
if message.source == BrokerMessage.MessageSource.WRITER:
|
|
318
|
+
# Any field that has fieldmetadata annotations should be considered as modified
|
|
319
|
+
# and needs to be reindexed
|
|
297
320
|
if message.HasField("basic"):
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
modified.add(("title", FieldType.GENERIC))
|
|
301
|
-
if message.basic.summary != "":
|
|
302
|
-
modified.add(("summary", FieldType.GENERIC))
|
|
303
|
-
|
|
304
|
-
if message_source == BrokerMessage.MessageSource.PROCESSOR:
|
|
305
|
-
# Messages with field metadata, extracted text or field vectors need indexing
|
|
306
|
-
for fm in message.field_metadata:
|
|
307
|
-
modified.add((fm.field.field, fm.field.field_type))
|
|
308
|
-
for et in message.extracted_text:
|
|
309
|
-
modified.add((et.field.field, et.field.field_type))
|
|
310
|
-
for fv in message.field_vectors:
|
|
311
|
-
modified.add((fv.field.field, fv.field.field_type))
|
|
312
|
-
|
|
313
|
-
if message_source == BrokerMessage.MessageSource.WRITER:
|
|
314
|
-
# Any field that has fieldmetadata annotations should be considered as modified
|
|
315
|
-
# and needs to be reindexed
|
|
316
|
-
if message.HasField("basic"):
|
|
317
|
-
for ufm in message.basic.fieldmetadata:
|
|
318
|
-
modified.add((ufm.field.field, ufm.field.field_type))
|
|
321
|
+
for ufm in message.basic.fieldmetadata:
|
|
322
|
+
modified.add((ufm.field.field, ufm.field.field_type))
|
|
319
323
|
return [FieldID(field=field, field_type=field_type) for field, field_type in modified]
|
|
320
324
|
|
|
321
325
|
|
|
322
|
-
def
|
|
323
|
-
|
|
324
|
-
return messages[0].source
|
|
326
|
+
def needs_prefilter_update(message: BrokerMessage) -> bool:
|
|
327
|
+
return message.reindex
|
|
325
328
|
|
|
326
329
|
|
|
327
|
-
def
|
|
328
|
-
return any(message.reindex for message in messages)
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
def needs_paragraphs_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
|
|
330
|
+
def needs_paragraphs_update(field_id: FieldID, message: BrokerMessage) -> bool:
|
|
332
331
|
return (
|
|
333
|
-
has_paragraph_annotations(field_id,
|
|
334
|
-
or has_new_extracted_text(field_id,
|
|
335
|
-
or has_new_field_metadata(field_id,
|
|
332
|
+
has_paragraph_annotations(field_id, message)
|
|
333
|
+
or has_new_extracted_text(field_id, message)
|
|
334
|
+
or has_new_field_metadata(field_id, message)
|
|
336
335
|
)
|
|
337
336
|
|
|
338
337
|
|
|
339
|
-
def has_paragraph_annotations(field_id: FieldID,
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
if len(ufm.paragraphs) > 0:
|
|
348
|
-
return True
|
|
349
|
-
return False
|
|
338
|
+
def has_paragraph_annotations(field_id: FieldID, message: BrokerMessage) -> bool:
|
|
339
|
+
ufm = next(
|
|
340
|
+
(fm for fm in message.basic.fieldmetadata if fm.field == field_id),
|
|
341
|
+
None,
|
|
342
|
+
)
|
|
343
|
+
if ufm is None:
|
|
344
|
+
return False
|
|
345
|
+
return len(ufm.paragraphs) > 0
|
|
350
346
|
|
|
351
347
|
|
|
352
348
|
def has_new_field_metadata(
|
|
353
349
|
field_id: FieldID,
|
|
354
|
-
|
|
350
|
+
message: BrokerMessage,
|
|
355
351
|
) -> bool:
|
|
356
|
-
for
|
|
357
|
-
for field_metadata in message.field_metadata:
|
|
358
|
-
if field_metadata.field == field_id:
|
|
359
|
-
return True
|
|
360
|
-
return False
|
|
352
|
+
return any(field_metadata.field == field_id for field_metadata in message.field_metadata)
|
|
361
353
|
|
|
362
354
|
|
|
363
355
|
def has_new_extracted_text(
|
|
364
356
|
field_id: FieldID,
|
|
365
|
-
|
|
357
|
+
message: BrokerMessage,
|
|
366
358
|
) -> bool:
|
|
367
|
-
for
|
|
368
|
-
for extracted_text in message.extracted_text:
|
|
369
|
-
if extracted_text.field == field_id:
|
|
370
|
-
return True
|
|
371
|
-
return False
|
|
359
|
+
return any(extracted_text.field == field_id for extracted_text in message.extracted_text)
|
|
372
360
|
|
|
373
361
|
|
|
374
362
|
def needs_texts_update(
|
|
375
363
|
field_id: FieldID,
|
|
376
|
-
|
|
364
|
+
message: BrokerMessage,
|
|
377
365
|
) -> bool:
|
|
378
|
-
return has_new_extracted_text(field_id,
|
|
366
|
+
return has_new_extracted_text(field_id, message) or has_new_field_metadata(field_id, message)
|
|
379
367
|
|
|
380
368
|
|
|
381
369
|
def needs_vectors_update(
|
|
382
370
|
field_id: FieldID,
|
|
383
|
-
|
|
371
|
+
message: BrokerMessage,
|
|
384
372
|
) -> bool:
|
|
385
|
-
for
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
373
|
+
return any(field_vectors.field == field_id for field_vectors in message.field_vectors)
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
async def get_bm_modified_split_ids(
|
|
377
|
+
conversation_field_id: FieldID,
|
|
378
|
+
message: BrokerMessage,
|
|
379
|
+
resource: Resource,
|
|
380
|
+
) -> set[str]:
|
|
381
|
+
message_etw = next(
|
|
382
|
+
(etw for etw in message.extracted_text if etw.field == conversation_field_id), None
|
|
383
|
+
)
|
|
384
|
+
if message_etw is None:
|
|
385
|
+
return set()
|
|
386
|
+
storage = resource.storage
|
|
387
|
+
if message_etw.HasField("file"):
|
|
388
|
+
raw_payload = await storage.downloadbytescf(message_etw.file)
|
|
389
|
+
message_extracted_text = ExtractedText()
|
|
390
|
+
message_extracted_text.ParseFromString(raw_payload.read())
|
|
391
|
+
raw_payload.flush()
|
|
392
|
+
else:
|
|
393
|
+
message_extracted_text = message_etw.body
|
|
394
|
+
return set(message_extracted_text.split_text.keys())
|
|
395
|
+
|
|
396
|
+
|
|
397
|
+
async def get_stored_split_ids(
|
|
398
|
+
conversation_field_id: FieldID,
|
|
399
|
+
resource: Resource,
|
|
400
|
+
) -> set[str]:
|
|
401
|
+
fid = conversation_field_id
|
|
402
|
+
conv: Conversation = await resource.get_field(fid.field, fid.field_type, load=False)
|
|
403
|
+
splits_metadata = await conv.get_splits_metadata()
|
|
404
|
+
return set(splits_metadata.metadata)
|
|
390
405
|
|
|
391
406
|
|
|
392
407
|
def needs_relations_update(
|
|
393
408
|
field_id: FieldID,
|
|
394
|
-
|
|
409
|
+
message: BrokerMessage,
|
|
395
410
|
) -> bool:
|
|
396
|
-
return has_new_field_metadata(field_id,
|
|
411
|
+
return has_new_field_metadata(field_id, message) or has_new_extracted_text(field_id, message)
|
|
397
412
|
|
|
398
413
|
|
|
399
414
|
async def get_resource_index_message(
|
|
@@ -24,7 +24,7 @@ from uuid import uuid4
|
|
|
24
24
|
|
|
25
25
|
from grpc import StatusCode
|
|
26
26
|
from grpc.aio import AioRpcError
|
|
27
|
-
from nidx_protos import noderesources_pb2
|
|
27
|
+
from nidx_protos import nidx_pb2, noderesources_pb2
|
|
28
28
|
|
|
29
29
|
from nucliadb.common import datamanagers
|
|
30
30
|
from nucliadb.common.cluster.exceptions import ShardNotFound
|
|
@@ -36,7 +36,6 @@ from nucliadb.common.datamanagers.resources import (
|
|
|
36
36
|
KB_RESOURCE_SLUG_BASE,
|
|
37
37
|
)
|
|
38
38
|
from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
|
|
39
|
-
from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
|
|
40
39
|
from nucliadb.common.maindb.driver import Driver, Transaction
|
|
41
40
|
from nucliadb.common.maindb.pg import PGTransaction
|
|
42
41
|
from nucliadb.common.nidx import get_nidx_api_client
|
|
@@ -53,7 +52,6 @@ from nucliadb.migrator.utils import get_latest_version
|
|
|
53
52
|
from nucliadb_protos import knowledgebox_pb2, writer_pb2
|
|
54
53
|
from nucliadb_protos.knowledgebox_pb2 import (
|
|
55
54
|
CreateExternalIndexProviderMetadata,
|
|
56
|
-
ExternalIndexProviderType,
|
|
57
55
|
KnowledgeBoxConfig,
|
|
58
56
|
SemanticModelMetadata,
|
|
59
57
|
StoredExternalIndexProviderMetadata,
|
|
@@ -110,6 +108,7 @@ class KnowledgeBox:
|
|
|
110
108
|
external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
|
|
111
109
|
hidden_resources_enabled: bool = False,
|
|
112
110
|
hidden_resources_hide_on_creation: bool = False,
|
|
111
|
+
prewarm_enabled: bool = False,
|
|
113
112
|
) -> tuple[str, str]:
|
|
114
113
|
"""Creates a new knowledge box and return its id and slug."""
|
|
115
114
|
|
|
@@ -127,7 +126,7 @@ class KnowledgeBox:
|
|
|
127
126
|
rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
|
|
128
127
|
|
|
129
128
|
try:
|
|
130
|
-
async with driver.
|
|
129
|
+
async with driver.rw_transaction() as txn:
|
|
131
130
|
exists = await datamanagers.kb.get_kb_uuid(
|
|
132
131
|
txn, slug=slug
|
|
133
132
|
) or await datamanagers.kb.exists_kb(txn, kbid=kbid)
|
|
@@ -196,6 +195,7 @@ class KnowledgeBox:
|
|
|
196
195
|
migration_version=get_latest_version(),
|
|
197
196
|
hidden_resources_enabled=hidden_resources_enabled,
|
|
198
197
|
hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
|
|
198
|
+
prewarm_enabled=prewarm_enabled,
|
|
199
199
|
)
|
|
200
200
|
config.external_index_provider.CopyFrom(stored_external_index_provider)
|
|
201
201
|
await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
|
|
@@ -222,7 +222,7 @@ class KnowledgeBox:
|
|
|
222
222
|
shard_manager = get_shard_manager()
|
|
223
223
|
# XXX creating a shard is a slow IO operation that requires a write
|
|
224
224
|
# txn to be open!
|
|
225
|
-
await shard_manager.create_shard_by_kbid(txn, kbid)
|
|
225
|
+
await shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm_enabled)
|
|
226
226
|
# shards don't need a rollback as they will be eventually purged
|
|
227
227
|
|
|
228
228
|
await txn.commit()
|
|
@@ -245,43 +245,90 @@ class KnowledgeBox:
|
|
|
245
245
|
@classmethod
|
|
246
246
|
async def update(
|
|
247
247
|
cls,
|
|
248
|
-
|
|
249
|
-
|
|
248
|
+
driver: Driver,
|
|
249
|
+
kbid: str,
|
|
250
|
+
*,
|
|
250
251
|
slug: Optional[str] = None,
|
|
251
|
-
|
|
252
|
+
title: Optional[str] = None,
|
|
253
|
+
description: Optional[str] = None,
|
|
254
|
+
migration_version: Optional[int] = None,
|
|
255
|
+
external_index_provider: Optional[StoredExternalIndexProviderMetadata] = None,
|
|
256
|
+
hidden_resources_enabled: Optional[bool] = None,
|
|
257
|
+
hidden_resources_hide_on_creation: Optional[bool] = None,
|
|
258
|
+
prewarm_enabled: Optional[bool] = None,
|
|
252
259
|
) -> str:
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
else:
|
|
266
|
-
exist.slug = slug
|
|
260
|
+
async with driver.rw_transaction() as txn:
|
|
261
|
+
stored = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
|
|
262
|
+
if not stored:
|
|
263
|
+
raise datamanagers.exceptions.KnowledgeBoxNotFound()
|
|
264
|
+
|
|
265
|
+
if slug:
|
|
266
|
+
await txn.delete(datamanagers.kb.KB_SLUGS.format(slug=stored.slug))
|
|
267
|
+
await txn.set(
|
|
268
|
+
datamanagers.kb.KB_SLUGS.format(slug=slug),
|
|
269
|
+
kbid.encode(),
|
|
270
|
+
)
|
|
271
|
+
stored.slug = slug
|
|
267
272
|
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
273
|
+
if title is not None:
|
|
274
|
+
stored.title = title
|
|
275
|
+
if description is not None:
|
|
276
|
+
stored.description = description
|
|
272
277
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
278
|
+
if migration_version is not None:
|
|
279
|
+
stored.migration_version = migration_version
|
|
280
|
+
|
|
281
|
+
if external_index_provider is not None:
|
|
282
|
+
stored.external_index_provider.MergeFrom(external_index_provider)
|
|
283
|
+
|
|
284
|
+
if hidden_resources_enabled is not None:
|
|
285
|
+
stored.hidden_resources_enabled = hidden_resources_enabled
|
|
286
|
+
if hidden_resources_hide_on_creation is not None:
|
|
287
|
+
stored.hidden_resources_hide_on_creation = hidden_resources_hide_on_creation
|
|
288
|
+
|
|
289
|
+
update_nidx_prewarm = None
|
|
290
|
+
if prewarm_enabled is not None:
|
|
291
|
+
if stored.prewarm_enabled != prewarm_enabled:
|
|
292
|
+
update_nidx_prewarm = prewarm_enabled
|
|
293
|
+
stored.prewarm_enabled = prewarm_enabled
|
|
294
|
+
|
|
295
|
+
if stored.hidden_resources_hide_on_creation and not stored.hidden_resources_enabled:
|
|
296
|
+
raise KnowledgeBoxCreationError(
|
|
297
|
+
"Cannot hide new resources if the hidden resources feature is disabled"
|
|
298
|
+
)
|
|
277
299
|
|
|
278
|
-
|
|
300
|
+
await datamanagers.kb.set_config(txn, kbid=kbid, config=stored)
|
|
279
301
|
|
|
280
|
-
|
|
302
|
+
await txn.commit()
|
|
303
|
+
|
|
304
|
+
if update_nidx_prewarm is not None:
|
|
305
|
+
await cls.configure_shards(driver, kbid, prewarm=update_nidx_prewarm)
|
|
306
|
+
|
|
307
|
+
return kbid
|
|
308
|
+
|
|
309
|
+
@classmethod
|
|
310
|
+
async def configure_shards(cls, driver: Driver, kbid: str, *, prewarm: bool):
|
|
311
|
+
shards_obj = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid)
|
|
312
|
+
if shards_obj is None:
|
|
313
|
+
logger.warning(f"Shards not found for KB while updating pre-warm flag", extra={"kbid": kbid})
|
|
314
|
+
return
|
|
315
|
+
|
|
316
|
+
nidx_shard_ids = [shard.nidx_shard_id for shard in shards_obj.shards]
|
|
317
|
+
|
|
318
|
+
nidx_api = get_nidx_api_client()
|
|
319
|
+
if nidx_api is not None and len(nidx_shard_ids) > 0:
|
|
320
|
+
configs = [
|
|
321
|
+
nidx_pb2.ShardConfig(
|
|
322
|
+
shard_id=shard_id,
|
|
323
|
+
prewarm_enabled=prewarm,
|
|
324
|
+
)
|
|
325
|
+
for shard_id in nidx_shard_ids
|
|
326
|
+
]
|
|
327
|
+
await nidx_api.ConfigureShards(nidx_pb2.ShardsConfig(configs=configs))
|
|
281
328
|
|
|
282
329
|
@classmethod
|
|
283
330
|
async def delete(cls, driver: Driver, kbid: str):
|
|
284
|
-
async with driver.
|
|
331
|
+
async with driver.rw_transaction() as txn:
|
|
285
332
|
exists = await datamanagers.kb.exists_kb(txn, kbid=kbid)
|
|
286
333
|
if not exists:
|
|
287
334
|
return
|
|
@@ -347,7 +394,7 @@ class KnowledgeBox:
|
|
|
347
394
|
|
|
348
395
|
nidx_api = get_nidx_api_client()
|
|
349
396
|
|
|
350
|
-
async with driver.
|
|
397
|
+
async with driver.rw_transaction() as txn:
|
|
351
398
|
storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
|
|
352
399
|
await txn.set(storage_to_delete, b"")
|
|
353
400
|
|
|
@@ -377,7 +424,7 @@ class KnowledgeBox:
|
|
|
377
424
|
@classmethod
|
|
378
425
|
async def delete_all_kb_keys(cls, driver: Driver, kbid: str, chunk_size: int = 1_000):
|
|
379
426
|
prefix = KB_KEYS.format(kbid=kbid)
|
|
380
|
-
async with driver.
|
|
427
|
+
async with driver.rw_transaction() as txn:
|
|
381
428
|
await txn.delete_by_prefix(prefix)
|
|
382
429
|
await txn.commit()
|
|
383
430
|
|
|
@@ -535,10 +582,7 @@ class KnowledgeBox:
|
|
|
535
582
|
request: CreateExternalIndexProviderMetadata,
|
|
536
583
|
indexes: list[VectorsetExternalIndex],
|
|
537
584
|
) -> StoredExternalIndexProviderMetadata:
|
|
538
|
-
|
|
539
|
-
return StoredExternalIndexProviderMetadata(type=request.type)
|
|
540
|
-
# Only pinecone is supported for now
|
|
541
|
-
return await PineconeIndexManager.create_indexes(kbid, request, indexes)
|
|
585
|
+
return StoredExternalIndexProviderMetadata(type=request.type)
|
|
542
586
|
|
|
543
587
|
@classmethod
|
|
544
588
|
async def _maybe_delete_external_indexes(
|
|
@@ -546,10 +590,7 @@ class KnowledgeBox:
|
|
|
546
590
|
kbid: str,
|
|
547
591
|
stored: StoredExternalIndexProviderMetadata,
|
|
548
592
|
) -> None:
|
|
549
|
-
|
|
550
|
-
return
|
|
551
|
-
# Only pinecone is supported for now
|
|
552
|
-
await PineconeIndexManager.delete_indexes(kbid, stored)
|
|
593
|
+
return
|
|
553
594
|
|
|
554
595
|
|
|
555
596
|
def chunker(seq: Sequence, size: int):
|
|
@@ -34,7 +34,7 @@ async def collect_audit_fields(
|
|
|
34
34
|
return []
|
|
35
35
|
|
|
36
36
|
audit_storage_fields: list[audit_pb2.AuditField] = []
|
|
37
|
-
async with driver.
|
|
37
|
+
async with driver.ro_transaction() as txn:
|
|
38
38
|
kb = KnowledgeBox(txn, storage, message.kbid)
|
|
39
39
|
resource = Resource(txn, storage, kb, message.uuid)
|
|
40
40
|
field_keys = await resource.get_fields_ids()
|