nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nucliadb might be problematic.

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/ingest/orm/index_message.py
@@ -20,11 +20,12 @@
 
 
 import asyncio
-from typing import Optional
+from typing import Optional, Sequence
 
 from nidx_protos.noderesources_pb2 import Resource as IndexMessage
 
 from nucliadb.common import datamanagers
+from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.orm.brain_v2 import ResourceBrain
@@ -32,6 +33,7 @@ from nucliadb.ingest.orm.metrics import index_message_observer as observer
 from nucliadb.ingest.orm.resource import Resource, get_file_page_positions
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import Basic, FieldID, FieldType
+from nucliadb_protos.utils_pb2 import ExtractedText
 from nucliadb_protos.writer_pb2 import BrokerMessage
 
 
@@ -69,6 +71,7 @@ class IndexMessageBuilder:
         relations: bool = True,
         replace: bool = True,
         vectorset_configs: Optional[list[VectorSetConfig]] = None,
+        append_splits: Optional[set[str]] = None,
     ):
         field = await self.resource.get_field(fieldid.field, fieldid.field_type)
         extracted_text = await field.get_extracted_text()
@@ -120,6 +123,7 @@ class IndexMessageBuilder:
                 replace_field=replace_paragraphs,
                 skip_paragraphs_index=skip_paragraphs_index,
                 skip_texts_index=skip_texts_index,
+                append_splits=append_splits,
             )
         if vectors:
             assert vectorset_configs is not None
@@ -137,6 +141,7 @@ class IndexMessageBuilder:
                     vectorset=vectorset_config.vectorset_id,
                     replace_field=replace,
                     vector_dimension=dimension,
+                    append_splits=append_splits,
                 )
         if relations:
             await asyncio.to_thread(
@@ -150,7 +155,7 @@ class IndexMessageBuilder:
     def _apply_field_deletions(
         self,
         brain: ResourceBrain,
-        field_ids: list[FieldID],
+        field_ids: Sequence[FieldID],
     ) -> None:
         for field_id in field_ids:
             brain.delete_field(self.resource.generate_field_id(field_id))
@@ -158,20 +163,19 @@ class IndexMessageBuilder:
     @observer.wrap({"type": "writer_bm"})
     async def for_writer_bm(
         self,
-        messages: list[BrokerMessage],
+        message: BrokerMessage,
         resource_created: bool,
     ) -> IndexMessage:
         """
-        Builds the index message for the broker messages coming from the writer.
+        Builds the index message for the broker message coming from the writer.
         The writer messages are not adding new vectors to the index.
         """
-        assert all(message.source == BrokerMessage.MessageSource.WRITER for message in messages)
+        assert message.source == BrokerMessage.MessageSource.WRITER
 
-        deleted_fields = get_bm_deleted_fields(messages)
-        self._apply_field_deletions(self.brain, deleted_fields)
+        self._apply_field_deletions(self.brain, message.delete_fields)
         await self._apply_resource_index_data(self.brain)
         basic = await self.get_basic()
-        prefilter_update = needs_prefilter_update(messages)
+        prefilter_update = needs_prefilter_update(message)
         if prefilter_update:
             # Changes on some metadata at the resource level that is used for filtering require that we reindex all the fields
             # in the texts index (as it is the one used for prefiltering).
@@ -181,16 +185,16 @@ class IndexMessageBuilder:
             ]
         else:
             # Simply process the fields that are in the message
-            fields_to_index = get_bm_modified_fields(messages)
+            fields_to_index = get_bm_modified_fields(message)
         for fieldid in fields_to_index:
-            if fieldid in deleted_fields:
+            if fieldid in message.delete_fields:
                 continue
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
                 basic,
-                texts=prefilter_update or needs_texts_update(fieldid, messages),
-                paragraphs=needs_paragraphs_update(fieldid, messages),
+                texts=prefilter_update or needs_texts_update(fieldid, message),
+                paragraphs=needs_paragraphs_update(fieldid, message),
                 relations=False,  # Relations at the field level are not modified by the writer
                 vectors=False,  # Vectors are never added by the writer
                 replace=not resource_created,
@@ -200,32 +204,45 @@ class IndexMessageBuilder:
     @observer.wrap({"type": "processor_bm"})
     async def for_processor_bm(
         self,
-        messages: list[BrokerMessage],
+        message: BrokerMessage,
     ) -> IndexMessage:
         """
         Builds the index message for the broker messages coming from the processor.
         The processor can index new data to any index.
         """
-        assert all(message.source == BrokerMessage.MessageSource.PROCESSOR for message in messages)
-        deleted_fields = get_bm_deleted_fields(messages)
-        self._apply_field_deletions(self.brain, deleted_fields)
+        assert message.source == BrokerMessage.MessageSource.PROCESSOR
+        self._apply_field_deletions(self.brain, message.delete_fields)
         await self._apply_resource_index_data(self.brain)
         basic = await self.get_basic()
-        fields_to_index = get_bm_modified_fields(messages)
+        fields_to_index = get_bm_modified_fields(message)
         vectorsets_configs = await self.get_vectorsets_configs()
         for fieldid in fields_to_index:
-            if fieldid in deleted_fields:
+            if fieldid in message.delete_fields:
                 continue
+
+            # For conversation fields, we only replace the full field if it is not an append messages operation.
+            # All other fields are always replaced upon modification.
+            replace_field = True
+            modified_splits = None
+            if fieldid.field_type == FieldType.CONVERSATION:
+                modified_splits = await get_bm_modified_split_ids(fieldid, message, self.resource)
+                stored_splits = await get_stored_split_ids(fieldid, self.resource)
+                is_append_messages_op = modified_splits.issubset(stored_splits) and 0 < len(
+                    modified_splits
+                ) < len(stored_splits)
+                replace_field = not is_append_messages_op
+
             await self._apply_field_index_data(
                 self.brain,
                 fieldid,
                 basic,
-                texts=needs_texts_update(fieldid, messages),
-                paragraphs=needs_paragraphs_update(fieldid, messages),
-                relations=needs_relations_update(fieldid, messages),
-                vectors=needs_vectors_update(fieldid, messages),
-                replace=True,
+                texts=needs_texts_update(fieldid, message),
+                paragraphs=needs_paragraphs_update(fieldid, message),
+                relations=needs_relations_update(fieldid, message),
+                vectors=needs_vectors_update(fieldid, message),
+                replace=replace_field,
                 vectorset_configs=vectorsets_configs,
+                append_splits=modified_splits,
             )
         return self.brain.brain
 
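
The subset test in this hunk is what classifies a conversation change as an append. A minimal standalone sketch of the predicate (hypothetical helper, extracted here only for illustration):

def is_append_messages_op(modified_splits: set[str], stored_splits: set[str]) -> bool:
    # An append touches a strict, non-empty subset of the split ids already
    # recorded in the conversation field's splits metadata; unknown splits or
    # a change to every stored split still forces a full field replace.
    return modified_splits.issubset(stored_splits) and 0 < len(modified_splits) < len(stored_splits)

# One of three stored splits modified: indexed as an append.
assert is_append_messages_op({"split-2"}, {"split-1", "split-2", "split-3"})
# Every stored split modified: treated as a full replace.
assert not is_append_messages_op({"split-1", "split-2"}, {"split-1", "split-2"})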
@@ -270,130 +287,128 @@ class IndexMessageBuilder:
         return vectorset_configs
 
 
-def get_bm_deleted_fields(
-    messages: list[BrokerMessage],
-) -> list[FieldID]:
-    deleted = []
-    for message in messages:
-        for field in message.delete_fields:
-            if field not in deleted:
-                deleted.append(field)
-    return deleted
-
-
-def get_bm_modified_fields(messages: list[BrokerMessage]) -> list[FieldID]:
-    message_source = get_messages_source(messages)
+def get_bm_modified_fields(message: BrokerMessage) -> list[FieldID]:
     modified = set()
-    for message in messages:
-        # Added or modified fields need indexing
-        for link in message.links:
-            modified.add((link, FieldType.LINK))
-        for file in message.files:
-            modified.add((file, FieldType.FILE))
-        for conv in message.conversations:
-            modified.add((conv, FieldType.CONVERSATION))
-        for text in message.texts:
-            modified.add((text, FieldType.TEXT))
+    # Added or modified fields need indexing
+    for link in message.links:
+        modified.add((link, FieldType.LINK))
+    for file in message.files:
+        modified.add((file, FieldType.FILE))
+    for conv in message.conversations:
+        modified.add((conv, FieldType.CONVERSATION))
+    for text in message.texts:
+        modified.add((text, FieldType.TEXT))
+    if message.HasField("basic"):
+        # Add title and summary only if they have changed
+        if message.basic.title != "":
+            modified.add(("title", FieldType.GENERIC))
+        if message.basic.summary != "":
+            modified.add(("summary", FieldType.GENERIC))
+
+    if message.source == BrokerMessage.MessageSource.PROCESSOR:
+        # Messages with field metadata, extracted text or field vectors need indexing
+        for fm in message.field_metadata:
+            modified.add((fm.field.field, fm.field.field_type))
+        for et in message.extracted_text:
+            modified.add((et.field.field, et.field.field_type))
+        for fv in message.field_vectors:
+            modified.add((fv.field.field, fv.field.field_type))
+
+    if message.source == BrokerMessage.MessageSource.WRITER:
+        # Any field that has fieldmetadata annotations should be considered as modified
+        # and needs to be reindexed
         if message.HasField("basic"):
-            # Add title and summary only if they have changed
-            if message.basic.title != "":
-                modified.add(("title", FieldType.GENERIC))
-            if message.basic.summary != "":
-                modified.add(("summary", FieldType.GENERIC))
-
-        if message_source == BrokerMessage.MessageSource.PROCESSOR:
-            # Messages with field metadata, extracted text or field vectors need indexing
-            for fm in message.field_metadata:
-                modified.add((fm.field.field, fm.field.field_type))
-            for et in message.extracted_text:
-                modified.add((et.field.field, et.field.field_type))
-            for fv in message.field_vectors:
-                modified.add((fv.field.field, fv.field.field_type))
-
-        if message_source == BrokerMessage.MessageSource.WRITER:
-            # Any field that has fieldmetadata annotations should be considered as modified
-            # and needs to be reindexed
-            if message.HasField("basic"):
-                for ufm in message.basic.fieldmetadata:
-                    modified.add((ufm.field.field, ufm.field.field_type))
+            for ufm in message.basic.fieldmetadata:
+                modified.add((ufm.field.field, ufm.field.field_type))
     return [FieldID(field=field, field_type=field_type) for field, field_type in modified]
 
 
-def get_messages_source(messages: list[BrokerMessage]) -> BrokerMessage.MessageSource.ValueType:
-    assert len(set(message.source for message in messages)) == 1
-    return messages[0].source
+def needs_prefilter_update(message: BrokerMessage) -> bool:
+    return message.reindex
 
 
-def needs_prefilter_update(messages: list[BrokerMessage]) -> bool:
-    return any(message.reindex for message in messages)
-
-
-def needs_paragraphs_update(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
+def needs_paragraphs_update(field_id: FieldID, message: BrokerMessage) -> bool:
     return (
-        has_paragraph_annotations(field_id, messages)
-        or has_new_extracted_text(field_id, messages)
-        or has_new_field_metadata(field_id, messages)
+        has_paragraph_annotations(field_id, message)
+        or has_new_extracted_text(field_id, message)
+        or has_new_field_metadata(field_id, message)
     )
 
 
-def has_paragraph_annotations(field_id: FieldID, messages: list[BrokerMessage]) -> bool:
-    for message in messages:
-        ufm = next(
-            (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
-            None,
-        )
-        if ufm is None:
-            continue
-        if len(ufm.paragraphs) > 0:
-            return True
-    return False
+def has_paragraph_annotations(field_id: FieldID, message: BrokerMessage) -> bool:
+    ufm = next(
+        (fm for fm in message.basic.fieldmetadata if fm.field == field_id),
+        None,
+    )
+    if ufm is None:
+        return False
+    return len(ufm.paragraphs) > 0
 
 
 def has_new_field_metadata(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for field_metadata in message.field_metadata:
-            if field_metadata.field == field_id:
-                return True
-    return False
+    return any(field_metadata.field == field_id for field_metadata in message.field_metadata)
 
 
 def has_new_extracted_text(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for extracted_text in message.extracted_text:
-            if extracted_text.field == field_id:
-                return True
-    return False
+    return any(extracted_text.field == field_id for extracted_text in message.extracted_text)
 
 
 def needs_texts_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    return has_new_extracted_text(field_id, messages) or has_new_field_metadata(field_id, messages)
+    return has_new_extracted_text(field_id, message) or has_new_field_metadata(field_id, message)
 
 
 def needs_vectors_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    for message in messages:
-        for field_vectors in message.field_vectors:
-            if field_vectors.field == field_id:
-                return True
-    return False
+    return any(field_vectors.field == field_id for field_vectors in message.field_vectors)
+
+
+async def get_bm_modified_split_ids(
+    conversation_field_id: FieldID,
+    message: BrokerMessage,
+    resource: Resource,
+) -> set[str]:
+    message_etw = next(
+        (etw for etw in message.extracted_text if etw.field == conversation_field_id), None
+    )
+    if message_etw is None:
+        return set()
+    storage = resource.storage
+    if message_etw.HasField("file"):
+        raw_payload = await storage.downloadbytescf(message_etw.file)
+        message_extracted_text = ExtractedText()
+        message_extracted_text.ParseFromString(raw_payload.read())
+        raw_payload.flush()
+    else:
+        message_extracted_text = message_etw.body
+    return set(message_extracted_text.split_text.keys())
+
+
+async def get_stored_split_ids(
+    conversation_field_id: FieldID,
+    resource: Resource,
+) -> set[str]:
+    fid = conversation_field_id
+    conv: Conversation = await resource.get_field(fid.field, fid.field_type, load=False)
+    splits_metadata = await conv.get_splits_metadata()
+    return set(splits_metadata.metadata)
 
 
 def needs_relations_update(
     field_id: FieldID,
-    messages: list[BrokerMessage],
+    message: BrokerMessage,
 ) -> bool:
-    return has_new_field_metadata(field_id, messages) or has_new_extracted_text(field_id, messages)
+    return has_new_field_metadata(field_id, message) or has_new_extracted_text(field_id, message)
 
 
 async def get_resource_index_message(
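
With the batch parameter gone, each helper above inspects exactly one BrokerMessage. A hedged usage sketch of get_bm_modified_fields (a minimal writer message; the title value is illustrative):

from nucliadb_protos.writer_pb2 import BrokerMessage

msg = BrokerMessage()
msg.source = BrokerMessage.MessageSource.WRITER
msg.basic.title = "My document"  # a non-empty title marks the generic "title" field as modified

fields = get_bm_modified_fields(msg)
# Per the logic above, this yields a single FieldID:
# field="title", field_type=FieldType.GENERIC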
nucliadb/ingest/orm/knowledgebox.py
@@ -24,7 +24,7 @@ from uuid import uuid4
 
 from grpc import StatusCode
 from grpc.aio import AioRpcError
-from nidx_protos import noderesources_pb2
+from nidx_protos import nidx_pb2, noderesources_pb2
 
 from nucliadb.common import datamanagers
 from nucliadb.common.cluster.exceptions import ShardNotFound
@@ -36,7 +36,6 @@ from nucliadb.common.datamanagers.resources import (
     KB_RESOURCE_SLUG_BASE,
 )
 from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
-from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
 from nucliadb.common.maindb.driver import Driver, Transaction
 from nucliadb.common.maindb.pg import PGTransaction
 from nucliadb.common.nidx import get_nidx_api_client
@@ -53,7 +52,6 @@ from nucliadb.migrator.utils import get_latest_version
 from nucliadb_protos import knowledgebox_pb2, writer_pb2
 from nucliadb_protos.knowledgebox_pb2 import (
     CreateExternalIndexProviderMetadata,
-    ExternalIndexProviderType,
     KnowledgeBoxConfig,
     SemanticModelMetadata,
     StoredExternalIndexProviderMetadata,
@@ -110,6 +108,7 @@ class KnowledgeBox:
         external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
         hidden_resources_enabled: bool = False,
         hidden_resources_hide_on_creation: bool = False,
+        prewarm_enabled: bool = False,
     ) -> tuple[str, str]:
         """Creates a new knowledge box and return its id and slug."""
 
@@ -127,7 +126,7 @@
         rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
 
         try:
-            async with driver.transaction() as txn:
+            async with driver.rw_transaction() as txn:
                 exists = await datamanagers.kb.get_kb_uuid(
                     txn, slug=slug
                 ) or await datamanagers.kb.exists_kb(txn, kbid=kbid)
@@ -196,6 +195,7 @@ class KnowledgeBox:
                     migration_version=get_latest_version(),
                     hidden_resources_enabled=hidden_resources_enabled,
                     hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
+                    prewarm_enabled=prewarm_enabled,
                 )
                 config.external_index_provider.CopyFrom(stored_external_index_provider)
                 await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
@@ -222,7 +222,7 @@
                 shard_manager = get_shard_manager()
                 # XXX creating a shard is a slow IO operation that requires a write
                 # txn to be open!
-                await shard_manager.create_shard_by_kbid(txn, kbid)
+                await shard_manager.create_shard_by_kbid(txn, kbid, prewarm_enabled=prewarm_enabled)
                 # shards don't need a rollback as they will be eventually purged
 
                 await txn.commit()
@@ -245,43 +245,90 @@ class KnowledgeBox:
     @classmethod
     async def update(
         cls,
-        txn: Transaction,
-        uuid: str,
+        driver: Driver,
+        kbid: str,
+        *,
         slug: Optional[str] = None,
-        config: Optional[KnowledgeBoxConfig] = None,
+        title: Optional[str] = None,
+        description: Optional[str] = None,
+        migration_version: Optional[int] = None,
+        external_index_provider: Optional[StoredExternalIndexProviderMetadata] = None,
+        hidden_resources_enabled: Optional[bool] = None,
+        hidden_resources_hide_on_creation: Optional[bool] = None,
+        prewarm_enabled: Optional[bool] = None,
     ) -> str:
-        exist = await datamanagers.kb.get_config(txn, kbid=uuid, for_update=True)
-        if not exist:
-            raise datamanagers.exceptions.KnowledgeBoxNotFound()
-
-        if slug:
-            await txn.delete(datamanagers.kb.KB_SLUGS.format(slug=exist.slug))
-            await txn.set(
-                datamanagers.kb.KB_SLUGS.format(slug=slug),
-                uuid.encode(),
-            )
-            if config:
-                config.slug = slug
-            else:
-                exist.slug = slug
+        async with driver.rw_transaction() as txn:
+            stored = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
+            if not stored:
+                raise datamanagers.exceptions.KnowledgeBoxNotFound()
+
+            if slug:
+                await txn.delete(datamanagers.kb.KB_SLUGS.format(slug=stored.slug))
+                await txn.set(
+                    datamanagers.kb.KB_SLUGS.format(slug=slug),
+                    kbid.encode(),
+                )
+                stored.slug = slug
 
-        if config and exist != config:
-            exist.MergeFrom(config)
-            exist.hidden_resources_enabled = config.hidden_resources_enabled
-            exist.hidden_resources_hide_on_creation = config.hidden_resources_hide_on_creation
+            if title is not None:
+                stored.title = title
+            if description is not None:
+                stored.description = description
 
-        if exist.hidden_resources_hide_on_creation and not exist.hidden_resources_enabled:
-            raise KnowledgeBoxCreationError(
-                "Cannot hide new resources if the hidden resources feature is disabled"
-            )
+            if migration_version is not None:
+                stored.migration_version = migration_version
+
+            if external_index_provider is not None:
+                stored.external_index_provider.MergeFrom(external_index_provider)
+
+            if hidden_resources_enabled is not None:
+                stored.hidden_resources_enabled = hidden_resources_enabled
+            if hidden_resources_hide_on_creation is not None:
+                stored.hidden_resources_hide_on_creation = hidden_resources_hide_on_creation
+
+            update_nidx_prewarm = None
+            if prewarm_enabled is not None:
+                if stored.prewarm_enabled != prewarm_enabled:
+                    update_nidx_prewarm = prewarm_enabled
+                stored.prewarm_enabled = prewarm_enabled
+
+            if stored.hidden_resources_hide_on_creation and not stored.hidden_resources_enabled:
+                raise KnowledgeBoxCreationError(
+                    "Cannot hide new resources if the hidden resources feature is disabled"
+                )
 
-        await datamanagers.kb.set_config(txn, kbid=uuid, config=exist)
+            await datamanagers.kb.set_config(txn, kbid=kbid, config=stored)
 
-        return uuid
+            await txn.commit()
+
+        if update_nidx_prewarm is not None:
+            await cls.configure_shards(driver, kbid, prewarm=update_nidx_prewarm)
+
+        return kbid
+
+    @classmethod
+    async def configure_shards(cls, driver: Driver, kbid: str, *, prewarm: bool):
+        shards_obj = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid)
+        if shards_obj is None:
+            logger.warning(f"Shards not found for KB while updating pre-warm flag", extra={"kbid": kbid})
+            return
+
+        nidx_shard_ids = [shard.nidx_shard_id for shard in shards_obj.shards]
+
+        nidx_api = get_nidx_api_client()
+        if nidx_api is not None and len(nidx_shard_ids) > 0:
+            configs = [
+                nidx_pb2.ShardConfig(
+                    shard_id=shard_id,
+                    prewarm_enabled=prewarm,
+                )
+                for shard_id in nidx_shard_ids
+            ]
+            await nidx_api.ConfigureShards(nidx_pb2.ShardsConfig(configs=configs))
 
     @classmethod
     async def delete(cls, driver: Driver, kbid: str):
-        async with driver.transaction() as txn:
+        async with driver.rw_transaction() as txn:
             exists = await datamanagers.kb.exists_kb(txn, kbid=kbid)
             if not exists:
                 return
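
KnowledgeBox.update is now driver-based with keyword-only fields, so callers patch only what changes, and shard reconfiguration runs only when the stored pre-warm flag actually flips. An illustrative call pattern based on the new signature (driver and kbid assumed available from the surrounding service code, inside an async context):

# Toggle index pre-warming for a knowledge box.
await KnowledgeBox.update(driver, kbid, prewarm_enabled=True)

# Change the slug and title in a single read-modify-write transaction.
await KnowledgeBox.update(driver, kbid, slug="new-slug", title="New title")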
@@ -347,7 +394,7 @@ class KnowledgeBox:
 
         nidx_api = get_nidx_api_client()
 
-        async with driver.transaction() as txn:
+        async with driver.rw_transaction() as txn:
             storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
             await txn.set(storage_to_delete, b"")
 
@@ -377,7 +424,7 @@ class KnowledgeBox:
     @classmethod
     async def delete_all_kb_keys(cls, driver: Driver, kbid: str, chunk_size: int = 1_000):
         prefix = KB_KEYS.format(kbid=kbid)
-        async with driver.transaction() as txn:
+        async with driver.rw_transaction() as txn:
             await txn.delete_by_prefix(prefix)
             await txn.commit()
 
@@ -535,10 +582,7 @@ class KnowledgeBox:
         request: CreateExternalIndexProviderMetadata,
         indexes: list[VectorsetExternalIndex],
     ) -> StoredExternalIndexProviderMetadata:
-        if request.type != ExternalIndexProviderType.PINECONE:
-            return StoredExternalIndexProviderMetadata(type=request.type)
-        # Only pinecone is supported for now
-        return await PineconeIndexManager.create_indexes(kbid, request, indexes)
+        return StoredExternalIndexProviderMetadata(type=request.type)
 
     @classmethod
     async def _maybe_delete_external_indexes(
@@ -546,10 +590,7 @@ class KnowledgeBox:
         kbid: str,
         stored: StoredExternalIndexProviderMetadata,
    ) -> None:
-        if stored.type != ExternalIndexProviderType.PINECONE:
-            return
-        # Only pinecone is supported for now
-        await PineconeIndexManager.delete_indexes(kbid, stored)
+        return
 
 
 def chunker(seq: Sequence, size: int):
nucliadb/ingest/orm/processor/auditing.py
@@ -34,7 +34,7 @@ async def collect_audit_fields(
         return []
 
     audit_storage_fields: list[audit_pb2.AuditField] = []
-    async with driver.transaction(read_only=True) as txn:
+    async with driver.ro_transaction() as txn:
         kb = KnowledgeBox(txn, storage, message.kbid)
         resource = Resource(txn, storage, kb, message.uuid)
         field_keys = await resource.get_fields_ids()
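
Across these files the generic driver.transaction() call is split into explicit read-write and read-only variants. A hedged sketch of the resulting pattern (method names taken from the call sites above, inside an async context):

async with driver.ro_transaction() as txn:
    # Read-only work: no commit needed.
    config = await datamanagers.kb.get_config(txn, kbid=kbid)

async with driver.rw_transaction() as txn:
    # Read-write work: changes must be committed explicitly.
    await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
    await txn.commit()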