nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/ingest/processing.py

@@ -261,6 +261,7 @@ class ProcessingEngine:
             "content_type": file.file.content_type,
             "password": file.password,
             "language": file.language,
+            "extract_strategy": file.extract_strategy,
         }
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

@@ -278,6 +279,8 @@ class ProcessingEngine:
         headers["X-LANGUAGE"] = file.language
         headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()  # type: ignore
         headers["X-MD5"] = file.file.md5
+        if file.extract_strategy is not None:
+            headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
         headers["CONTENT_TYPE"] = file.file.content_type
         headers["CONTENT-LENGTH"] = str(len(file.file.payload))  # type: ignore
         headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"

@@ -317,6 +320,7 @@ class ProcessingEngine:
             "content_type": file_field.file.content_type,
             "language": file_field.language,
             "password": file_field.password,
+            "extract_strategy": file_field.extract_strategy,
         }
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

@@ -341,6 +345,8 @@ class ProcessingEngine:
         headers["CONTENT-TYPE"] = file.file.content_type
         if file.file.size:
             headers["CONTENT-LENGTH"] = str(file.file.size)
+        if file.extract_strategy != "":
+            headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
         headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"

         iterator = storage.downloadbytescf_iterator(file.file)

@@ -406,13 +412,13 @@ class ProcessingEngine:
             # Upload the payload
             item.partition = partition
             resp = await self.session.post(
-                url=self.nuclia_internal_push, data=item.json(), headers=headers
+                url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
             )
         else:
             headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
             # Upload the payload
             resp = await self.session.post(
-                url=self.nuclia_external_push_v2, data=item.json(), headers=headers
+                url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
             )
         if resp.status == 200:
             data = await resp.json()
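Note: two independent changes ride together in these hunks. The new `extract_strategy` field is propagated to processing both inside the signed JWT payload and as an `X-EXTRACT-STRATEGY` header, and `item.json()` becomes `item.model_dump_json()`, the pydantic v2 spelling of the same serialization. A minimal sketch of the header path, using a hypothetical stand-in for the real file-field model:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeFileField:
    """Hypothetical stand-in for the processing file-field model."""

    language: Optional[str] = None
    extract_strategy: Optional[str] = None


def build_headers(file: FakeFileField, nuclia_service_account: str) -> dict[str, str]:
    # Mirrors the diff: the header is only sent when a strategy is set
    headers: dict[str, str] = {}
    if file.language is not None:
        headers["X-LANGUAGE"] = file.language
    if file.extract_strategy is not None:
        headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
    headers["X-STF-NUAKEY"] = f"Bearer {nuclia_service_account}"
    return headers


print(build_headers(FakeFileField(language="en", extract_strategy="fast"), "nua-key"))
# {'X-LANGUAGE': 'en', 'X-EXTRACT-STRATEGY': 'fast', 'X-STF-NUAKEY': 'Bearer nua-key'}
```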
nucliadb/ingest/serialize.py

@@ -18,18 +18,20 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #

-from typing import Optional
+from typing import Optional, Union

 import nucliadb_models as models
+from nucliadb.common import datamanagers
 from nucliadb.common.maindb.driver import Transaction
 from nucliadb.common.maindb.utils import get_driver
+from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
 from nucliadb.ingest.orm.resource import Resource as ORMResource
-from nucliadb_models.common import FIELD_TYPES_MAP, FieldTypeName
+from nucliadb_models.common import FieldTypeName
 from nucliadb_models.resource import (
     ConversationFieldData,
     ConversationFieldExtractedData,

@@ -49,7 +51,9 @@ from nucliadb_models.resource import (
 )
 from nucliadb_models.search import ResourceProperties
 from nucliadb_models.security import ResourceSecurity
-from nucliadb_utils.utilities import get_storage
+from nucliadb_protos.writer_pb2 import FieldStatus
+from nucliadb_utils import const
+from nucliadb_utils.utilities import get_storage, has_feature


 async def set_resource_field_extracted_data(

@@ -64,7 +68,7 @@ async def set_resource_field_extracted_data(
     if ExtractedDataTypeName.TEXT in wanted_extracted_data:
         data_et = await field.get_extracted_text()
         if data_et is not None:
-            field_data.text = models.ExtractedText.from_message(data_et)
+            field_data.text = from_proto.extracted_text(data_et)

     metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
     shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data

@@ -72,24 +76,35 @@ async def set_resource_field_extracted_data(
         data_fcm = await field.get_field_metadata()

         if data_fcm is not None:
-            field_data.metadata = models.FieldComputedMetadata.from_message(
+            field_data.metadata = from_proto.field_computed_metadata(
                 data_fcm, shortened=shortened_metadata_wanted and not metadata_wanted
             )

     if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
         data_lcm = await field.get_large_field_metadata()
         if data_lcm is not None:
-            field_data.large_metadata = models.LargeComputedMetadata.from_message(data_lcm)
+            field_data.large_metadata = from_proto.large_computed_metadata(data_lcm)

     if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
-        data_vec = await field.get_vectors()
+        # XXX: our extracted API is not vectorset-compatible, so we'll get the
+        # first vectorset and return the values. Ideally, we should provide a
+        # way to select a vectorset
+        vectorset_id = None
+        async with datamanagers.with_ro_transaction() as txn:
+            async for vectorset_id, vs in datamanagers.vectorsets.iter(
+                txn=txn,
+                kbid=field.resource.kb.kbid,
+            ):
+                break
+        assert vectorset_id is not None, "All KBs must have at least a vectorset"
+        data_vec = await field.get_vectors(vectorset_id, vs.storage_key_kind)
         if data_vec is not None:
-            field_data.vectors = models.VectorObject.from_message(data_vec)
+            field_data.vectors = from_proto.vector_object(data_vec)

     if ExtractedDataTypeName.QA in wanted_extracted_data:
         qa = await field.get_question_answers()
         if qa is not None:
-            field_data.question_answers = models.FieldQuestionAnswers.from_message(qa)
+            field_data.question_answers = from_proto.field_question_answers(qa)

     if (
         isinstance(field, File)
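Note: the `async for ... break` in the VECTOR hunk is a "first item or sentinel" idiom: `vectorset_id` keeps its `None` sentinel if the iterator yields nothing, and the `assert` turns that into a hard failure. A self-contained sketch of the idiom, with a toy generator standing in for `datamanagers.vectorsets.iter`:

```python
import asyncio
from typing import AsyncIterator, Optional


async def iter_vectorsets() -> AsyncIterator[tuple[str, dict]]:
    # Hypothetical stand-in for datamanagers.vectorsets.iter(txn=..., kbid=...)
    for vectorset_id in ("multilingual-2024", "en-2024"):
        yield vectorset_id, {}


async def first_vectorset_id() -> Optional[str]:
    vectorset_id = None
    async for vectorset_id, _config in iter_vectorsets():
        break  # like the serializer: take the first vectorset, ignore the rest
    return vectorset_id  # stays None when there are no vectorsets


print(asyncio.run(first_vectorset_id()))  # multilingual-2024
```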
@@ -98,7 +113,7 @@ async def set_resource_field_extracted_data(
     ):
         data_fed = await field.get_file_extracted_data()
         if data_fed is not None:
-            field_data.file = models.FileExtractedData.from_message(data_fed)
+            field_data.file = from_proto.file_extracted_data(data_fed)

     if (
         isinstance(field, Link)

@@ -107,7 +122,7 @@ async def set_resource_field_extracted_data(
     ):
         data_led = await field.get_link_extracted_data()
         if data_led is not None:
-            field_data.link = models.LinkExtractedData.from_message(data_led)
+            field_data.link = from_proto.link_extracted_data(data_led)


 async def serialize(

@@ -133,6 +148,40 @@ async def serialize(
     )


+async def serialize_field_errors(
+    field: Field,
+    serialized: Union[
+        TextFieldData, FileFieldData, LinkFieldData, ConversationFieldData, GenericFieldData
+    ],
+):
+    if has_feature(const.Features.FIELD_STATUS):
+        status = await field.get_status()
+        if status is None:
+            status = FieldStatus()
+        serialized.status = status.Status.Name(status.status)
+        if status.errors:
+            serialized.errors = []
+            for error in status.errors:
+                serialized.errors.append(
+                    Error(
+                        body=error.source_error.error,
+                        code=error.source_error.code,
+                        code_str=error.source_error.ErrorCode.Name(error.source_error.code),
+                        created=error.created.ToDatetime(),
+                    )
+                )
+            serialized.error = serialized.errors[-1]
+    else:
+        field_error = await field.get_error()
+        if field_error is not None:
+            serialized.error = Error(
+                body=field_error.error,
+                code=field_error.code,
+                code_str=field_error.ErrorCode.Name(field_error.code),
+                created=None,
+            )
+
+
 async def managed_serialize(
     txn: Transaction,
     kbid: str,
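Note: behind the `FIELD_STATUS` feature flag, fields now expose a processing status plus the full error history, while the legacy single `error` attribute is still populated (with the most recent error) so existing clients keep working. Roughly, the two serialized shapes compare like this (illustrative values only; the field names come from the code above):

```python
# Legacy path: a single error, no status, no history
legacy = {
    "error": {"body": "could not extract", "code": 1},
}

# FIELD_STATUS path: status string, full history, and the last error kept
# in "error" for backwards compatibility
with_field_status = {
    "status": "ERROR",
    "errors": [
        {"body": "timeout", "code": 1, "code_str": "...", "created": "2024-11-02T10:00:00"},
        {"body": "could not extract", "code": 1, "code_str": "...", "created": "2024-11-03T09:30:00"},
    ],
    "error": {"body": "could not extract", "code": 1, "code_str": "...", "created": "2024-11-03T09:30:00"},
}
```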
@@ -174,14 +223,12 @@ async def managed_serialize(
             else None
         )

-        resource.metadata = models.Metadata.from_message(orm_resource.basic.metadata)
-        resource.usermetadata = models.UserMetadata.from_message(orm_resource.basic.usermetadata)
+        resource.metadata = from_proto.metadata(orm_resource.basic.metadata)
+        resource.usermetadata = from_proto.user_metadata(orm_resource.basic.usermetadata)
         resource.fieldmetadata = [
-            models.UserFieldMetadata.from_message(fm) for fm in orm_resource.basic.fieldmetadata
+            from_proto.user_field_metadata(fm) for fm in orm_resource.basic.fieldmetadata
         ]
-        resource.computedmetadata = models.ComputedMetadata.from_message(
-            orm_resource.basic.computedmetadata
-        )
+        resource.computedmetadata = from_proto.computed_metadata(orm_resource.basic.computedmetadata)

         resource.last_seqid = orm_resource.basic.last_seqid

@@ -195,18 +242,18 @@ async def managed_serialize(
         await orm_resource.get_relations()
         if orm_resource.relations is not None:
             resource.relations = [
-                models.Relation.from_message(relation) for relation in orm_resource.relations.relations
+                from_proto.relation(relation) for relation in orm_resource.relations.relations
             ]

     if ResourceProperties.ORIGIN in show:
         await orm_resource.get_origin()
         if orm_resource.origin is not None:
-            resource.origin = models.Origin.from_message(orm_resource.origin)
+            resource.origin = from_proto.origin(orm_resource.origin)

     if ResourceProperties.EXTRA in show:
         await orm_resource.get_extra()
         if orm_resource.extra is not None:
-            resource.extra = models.Extra.from_message(orm_resource.extra)
+            resource.extra = from_proto.extra(orm_resource.extra)

     include_errors = ResourceProperties.ERRORS in show

@@ -221,7 +268,7 @@ async def managed_serialize(
         await orm_resource.get_fields()
         resource.data = ResourceData()
         for (field_type, _), field in orm_resource.fields.items():
-            field_type_name = FIELD_TYPES_MAP[field_type]
+            field_type_name = from_proto.field_type_name(field_type)
             if field_type_name not in field_type_filter:
                 continue

@@ -236,14 +283,10 @@ async def managed_serialize(
                 if field.id not in resource.data.texts:
                     resource.data.texts[field.id] = TextFieldData()
                 if include_value:
-                    serialized_value = (
-                        models.FieldText.from_message(value) if value is not None else None
-                    )
+                    serialized_value = from_proto.field_text(value) if value is not None else None
                     resource.data.texts[field.id].value = serialized_value
                 if include_errors:
-                    error = await field.get_error()
-                    if error is not None:
-                        resource.data.texts[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.texts[field.id])
                 if include_extracted_data:
                     resource.data.texts[field.id].extracted = TextFieldExtractedData()
                     await set_resource_field_extracted_data(

@@ -259,14 +302,12 @@ async def managed_serialize(
                     resource.data.files[field.id] = FileFieldData()
                 if include_value:
                     if value is not None:
-                        resource.data.files[field.id].value = models.FieldFile.from_message(value)
+                        resource.data.files[field.id].value = from_proto.field_file(value)
                     else:
                         resource.data.files[field.id].value = None

                 if include_errors:
-                    error = await field.get_error()
-                    if error is not None:
-                        resource.data.files[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.files[field.id])

                 if include_extracted_data:
                     resource.data.files[field.id].extracted = FileFieldExtractedData()

@@ -282,12 +323,10 @@ async def managed_serialize(
                 if field.id not in resource.data.links:
                     resource.data.links[field.id] = LinkFieldData()
                 if include_value and value is not None:
-                    resource.data.links[field.id].value = models.FieldLink.from_message(value)
+                    resource.data.links[field.id].value = from_proto.field_link(value)

                 if include_errors:
-                    error = await field.get_error()
-                    if error is not None:
-                        resource.data.links[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.links[field.id])

                 if include_extracted_data:
                     resource.data.links[field.id].extracted = LinkFieldExtractedData()

@@ -303,16 +342,10 @@ async def managed_serialize(
                 if field.id not in resource.data.conversations:
                     resource.data.conversations[field.id] = ConversationFieldData()
                 if include_errors:
-                    error = await field.get_error()
-                    if error is not None:
-                        resource.data.conversations[field.id].error = Error(
-                            body=error.error, code=error.code
-                        )
+                    await serialize_field_errors(field, resource.data.conversations[field.id])
                 if include_value and isinstance(field, Conversation):
                     value = await field.get_metadata()
-                    resource.data.conversations[field.id].value = models.FieldConversation.from_message(
-                        value
-                    )
+                    resource.data.conversations[field.id].value = from_proto.field_conversation(value)
                 if include_extracted_data:
                     resource.data.conversations[field.id].extracted = ConversationFieldExtractedData()
                     await set_resource_field_extracted_data(

@@ -329,9 +362,7 @@ async def managed_serialize(
                 if include_value:
                     resource.data.generics[field.id].value = value
                 if include_errors:
-                    error = await field.get_error()
-                    if error is not None:
-                        resource.data.generics[field.id].error = Error(body=error.error, code=error.code)
+                    await serialize_field_errors(field, resource.data.generics[field.id])
                 if include_extracted_data:
                     resource.data.generics[field.id].extracted = TextFieldExtractedData(
                         text=models.ExtractedText(text=resource.data.generics[field.id].value)
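Note: all of these hunks are one mechanical migration. Proto-to-model conversion moves off `from_message` classmethods on each pydantic model and into the new `nucliadb/common/models_utils/from_proto.py` module (added in this release, +479 lines; see the file list), which keeps the pydantic models free of protobuf-specific code. The diff does not show that module's internals, so the converter below is only a hedged sketch of the style:

```python
# Hedged sketch: the real from_proto module is new in this release and its
# internals are not part of this diff, so treat this as illustrative only.
from google.protobuf.json_format import MessageToDict

import nucliadb_models as models
from nucliadb_protos.resources_pb2 import ExtractedText


def extracted_text(message: ExtractedText) -> models.ExtractedText:
    # One plain function per proto type, replacing a from_message classmethod
    # hanging off every pydantic model
    return models.ExtractedText(**MessageToDict(message, preserving_proto_field_name=True))
```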
nucliadb/ingest/service/writer.py

@@ -31,12 +31,12 @@ from nucliadb.common.maindb.utils import setup_driver
 from nucliadb.ingest import SERVICE_NAME, logger
 from nucliadb.ingest.orm.broker_message import generate_broker_message
 from nucliadb.ingest.orm.entities import EntitiesManager
-from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict, VectorSetConflict
+from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
 from nucliadb.ingest.orm.processor import Processor, sequence_manager
 from nucliadb.ingest.orm.resource import Resource as ResourceORM
 from nucliadb.ingest.settings import settings
-from nucliadb_protos import nodewriter_pb2, writer_pb2, writer_pb2_grpc
+from nucliadb_protos import writer_pb2, writer_pb2_grpc
 from nucliadb_protos.knowledgebox_pb2 import (
     DeleteKnowledgeBoxResponse,
     KnowledgeBoxID,

@@ -44,13 +44,10 @@ from nucliadb_protos.knowledgebox_pb2 import (
     KnowledgeBoxUpdate,
     SemanticModelMetadata,
     UpdateKnowledgeBoxResponse,
-    VectorSetConfig,
 )
 from nucliadb_protos.writer_pb2 import (
     BrokerMessage,
     DelEntitiesRequest,
-    DelVectorSetRequest,
-    DelVectorSetResponse,
     GetEntitiesGroupRequest,
     GetEntitiesGroupResponse,
     GetEntitiesRequest,

@@ -63,8 +60,6 @@ from nucliadb_protos.writer_pb2 import (
     ListMembersResponse,
     NewEntitiesGroupRequest,
     NewEntitiesGroupResponse,
-    NewVectorSetRequest,
-    NewVectorSetResponse,
     OpStatusWriter,
     SetEntitiesRequest,
     UpdateEntitiesGroupRequest,

@@ -472,52 +467,3 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
             errors.capture_exception(e)
             logger.error("Error in ingest gRPC servicer", exc_info=True)
             raise
-
-    async def NewVectorSet(  # type: ignore
-        self, request: NewVectorSetRequest, context=None
-    ) -> NewVectorSetResponse:
-        config = VectorSetConfig(
-            vectorset_id=request.vectorset_id,
-            vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
-                similarity=request.similarity,
-                normalize_vectors=request.normalize_vectors,
-                vector_type=request.vector_type,
-                vector_dimension=request.vector_dimension,
-            ),
-            matryoshka_dimensions=request.matryoshka_dimensions,
-        )
-        response = NewVectorSetResponse()
-        try:
-            async with self.driver.transaction() as txn:
-                kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
-                await kbobj.create_vectorset(config)
-                await txn.commit()
-        except VectorSetConflict as exc:
-            response.status = NewVectorSetResponse.Status.ERROR
-            response.details = str(exc)
-        except Exception as exc:
-            errors.capture_exception(exc)
-            logger.error("Error in ingest gRPC while creating a vectorset", exc_info=True)
-            response.status = NewVectorSetResponse.Status.ERROR
-            response.details = str(exc)
-        else:
-            response.status = NewVectorSetResponse.Status.OK
-        return response
-
-    async def DelVectorSet(  # type: ignore
-        self, request: DelVectorSetRequest, context=None
-    ) -> DelVectorSetResponse:
-        response = DelVectorSetResponse()
-        try:
-            async with self.driver.transaction() as txn:
-                kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
-                await kbobj.delete_vectorset(request.vectorset_id)
-                await txn.commit()
-        except Exception as exc:
-            errors.capture_exception(exc)
-            logger.error("Error in ingest gRPC while deleting a vectorset", exc_info=True)
-            response.status = DelVectorSetResponse.Status.ERROR
-            response.details = str(exc)
-        else:
-            response.status = DelVectorSetResponse.Status.OK
-        return response
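Note: `NewVectorSet` and `DelVectorSet` leave the ingest gRPC surface entirely; vectorset management now goes over HTTP (see `nucliadb/writer/vectorsets.py` moving to `nucliadb/writer/api/v1/vectorsets.py` and the new reader endpoint in the file list). The ORM calls the removed handlers wrapped still exist; a sketch of the equivalent direct invocation, lifted from the deleted code:

```python
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM


async def delete_vectorset_directly(driver, storage, kbid: str, vectorset_id: str) -> None:
    # Same transaction + ORM pattern the removed DelVectorSet handler used
    async with driver.transaction() as txn:
        kbobj = KnowledgeBoxORM(txn, storage, kbid)
        await kbobj.delete_vectorset(vectorset_id)
        await txn.commit()
```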
nucliadb/ingest/settings.py

@@ -77,7 +77,7 @@ class Settings(DriverSettings):
     total_replicas: int = 1  # number of ingest processor replicas in the cluster
     nuclia_partitions: int = 50

-    max_receive_message_length: int = 4
+    max_receive_message_length: int = 500  # In MB

     # Search query timeouts
     relation_search_timeout: float = 10.0

@@ -85,8 +85,5 @@ class Settings(DriverSettings):

     max_concurrent_ingest_processing: int = 5

-    # Ingest processor settings
-    ingest_delete_resource_storage_max_parallel: int = 20
-

 settings = Settings()
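Note: the old default of `4` reads like gRPC's 4 MiB default captured in the wrong unit; the setting is now documented as MB and raised to 500. A hedged sketch of how an MB-denominated setting would be scaled when building a gRPC server (the exact wiring in nucliadb is not shown in this diff):

```python
from grpc import aio

MB = 1024 * 1024
max_receive_message_length = 500  # In MB, matching the new Settings default

# gRPC channel options are expressed in bytes, so the MB value must be scaled
server = aio.server(
    options=[
        ("grpc.max_receive_message_length", max_receive_message_length * MB),
        ("grpc.max_send_message_length", max_receive_message_length * MB),
    ]
)
```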
nucliadb/learning_proxy.py

@@ -155,10 +155,9 @@ class LearningConfiguration(BaseModel):


 class ProxiedLearningConfigError(Exception):
-    def __init__(self, status_code: int, content: bytes, content_type: str):
+    def __init__(self, status_code: int, content: Union[str, dict[str, Any]]):
         self.status_code = status_code
         self.content = content
-        self.content_type = content_type


 def raise_for_status(response: httpx.Response) -> None:

@@ -166,10 +165,13 @@ def raise_for_status(response: httpx.Response) -> None:
         response.raise_for_status()
     except httpx.HTTPStatusError as err:
         content_type = err.response.headers.get("Content-Type", "application/json")
+        if content_type == "application/json":
+            content = err.response.json()
+        else:
+            content = err.response.text
         raise ProxiedLearningConfigError(
             status_code=err.response.status_code,
-            content=err.response.content,
-            content_type=content_type,
+            content=content,
         )
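Note: `ProxiedLearningConfigError` now carries parsed content, a dict for JSON responses and a plain string otherwise, instead of raw bytes plus a separate content type. A quick usage sketch of the new behavior:

```python
import httpx

from nucliadb.learning_proxy import ProxiedLearningConfigError, raise_for_status

request = httpx.Request("POST", "http://learning/config")
# httpx sets Content-Type: application/json automatically for json= bodies
response = httpx.Response(422, json={"detail": "invalid semantic model"}, request=request)

try:
    raise_for_status(response)
except ProxiedLearningConfigError as err:
    assert err.status_code == 422
    assert err.content == {"detail": "invalid semantic model"}  # parsed dict, not raw bytes
```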
nucliadb/purge/__init__.py

@@ -25,15 +25,19 @@ from nucliadb.common.cluster.exceptions import NodeError, ShardNotFound
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.maindb.utils import setup_driver, teardown_driver
+from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
 from nucliadb.ingest import SERVICE_NAME, logger
+from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.orm.knowledgebox import (
     KB_TO_DELETE,
     KB_TO_DELETE_BASE,
     KB_TO_DELETE_STORAGE_BASE,
     KB_VECTORSET_TO_DELETE,
     KB_VECTORSET_TO_DELETE_BASE,
+    RESOURCE_TO_DELETE_STORAGE_BASE,
     KnowledgeBox,
 )
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig, VectorSetPurge
 from nucliadb_telemetry import errors
 from nucliadb_telemetry.logs import setup_logging
 from nucliadb_utils.storages.storage import Storage
@@ -131,6 +135,67 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
     logger.info("FINISH PURGING KB STORAGE")


+async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
+    """
+    Remove from storage all resources marked as deleted.
+
+    Returns the number of resources purged.
+    """
+    logger.info("Starting purge of deleted resource storage")
+    to_purge = await _count_resources_storage_to_purge(driver)
+    logger.info(f"Found {to_purge} resources to purge")
+    while True:
+        try:
+            purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
+            if not purged:
+                logger.info("No more resources to purge found")
+                return
+            logger.info(f"Purged {purged} resources")
+
+        except asyncio.CancelledError:
+            logger.info("Purge of deleted resource storage was cancelled")
+            return
+
+
+async def _count_resources_storage_to_purge(driver: Driver) -> int:
+    """
+    Count the number of resources marked as deleted in storage.
+    """
+    async with driver.transaction(read_only=True) as txn:
+        return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
+
+
+async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
+    """
+    Remove from storage a batch of resources marked as deleted. Returns the
+    number of resources purged.
+    """
+    # Get the keys of the resources to delete in batches of 100
+    to_delete_batch = []
+    async with driver.transaction(read_only=True) as txn:
+        async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
+            to_delete_batch.append(key)
+
+    if not to_delete_batch:
+        return 0
+
+    # Delete the resources blobs from storage
+    logger.info(f"Purging {len(to_delete_batch)} deleted resources")
+    tasks = []
+    for key in to_delete_batch:
+        kbid, resource_id = key.split("/")[-2:]
+        tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
+    await asyncio.gather(*tasks)
+
+    # Delete the schedule-to-delete keys
+    async with driver.transaction() as txn:
+        for key in to_delete_batch:
+            await txn.delete(key)
+        await txn.commit()
+
+    return len(to_delete_batch)
+
+
 async def purge_kb_vectorsets(driver: Driver, storage: Storage):
     """Vectors for a vectorset are stored in a key inside each resource. Iterate
     through all resources of the KB and remove any storage object containing
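Note: the purge runs in three phases per batch: collect up to `batch_size` schedule keys in a read-only transaction, fan out the blob deletions concurrently, then drop the keys in a write transaction. A toy sketch of the fan-out step (the trailing `{kbid}/{resource_id}` key layout is inferred from the `split` above):

```python
import asyncio


async def delete_resource_blob(kbid: str, resource_id: str) -> None:
    # Hypothetical stand-in for storage.delete_resource(kbid, resource_id)
    await asyncio.sleep(0.01)
    print(f"deleted {kbid}/{resource_id}")


async def purge_batch(keys: list[str]) -> int:
    # Same fan-out as _purge_resources_storage_batch: one task per key,
    # awaited together so the batch completes as fast as its slowest delete
    tasks = []
    for key in keys:
        kbid, resource_id = key.split("/")[-2:]
        tasks.append(asyncio.create_task(delete_resource_blob(kbid, resource_id)))
    await asyncio.gather(*tasks)
    return len(tasks)


asyncio.run(purge_batch(["/resourcetodelete/kb1/r1", "/resourcetodelete/kb1/r2"]))
```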
@@ -139,8 +204,8 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
     """
     logger.info("START PURGING KB VECTORSETS")

-    purged = []
-    async for key in _iter_keys(driver, KB_VECTORSET_TO_DELETE_BASE):
+    vectorsets_to_delete = [key async for key in _iter_keys(driver, KB_VECTORSET_TO_DELETE_BASE)]
+    for key in vectorsets_to_delete:
         logger.info(f"Purging vectorsets {key}")
         try:
             _base, kbid, vectorset = key.lstrip("/").split("/")
@@ -149,13 +214,38 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             continue

         try:
+            async with driver.transaction(read_only=True) as txn:
+                value = await txn.get(key)
+                assert value is not None, "Key must exist or we wouldn't had fetch it iterating keys"
+                purge_payload = VectorSetPurge()
+                purge_payload.ParseFromString(value)
+
+            fields: list[Field] = []
             async with driver.transaction(read_only=True) as txn:
                 kb = KnowledgeBox(txn, storage, kbid)
                 async for resource in kb.iterate_resources():
-                    fields = await resource.get_fields(force=True)
+                    fields.extend((await resource.get_fields(force=True)).values())
+
             # we don't need the maindb transaction anymore to remove vectors from storage
-            for field in fields.values():
-                await field.delete_vectors(vectorset)
+            for field in fields:
+                if purge_payload.storage_key_kind == VectorSetConfig.StorageKeyKind.UNSET:
+                    # Bw/c for purge before adding purge payload. We assume
+                    # there's only 2 kinds of KBs: with one or with more than
+                    # one vectorset. KBs with one vectorset are not allowed to
+                    # delete their vectorset, so we wouldn't be here. It has to
+                    # be a KB with multiple, so the storage key kind has to be
+                    # this:
+                    await field.delete_vectors(
+                        vectorset, VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX
+                    )
+                else:
+                    await field.delete_vectors(vectorset, purge_payload.storage_key_kind)
+
+            # Finally, delete the key
+            async with driver.transaction() as txn:
+                await txn.delete(key)
+                await txn.commit()
+
         except Exception as exc:
             errors.capture_exception(exc)
             logger.error(
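Note: purge markers written before this release have an empty value, so `storage_key_kind` parses as `UNSET`; the fallback to `VECTORSET_PREFIX` rests on the argument in the comment above (a KB with a single vectorset cannot delete it, so any legacy marker must belong to a multi-vectorset KB, whose vectors live under per-vectorset prefixes). The decision reduces to a small pure function:

```python
from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig


def effective_storage_key_kind(kind: int) -> int:
    # UNSET means the marker predates VectorSetPurge payloads; only a
    # multi-vectorset KB could have scheduled it, so assume per-vectorset keys
    if kind == VectorSetConfig.StorageKeyKind.UNSET:
        return VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX
    return kind
```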
@@ -165,13 +255,6 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             )
             continue

-        purged.append(key)
-
-    async with driver.transaction() as txn:
-        for key in purged:
-            await txn.delete(key)
-        await txn.commit()
-
     logger.info("FINISH PURGING KB VECTORSETS")

@@ -180,22 +263,29 @@ async def main():
     This script will purge all knowledge boxes marked to be deleted in maindb.
     """
     await setup_cluster()
+    await start_nidx_utility()
     driver = await setup_driver()
     storage = await get_storage(
         gcs_scopes=["https://www.googleapis.com/auth/devstorage.full_control"],
         service_name=SERVICE_NAME,
     )
     try:
+        purge_resources_storage_task = asyncio.create_task(
+            purge_deleted_resource_storage(driver, storage)
+        )
         await purge_kb(driver)
         await purge_kb_storage(driver, storage)
         await purge_kb_vectorsets(driver, storage)
+        await purge_resources_storage_task
     except Exception as ex:  # pragma: no cover
         logger.exception("Unhandled exception on purge command")
         errors.capture_exception(ex)
     finally:
         try:
+            purge_resources_storage_task.cancel()
             await storage.finalize()
             await teardown_driver()
+            await stop_nidx_utility()
             await teardown_cluster()
         except Exception:  # pragma: no cover
             logger.exception("Error tearing down utilities on purge command")
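Note: the resource-storage purge now overlaps the sequential KB purges: it starts as a task before them, is awaited once they finish, and is cancelled in `finally` so a failure in an earlier phase does not leave it running. The pattern in isolation:

```python
import asyncio


async def background_purge() -> None:
    while True:
        await asyncio.sleep(0.1)  # stand-in for one purge batch


async def main() -> None:
    task = asyncio.create_task(background_purge())
    try:
        await asyncio.sleep(0.3)  # stand-in for purge_kb / purge_kb_storage / ...
        # in the real code the task is awaited here once the other phases finish
    finally:
        # cancelling an already-finished task is a harmless no-op, so the
        # cleanup path does not need to know whether the await happened
        task.cancel()


asyncio.run(main())
```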
nucliadb/purge/orphan_shards.py

@@ -33,6 +33,7 @@ from nucliadb.common.cluster.manager import KBShardManager
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.maindb.driver import Driver
 from nucliadb.common.maindb.utils import setup_driver, teardown_driver
+from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
 from nucliadb.ingest import logger
 from nucliadb_telemetry import errors
 from nucliadb_telemetry.logs import setup_logging
@@ -135,10 +136,9 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardLocation]:
             continue
         else:
             for shard_object_pb in kb_shards:
-                for shard_replica_pb in shard_object_pb.replicas:
-                    shard_replica_id = shard_replica_pb.shard.id
-                    node_id = shard_replica_pb.node
-                    stored_shards[shard_replica_id] = ShardLocation(kbid=kbid, node_id=node_id)
+                stored_shards[shard_object_pb.nidx_shard_id] = ShardLocation(
+                    kbid=kbid, node_id="nidx"
+                )
     return stored_shards

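Note: with indexing consolidated behind nidx there is no per-node replica fan-out anymore: each shard object carries a single `nidx_shard_id`, and every shard is attributed to the one logical `"nidx"` node. The shape of the resulting map, with toy shard objects:

```python
from dataclasses import dataclass


@dataclass
class ShardLocation:
    kbid: str
    node_id: str


# Hedged sketch with toy shard objects: one entry per shard instead of one
# per replica, always attributed to the single "nidx" node
shards = [{"nidx_shard_id": "shard-a"}, {"nidx_shard_id": "shard-b"}]
stored = {s["nidx_shard_id"]: ShardLocation(kbid="kb1", node_id="nidx") for s in shards}
print(stored)
```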
@@ -241,6 +241,7 @@ async def main():
     """
     args = parse_arguments()

+    await start_nidx_utility()
     await setup_cluster()
     driver = await setup_driver()

@@ -253,6 +254,7 @@ async def main():
     finally:
         await teardown_driver()
         await teardown_cluster()
+        await stop_nidx_utility()


 def run() -> int:  # pragma: no cover