nucliadb 6.7.2.post4862-py3-none-any.whl → 6.9.2.post5282-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of nucliadb might be problematic.
- migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
- migrations/0017_multiple_writable_shards.py +1 -1
- migrations/0018_purge_orphan_kbslugs.py +1 -1
- migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
- migrations/0021_overwrite_vectorsets_key.py +1 -1
- migrations/0023_backfill_pg_catalog.py +7 -3
- migrations/0025_assign_models_to_kbs_v2.py +3 -3
- migrations/0027_rollover_texts3.py +1 -1
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +1 -1
- migrations/0032_remove_old_relations.py +1 -1
- migrations/0036_backfill_catalog_slug.py +1 -1
- migrations/0037_backfill_catalog_facets.py +1 -1
- migrations/0038_backfill_catalog_field_labels.py +7 -3
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/backups/create.py +3 -3
- nucliadb/backups/restore.py +3 -3
- nucliadb/common/cache.py +1 -1
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +3 -19
- nucliadb/common/cluster/rebalance.py +484 -110
- nucliadb/common/cluster/rollover.py +29 -0
- nucliadb/common/cluster/settings.py +1 -1
- nucliadb/common/cluster/utils.py +26 -0
- nucliadb/common/datamanagers/atomic.py +6 -0
- nucliadb/common/datamanagers/utils.py +2 -2
- nucliadb/common/external_index_providers/manager.py +1 -29
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +16 -33
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +4 -0
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +77 -55
- nucliadb/common/locking.py +4 -4
- nucliadb/common/maindb/driver.py +11 -1
- nucliadb/common/maindb/local.py +1 -1
- nucliadb/common/maindb/pg.py +1 -1
- nucliadb/common/nidx.py +19 -1
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +3 -3
- nucliadb/ingest/consumer/pull.py +7 -0
- nucliadb/ingest/consumer/service.py +2 -27
- nucliadb/ingest/consumer/shard_creator.py +17 -6
- nucliadb/ingest/fields/base.py +9 -17
- nucliadb/ingest/fields/conversation.py +47 -1
- nucliadb/ingest/orm/brain_v2.py +21 -3
- nucliadb/ingest/orm/index_message.py +126 -111
- nucliadb/ingest/orm/knowledgebox.py +84 -43
- nucliadb/ingest/orm/processor/auditing.py +1 -1
- nucliadb/ingest/orm/processor/processor.py +95 -149
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +10 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/serialize.py +2 -2
- nucliadb/ingest/service/writer.py +26 -19
- nucliadb/ingest/settings.py +33 -11
- nucliadb/learning_proxy.py +12 -15
- nucliadb/metrics_exporter.py +17 -4
- nucliadb/migrator/datamanager.py +11 -17
- nucliadb/migrator/migrator.py +2 -2
- nucliadb/purge/__init__.py +12 -17
- nucliadb/purge/orphan_shards.py +2 -2
- nucliadb/reader/api/v1/knowledgebox.py +40 -12
- nucliadb/reader/api/v1/learning_config.py +30 -10
- nucliadb/reader/api/v1/resource.py +2 -2
- nucliadb/reader/api/v1/services.py +1 -1
- nucliadb/reader/reader/notifications.py +1 -1
- nucliadb/search/api/v1/__init__.py +1 -0
- nucliadb/search/api/v1/catalog.py +4 -4
- nucliadb/search/api/v1/find.py +1 -4
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/resource/ask.py +21 -1
- nucliadb/search/api/v1/search.py +1 -4
- nucliadb/search/predict.py +9 -2
- nucliadb/search/search/cache.py +1 -20
- nucliadb/search/search/chat/ask.py +50 -8
- nucliadb/search/search/chat/prompt.py +47 -15
- nucliadb/search/search/chat/query.py +8 -1
- nucliadb/search/search/fetch.py +1 -1
- nucliadb/search/search/find.py +1 -6
- nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
- nucliadb/search/search/hydrator/fields.py +175 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +307 -0
- nucliadb/search/search/hydrator/resources.py +56 -0
- nucliadb/search/search/metrics.py +16 -0
- nucliadb/search/search/predict_proxy.py +33 -11
- nucliadb/search/search/query.py +0 -23
- nucliadb/search/search/query_parser/fetcher.py +5 -5
- nucliadb/search/search/query_parser/models.py +1 -30
- nucliadb/search/search/query_parser/parsers/ask.py +1 -1
- nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
- nucliadb/search/search/query_parser/parsers/common.py +16 -7
- nucliadb/search/search/query_parser/parsers/find.py +0 -11
- nucliadb/search/search/query_parser/parsers/graph.py +5 -5
- nucliadb/search/search/query_parser/parsers/search.py +0 -11
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
- nucliadb/search/search/rerankers.py +1 -1
- nucliadb/search/search/summarize.py +1 -1
- nucliadb/standalone/run.py +3 -0
- nucliadb/tasks/retries.py +4 -4
- nucliadb/train/generators/sentence_classifier.py +2 -8
- nucliadb/train/generators/utils.py +1 -1
- nucliadb/train/nodes.py +4 -4
- nucliadb/train/servicer.py +1 -1
- nucliadb/train/uploader.py +1 -1
- nucliadb/writer/api/v1/field.py +14 -9
- nucliadb/writer/api/v1/knowledgebox.py +15 -52
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +2 -2
- nucliadb/writer/resource/field.py +38 -2
- nucliadb/writer/tus/azure.py +4 -4
- nucliadb/writer/tus/gcs.py +11 -17
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
nucliadb/ingest/service/writer.py CHANGED

```diff
@@ -126,6 +126,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
                 external_index_provider=request.external_index_provider,
                 hidden_resources_enabled=request.hidden_resources_enabled,
                 hidden_resources_hide_on_creation=request.hidden_resources_hide_on_creation,
+                prewarm_enabled=request.prewarm_enabled,
             )
 
         except KnowledgeBoxConflict:
@@ -167,11 +168,17 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
             )
 
         try:
-
-
-
-
-
+            kbid = await KnowledgeBoxORM.update(
+                self.driver,
+                kbid=request.uuid,
+                slug=request.slug,
+                title=request.config.title or None,
+                description=request.config.description or None,
+                external_index_provider=request.config.external_index_provider or None,
+                hidden_resources_enabled=request.config.hidden_resources_enabled,
+                hidden_resources_hide_on_creation=request.config.hidden_resources_hide_on_creation,
+                prewarm_enabled=request.config.prewarm_enabled,
+            )
         except KnowledgeBoxNotFound:
             return UpdateKnowledgeBoxResponse(status=KnowledgeBoxResponseStatus.NOTFOUND)
         except Exception:
@@ -221,13 +228,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: NewEntitiesGroupRequest, context=None
     ) -> NewEntitiesGroupResponse:
         response = NewEntitiesGroupResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = NewEntitiesGroupResponse.Status.KB_NOT_FOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -244,7 +251,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: GetEntitiesRequest, context=None
     ) -> GetEntitiesResponse:
         response = GetEntitiesResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesResponse.Status.NOTFOUND
@@ -266,7 +273,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: ListEntitiesGroupsRequest, context=None
     ) -> ListEntitiesGroupsResponse:
         response = ListEntitiesGroupsResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = ListEntitiesGroupsResponse.Status.NOTFOUND
@@ -290,7 +297,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: GetEntitiesGroupRequest, context=None
     ) -> GetEntitiesGroupResponse:
         response = GetEntitiesGroupResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbobj = await self.proc.get_kb_obj(txn, request.kb)
             if kbobj is None:
                 response.status = GetEntitiesGroupResponse.Status.KB_NOT_FOUND
@@ -315,13 +322,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
 
     async def SetEntities(self, request: SetEntitiesRequest, context=None) -> OpStatusWriter:  # type: ignore
         response = OpStatusWriter()
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = OpStatusWriter.Status.NOTFOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -339,13 +346,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         self, request: UpdateEntitiesGroupRequest, context=None
     ) -> UpdateEntitiesGroupResponse:
         response = UpdateEntitiesGroupResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = UpdateEntitiesGroupResponse.Status.KB_NOT_FOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -368,13 +375,13 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
     async def DelEntities(self, request: DelEntitiesRequest, context=None) -> OpStatusWriter:  # type: ignore
         response = OpStatusWriter()
 
-        async with self.driver.
+        async with self.driver.ro_transaction() as ro_txn:
             kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
             if kbobj is None:
                 response.status = OpStatusWriter.Status.NOTFOUND
                 return response
 
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             kbobj.txn = txn
             entities_manager = EntitiesManager(kbobj, txn)
             try:
@@ -393,7 +400,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
     ) -> WriterStatusResponse:
         logger.info("Status Call")
         response = WriterStatusResponse()
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             async for _, slug in datamanagers.kb.get_kbs(txn):
                 response.knowledgeboxes.append(slug)
 
@@ -405,7 +412,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
         return response
 
     async def Index(self, request: IndexResource, context=None) -> IndexStatus:  # type: ignore
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
            kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
            resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
            bm = await generate_broker_message(resobj)
@@ -419,7 +426,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
 
     async def ReIndex(self, request: IndexResource, context=None) -> IndexStatus:  # type: ignore
         try:
-            async with self.driver.
+            async with self.driver.rw_transaction() as txn:
                kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
                resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
                resobj.disable_vectors = not request.reindex_vectors
```
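The dominant change in this file is mechanical: every transaction the servicer opens is now explicitly read-only (`ro_transaction()`) for lookups or read-write (`rw_transaction()`) for mutations. The deleted lines end abruptly at `async with self.driver.` because the diff viewer truncates them; they were presumably calls to a single generic transaction helper. A minimal sketch of what the split might look like on the `Driver` base class (everything beyond the `ro_transaction`/`rw_transaction` names is an assumption, not taken from this diff):

```python
from contextlib import asynccontextmanager

class Driver:
    """Hypothetical sketch of nucliadb/common/maindb/driver.py (+11 -1 in the file list)."""

    @asynccontextmanager
    async def transaction(self, read_only: bool = False):
        txn = object()  # stand-in for a real maindb transaction
        try:
            yield txn
        finally:
            pass  # a real driver would abort here if the txn was not committed

    def ro_transaction(self):
        # Reads only: no commit required, safe to serve with cheaper settings.
        return self.transaction(read_only=True)

    def rw_transaction(self):
        # Writes: the caller is responsible for `await txn.commit()`.
        return self.transaction(read_only=False)
```

Making the intent explicit at every call site also lets a PG-backed driver pick read-only connection behavior without inspecting what the caller does.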
nucliadb/ingest/settings.py CHANGED

```diff
@@ -67,6 +67,11 @@ class DriverSettings(BaseSettings):
     )
 
 
+class CatalogConfig(Enum):
+    UNSET = "unset"
+    PG = "pg"
+
+
 # For use during migration from pull v1 to pull v2
 class ProcessingPullMode(Enum):
     OFF = "off"
@@ -75,26 +80,43 @@ class ProcessingPullMode(Enum):
 
 
 class Settings(DriverSettings):
-
-
-    partitions: list[str] = ["1"]
+    # Catalog settings
+    catalog: CatalogConfig = Field(default=CatalogConfig.PG, description="Catalog backend")
 
+    # Pull worker settings
     pull_time_error_backoff: int = 30
     pull_api_timeout: int = 60
-    disable_pull_worker: bool =
+    disable_pull_worker: bool = Field(
+        default=False, description="Set to true to disable the pull worker task"
+    )
 
-    #
-    replica_number: int =
-
-
+    # Ingest consumer sts replica settings
+    replica_number: int = Field(
+        default=-1,
+        description="The replica number of this ingest statefulset instance. Leave to -1 to auto-assign based on hostname.",
+    )
+    total_replicas: int = Field(default=1, description="Number of ingest statefulset replicas deployed")
+    nuclia_partitions: int = Field(
+        default=50, description="Total number of partitions of the nats stream."
+    )
+    partitions: list[str] = Field(
+        default=["1"],
+        description="List of partitions assigned to this ingest statefulset instance. This is automatically assigned based on the replica number and total replicas.",
+    )
+    max_concurrent_ingest_processing: int = Field(
+        default=5,
+        description="Controls the number of concurrent messages from different partitions that can be processed at the same time by ingest statefulset consumers.",
+    )
 
-
+    # Grpc server settings
+    grpc_port: int = 8030
+    max_receive_message_length: int = Field(
+        default=500, description="Maximum receive grpc message length in MB."
+    )
 
     # Search query timeouts
     relation_search_timeout: float = 10.0
     relation_types_timeout: float = 10.0
 
-    max_concurrent_ingest_processing: int = 5
-
 
 settings = Settings()
```
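The settings class is regrouped under commented sections and every field gains a `Field(...)` with a description. A hedged example of overriding the new fields through the environment, assuming pydantic's standard `BaseSettings` mapping of field names to case-insensitive environment variables (the values are illustrative):

```python
import os

os.environ["CATALOG"] = "pg"                      # CatalogConfig.PG (the default)
os.environ["DISABLE_PULL_WORKER"] = "true"
os.environ["REPLICA_NUMBER"] = "2"                # -1 would auto-assign from the hostname
os.environ["MAX_RECEIVE_MESSAGE_LENGTH"] = "250"  # in MB, per the field description

from nucliadb.ingest.settings import Settings

settings = Settings()  # env vars are read at instantiation time
assert settings.replica_number == 2
assert settings.disable_pull_worker is True
```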
nucliadb/learning_proxy.py CHANGED

```diff
@@ -43,8 +43,8 @@ logger = logging.getLogger(SERVICE_NAME)
 WHITELISTED_HEADERS = {
     "x-nucliadb-user",
     "x-nucliadb-roles",
-    "x-
-    "x-
+    "x-nucliadb-account",
+    "x-nucliadb-account-type",
     "x-forwarded-for",
     "x-forwarded-host",
     "x-forwarded-proto",
@@ -204,14 +204,14 @@ async def learning_config_proxy(
     request: Request,
     method: str,
     url: str,
-
+    headers: dict[str, str] = {},
 ) -> Union[Response, StreamingResponse]:
     return await proxy(
         service=LearningService.CONFIG,
         request=request,
         method=method,
         url=url,
-
+        headers=headers,
     )
 
 
@@ -244,24 +244,21 @@ async def proxy(
     request: Request,
     method: str,
     url: str,
-
+    headers: dict[str, str] = {},
 ) -> Union[Response, StreamingResponse]:
     """
     Proxy the request to a learning API.
 
-    service:
-    request:
-    method:
-    url:
-
+    service: The learning service to proxy the request to.
+    request: The incoming request.
+    method: The HTTP method to use.
+    url: The URL to proxy the request to.
+    headers: Extra headers to include in the proxied request.
 
     Returns: Response. The response from the learning API. If the response is chunked, a StreamingResponse is returned.
     """
-
-    proxied_headers
-    proxied_headers.update(
-        {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
-    )
+    proxied_headers = {k.lower(): v for k, v in request.headers.items() if is_white_listed_header(k)}
+    proxied_headers.update(**headers)
 
     async with service_client(
         base_url=get_base_url(service=service),
```
nucliadb/metrics_exporter.py CHANGED

```diff
@@ -37,19 +37,24 @@ MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "versi
 
 PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
 
+KBS_COUNT = metrics.Gauge("nucliadb_kbs_count")
+RESOURCES_COUNT = metrics.Gauge("nucliadb_resources_count")
+
 
 async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
     """
     Return a list of all KB ids.
     """
-    async with context.kv_driver.
+    async with context.kv_driver.ro_transaction() as txn:
         async for kbid, _ in datamanagers.kb.get_kbs(txn):
             yield kbid
 
 
-async def update_migration_metrics(context: ApplicationContext):
+async def update_kb_metrics(context: ApplicationContext):
     """
-    Report
+    Report metrics at the kb level:
+    - total number of KBs
+    - the global migration version and the number of KBs per migration version.
     """
     # Clear previoulsy set values so that we report only the current state
     MIGRATION_COUNT.gauge.clear()
@@ -60,12 +65,16 @@ async def update_migration_metrics(context: ApplicationContext):
     MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
 
     version_count: dict[str, int] = {}
+    n_kbs = 0
     async for kbid in iter_kbids(context):
         kb_info = await mdm.get_kb_info(kbid)
         if kb_info is not None:
             current_version = str(kb_info.current_version)
             version_count.setdefault(current_version, 0)
             version_count[current_version] += 1
+        n_kbs += 1
+
+    KBS_COUNT.set(n_kbs)
 
     for version, count in version_count.items():
         MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
@@ -88,6 +97,10 @@ async def update_resource_metrics(context: ApplicationContext):
         count = cast(Tuple[int], await cur.fetchone())[0]
         PENDING_RESOURCE_COUNT.set(count)
 
+        await cur.execute("SELECT COUNT(*) FROM catalog")
+        count = cast(Tuple[int], await cur.fetchone())[0]
+        RESOURCES_COUNT.set(count)
+
 
 async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
     """
@@ -109,7 +122,7 @@ async def run_exporter(context: ApplicationContext):
     # Schedule exporter tasks
     tasks = []
     for export_task, interval in [
-        (
+        (update_kb_metrics, 60 * 3),
         (update_resource_metrics, 60 * 5),
     ]:
         tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
```
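`update_migration_metrics` becomes `update_kb_metrics` and now also reports the total KB count, while the resource updater adds a catalog row count; each updater runs on its own cadence (every 3 and 5 minutes respectively). A sketch of the loop that `run_exporter_task` presumably wraps around each updater (error handling details are assumptions, not shown in this diff):

```python
import asyncio
from typing import Awaitable, Callable

async def run_forever(task: Callable[[], Awaitable[None]], interval: int) -> None:
    # Run one metrics updater on a fixed cadence; a single failed iteration
    # should not kill the exporter, so exceptions are swallowed and retried.
    while True:
        try:
            await task()
        except Exception as exc:
            print(f"exporter task failed, will retry: {exc!r}")
        await asyncio.sleep(interval)
```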
nucliadb/migrator/datamanager.py CHANGED

```diff
@@ -48,45 +48,39 @@ class MigrationsDataManager:
 
     async def schedule_all_kbs(self, target_version: int) -> None:
         # Get all kb ids
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
         # Schedule the migrations
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             for kbid in kbids:
                 await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
             await txn.commit()
 
     async def get_kb_migrations(self) -> list[str]:
         keys = []
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
                 keys.append(key.split("/")[-1])
 
         return keys
 
     async def delete_kb_migration(self, *, kbid: str) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(MIGRATIONS_KEY.format(kbid=kbid))
             await txn.commit()
 
     async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
             if kb_config is None:
                 return None
             return KnowledgeBoxInfo(current_version=kb_config.migration_version)
 
     async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
-
-            kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
-            if kb_config is None:
-                raise Exception(f"KB {kbid} does not exist")
-            kb_config.migration_version = current_version
-            await KnowledgeBoxORM.update(txn, kbid, config=kb_config)
-            await txn.commit()
+        await KnowledgeBoxORM.update(self.driver, kbid, migration_version=current_version)
 
     async def get_global_info(self) -> GlobalInfo:
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             raw_pb = await txn.get(MIGRATION_INFO_KEY)
             if raw_pb is None:
                 return GlobalInfo(current_version=0, target_version=None)
@@ -100,7 +94,7 @@ class MigrationsDataManager:
         current_version: Union[int, _Unset] = _UNSET,
         target_version: Union[int, None, _Unset] = _UNSET,
     ) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
             pb = migrations_pb2.MigrationInfo()
             if raw_pb is not None:
@@ -116,18 +110,18 @@ class MigrationsDataManager:
 
     async def get_kbs_to_rollover(self) -> list[str]:
         keys = []
-        async with self.driver.
+        async with self.driver.ro_transaction() as txn:
             async for key in txn.keys(ROLLOVER_CONTAINER_KEY):
                 keys.append(key.split("/")[-1])
 
         return keys
 
     async def add_kb_rollover(self, kbid: str) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.set(ROLLOVER_KEY.format(kbid=kbid), b"")
             await txn.commit()
 
     async def delete_kb_rollover(self, kbid: str) -> None:
-        async with self.driver.
+        async with self.driver.rw_transaction() as txn:
             await txn.delete(ROLLOVER_KEY.format(kbid=kbid))
             await txn.commit()
```
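Two patterns repeat through this file: reads move to `ro_transaction()` and writes to `rw_transaction()`, and `update_kb_info` collapses from a manual get-config/modify/commit sequence into a single `KnowledgeBoxORM.update(self.driver, kbid, migration_version=...)` call that owns its transaction. The kbid extraction from stored keys is plain string slicing; a worked example with illustrative key names (the real prefixes are the internal `MIGRATIONS_CONTAINER_KEY` / `ROLLOVER_CONTAINER_KEY` constants):

```python
# Keys are "<container>/<kbid>" shaped, so the kbid is the last path segment.
keys = ["migration/to_run/kb-uuid-1", "migration/to_run/kb-uuid-2"]
kbids = [key.split("/")[-1] for key in keys]
assert kbids == ["kb-uuid-1", "kb-uuid-2"]

# Versions are stored as ASCII-encoded integers, mirroring schedule_all_kbs():
value = str(42).encode()
assert int(value.decode()) == 42
```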
nucliadb/migrator/migrator.py CHANGED

```diff
@@ -211,7 +211,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
 
     # The migration uses two transactions. The former is only used to get a lock (pg_advisory_lock)
     # without having to worry about correctly unlocking it (postgres unlocks it when the transaction ends)
-    async with driver.
+    async with driver.rw_transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock:  # type: ignore[attr-defined]
         await cur_lock.execute(
             "CREATE TABLE IF NOT EXISTS migrations (version INT PRIMARY KEY, migrated_at TIMESTAMP NOT NULL DEFAULT NOW())"
         )
@@ -227,7 +227,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
 
         # Gets a new transaction for each migration, so if they get interrupted we at least
         # save the state of the last finished transaction
-        async with driver.
+        async with driver.rw_transaction() as tx, tx.connection.cursor() as cur:  # type: ignore[attr-defined]
             await migration.migrate(tx)
             await cur.execute("INSERT INTO migrations (version) VALUES (%s)", (version,))
             await tx.commit()
```
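The comments describe the scheme: one transaction exists only to hold an advisory lock that Postgres releases when the transaction ends, and each schema migration commits in its own transaction so an interruption keeps all previously recorded versions. The comment names `pg_advisory_lock`, but the transaction-end unlocking it describes is exactly what the transaction-scoped `pg_advisory_xact_lock` provides, so this hedged sketch uses that variant; psycopg 3 is assumed as the client, and the lock id and table layout are illustrative:

```python
import psycopg

LOCK_ID = 5432_0001  # any stable bigint shared by all migrator instances

async def run_schema_migrations(conninfo: str, migrations: dict[int, str]) -> None:
    async with await psycopg.AsyncConnection.connect(conninfo) as lock_conn:
        async with lock_conn.cursor() as cur:
            # Blocks until no other migrator holds the lock; released
            # automatically when lock_conn's transaction ends, even on error.
            await cur.execute("SELECT pg_advisory_xact_lock(%s)", (LOCK_ID,))
            for version, ddl in sorted(migrations.items()):
                # One transaction per migration: progress survives interruption.
                async with await psycopg.AsyncConnection.connect(conninfo) as conn:
                    async with conn.cursor() as mcur:
                        await mcur.execute(ddl)
                        await mcur.execute(
                            "INSERT INTO migrations (version) VALUES (%s)", (version,)
                        )
                    await conn.commit()
```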
nucliadb/purge/__init__.py CHANGED

```diff
@@ -19,6 +19,7 @@
 #
 import asyncio
 import importlib.metadata
+from itertools import batched  # type: ignore
 from typing import AsyncGenerator
 
 from nucliadb.common import datamanagers
@@ -47,7 +48,7 @@ from nucliadb_utils.utilities import get_storage
 
 
 async def _iter_keys(driver: Driver, match: str) -> AsyncGenerator[str, None]:
-    async with driver.
+    async with driver.ro_transaction() as keys_txn:
         async for key in keys_txn.keys(match=match):
             yield key
 
@@ -81,9 +82,9 @@ async def purge_kb(driver: Driver):
             )
             continue
 
-        # Now delete the
+        # Now delete the delete mark
         try:
-            async with driver.
+            async with driver.rw_transaction() as txn:
                 key_to_purge = KB_TO_DELETE.format(kbid=kbid)
                 await txn.delete(key_to_purge)
                 await txn.commit()
@@ -126,7 +127,7 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
 
         if delete_marker:
             try:
-                async with driver.
+                async with driver.rw_transaction() as txn:
                     await txn.delete(key)
                     await txn.commit()
                     logger.info(f"  √ Deleted storage deletion marker {key}")
@@ -163,7 +164,7 @@ async def _count_resources_storage_to_purge(driver: Driver) -> int:
     """
     Count the number of resources marked as deleted in storage.
     """
-    async with driver.
+    async with driver.ro_transaction() as txn:
         return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
 
 
@@ -174,7 +175,7 @@ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch
     """
     # Get the keys of the resources to delete in batches of 100
    to_delete_batch = []
-    async with driver.
+    async with driver.ro_transaction() as txn:
         async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
             to_delete_batch.append(key)
 
@@ -194,7 +195,7 @@ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch
     await asyncio.gather(*tasks)
 
     # Delete the schedule-to-delete keys
-    async with driver.
+    async with driver.rw_transaction() as txn:
         for key in to_delete_batch:
             await txn.delete(key)
         await txn.commit()
@@ -220,20 +221,20 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             continue
 
         try:
-            async with driver.
+            async with driver.ro_transaction() as txn:
                 value = await txn.get(key)
                 assert value is not None, "Key must exist or we wouldn't had fetch it iterating keys"
                 purge_payload = VectorSetPurge()
                 purge_payload.ParseFromString(value)
 
             fields: list[Field] = []
-            async with driver.
+            async with driver.ro_transaction() as txn:
                 kb = KnowledgeBox(txn, storage, kbid)
                 async for resource in kb.iterate_resources():
                     fields.extend((await resource.get_fields(force=True)).values())
 
             logger.info(f"Purging {len(fields)} fields for vectorset {vectorset}", extra={"kbid": kbid})
-            for fields_batch in
+            for fields_batch in batched(fields, n=20):
                 tasks = []
                 for field in fields_batch:
                     if purge_payload.storage_key_kind == VectorSetConfig.StorageKeyKind.UNSET:
@@ -259,7 +260,7 @@ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
             await asyncio.gather(*tasks)
 
             # Finally, delete the key
-            async with driver.
+            async with driver.rw_transaction() as txn:
                 await txn.delete(key)
                 await txn.commit()
 
@@ -317,9 +318,3 @@ def run() -> int:  # pragma: no cover
     setup_logging()
     errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
     return asyncio.run(main())
-
-
-def batchify(iterable, n=1):
-    """Yield successive n-sized chunks from iterable."""
-    for i in range(0, len(iterable), n):
-        yield iterable[i : i + n]
```
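The hand-rolled `batchify()` helper at the end of the module is dropped in favor of `itertools.batched`, standard library from Python 3.12 onward (hence the `# type: ignore` on the import for older type stubs). The semantics match the removed helper, except that `batched` yields tuples and works on arbitrary iterables rather than only sliceable sequences:

```python
from itertools import batched  # Python 3.12+

fields = [f"field-{i}" for i in range(45)]  # illustrative items
for batch in batched(fields, n=20):
    # tuples of up to 20 items; the final batch holds the remainder (here, 5)
    assert 1 <= len(batch) <= 20
```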
nucliadb/purge/orphan_shards.py CHANGED

```diff
@@ -100,7 +100,7 @@ async def _get_indexed_shards() -> dict[str, ShardKb]:
 async def _get_stored_shards(driver: Driver) -> dict[str, ShardKb]:
     stored_shards: dict[str, ShardKb] = {}
 
-    async with driver.
+    async with driver.ro_transaction() as txn:
         async for kbid, _ in datamanagers.kb.get_kbs(txn):
             kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
             if kb_shards is None:
@@ -135,7 +135,7 @@ async def _get_kbid(shard_id: str) -> Optional[str]:
 async def report_orphan_shards(driver: Driver):
     orphan_shards = await detect_orphan_shards(driver)
     logger.info(f"Found {len(orphan_shards)} orphan shards")
-    async with driver.
+    async with driver.ro_transaction() as txn:
         for shard_id, kbid in orphan_shards.items():
             if kbid == UNKNOWN_KB:
                 msg = "Found orphan shard but could not get KB info"
```
nucliadb/reader/api/v1/knowledgebox.py CHANGED

```diff
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from fastapi import HTTPException
+from fastapi import Header, HTTPException
 from fastapi_versioning import version
 from starlette.requests import Request
 
@@ -44,12 +44,20 @@ from nucliadb_utils.authentication import requires, requires_one
 )
 @requires(NucliaDBRoles.MANAGER)
 @version(1)
-async def get_kbs(request: Request, prefix: str = "") -> KnowledgeBoxList:
+async def get_kbs(
+    request: Request,
+    prefix: str = "",
+    x_nucliadb_account: str = Header(default="", include_in_schema=False),
+) -> KnowledgeBoxList:
     driver = get_driver()
-    async with driver.
+    async with driver.ro_transaction() as txn:
         response = KnowledgeBoxList()
         async for kbid, slug in datamanagers.kb.get_kbs(txn, prefix=prefix):
-            response.kbs.append(
+            response.kbs.append(
+                KnowledgeBoxObjSummary(
+                    slug=user_kb_slug(slug, account_id=x_nucliadb_account) or None, uuid=kbid
+                )
+            )
     return response
 
 
@@ -62,16 +70,18 @@ async def get_kbs(request: Request, prefix: str = "") -> KnowledgeBoxList:
 )
 @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.READER])
 @version(1)
-async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
+async def get_kb(
+    request: Request, kbid: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
+) -> KnowledgeBoxObj:
     driver = get_driver()
-    async with driver.
+    async with driver.ro_transaction() as txn:
         kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
         if kb_config is None:
             raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
 
         return KnowledgeBoxObj(
             uuid=kbid,
-            slug=kb_config.slug,
+            slug=user_kb_slug(kb_config.slug, account_id=x_nucliadb_account),
             config=from_proto.knowledgebox_config(kb_config),
         )
 
@@ -85,12 +95,18 @@ async def get_kb(request: Request, kbid: str) -> KnowledgeBoxObj:
 )
 @requires_one([NucliaDBRoles.MANAGER, NucliaDBRoles.READER])
 @version(1)
-async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
+async def get_kb_by_slug(
+    request: Request, slug: str, x_nucliadb_account: str = Header(default="", include_in_schema=False)
+) -> KnowledgeBoxObj:
     driver = get_driver()
-    async with driver.
-
+    async with driver.ro_transaction() as txn:
+        # For cloud, the account id is prepended in order to be able to reuse the same slug in different accounts.
+        kbid = await datamanagers.kb.get_kb_uuid(txn, slug=f"{x_nucliadb_account}:{slug}")
         if kbid is None:
-
+            # For onprem, the slug is fully controlled by the user
+            kbid = await datamanagers.kb.get_kb_uuid(txn, slug=slug)
+            if kbid is None:
+                raise HTTPException(status_code=404, detail="Knowledge Box does not exist")
 
         kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
         if kb_config is None:
@@ -98,6 +114,18 @@ async def get_kb_by_slug(request: Request, slug: str) -> KnowledgeBoxObj:
 
         return KnowledgeBoxObj(
             uuid=kbid,
-            slug=kb_config.slug,
+            slug=user_kb_slug(kb_config.slug, account_id=x_nucliadb_account),
             config=from_proto.knowledgebox_config(kb_config),
         )
+
+
+def user_kb_slug(stored_slug: str, account_id: str) -> str:
+    if account_id != "":
+        # On cloud deployments, backend prepends the account id to the user-defined slug.
+        # This is required to make kb slugs reused across different accounts using the same nucliadb.
+        # We strip it so the user does not see it.
+        return stored_slug.split(f"{account_id}:")[-1]
+    else:
+        # On on-prem deployments, the account_id is set to "" by default and we don't need to strip
+        # anything as the backend is not invovled in the kb creation process.
+        return stored_slug
```
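The new `user_kb_slug` helper is what lets all three endpoints return clean slugs: on cloud deployments the backend stores slugs as `<account_id>:<user_slug>` (with the account id arriving in the `x-nucliadb-account` header), while on-prem slugs are stored verbatim and the header defaults to `""`. A worked rerun of the function from the diff:

```python
def user_kb_slug(stored_slug: str, account_id: str) -> str:
    # Same logic as the diff above: strip the "<account_id>:" prefix on cloud,
    # return the slug untouched on-prem (where account_id defaults to "").
    if account_id != "":
        return stored_slug.split(f"{account_id}:")[-1]
    return stored_slug

assert user_kb_slug("acc-123:my-kb", account_id="acc-123") == "my-kb"
assert user_kb_slug("my-kb", account_id="") == "my-kb"
assert user_kb_slug("my-kb", account_id="acc-123") == "my-kb"  # no prefix: unchanged
```

Note that `get_kb_by_slug` tries the account-prefixed slug first and only falls back to the raw slug, so the same endpoint serves both deployment modes.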