nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -18,12 +18,11 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import uuid
|
|
21
|
-
from
|
|
21
|
+
from collections.abc import AsyncIterator
|
|
22
22
|
|
|
23
23
|
from nucliadb.backups import tasks as backup_tasks
|
|
24
24
|
from nucliadb.backups import utils as backup_utils
|
|
25
25
|
from nucliadb.common import datamanagers
|
|
26
|
-
from nucliadb.common.cluster.exceptions import AlreadyExists, EntitiesGroupNotFound
|
|
27
26
|
from nucliadb.common.cluster.utils import get_shard_manager
|
|
28
27
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
|
29
28
|
from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
|
|
@@ -49,7 +48,6 @@ from nucliadb_protos.knowledgebox_pb2 import (
|
|
|
49
48
|
)
|
|
50
49
|
from nucliadb_protos.writer_pb2 import (
|
|
51
50
|
BrokerMessage,
|
|
52
|
-
DelEntitiesRequest,
|
|
53
51
|
GetEntitiesGroupRequest,
|
|
54
52
|
GetEntitiesGroupResponse,
|
|
55
53
|
GetEntitiesRequest,
|
|
@@ -58,12 +56,7 @@ from nucliadb_protos.writer_pb2 import (
|
|
|
58
56
|
IndexStatus,
|
|
59
57
|
ListEntitiesGroupsRequest,
|
|
60
58
|
ListEntitiesGroupsResponse,
|
|
61
|
-
NewEntitiesGroupRequest,
|
|
62
|
-
NewEntitiesGroupResponse,
|
|
63
59
|
OpStatusWriter,
|
|
64
|
-
SetEntitiesRequest,
|
|
65
|
-
UpdateEntitiesGroupRequest,
|
|
66
|
-
UpdateEntitiesGroupResponse,
|
|
67
60
|
WriterStatusRequest,
|
|
68
61
|
WriterStatusResponse,
|
|
69
62
|
)
|
|
@@ -224,29 +217,6 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
|
224
217
|
logger.info(f"Processed {message.uuid}")
|
|
225
218
|
return response
|
|
226
219
|
|
|
227
|
-
async def NewEntitiesGroup( # type: ignore
|
|
228
|
-
self, request: NewEntitiesGroupRequest, context=None
|
|
229
|
-
) -> NewEntitiesGroupResponse:
|
|
230
|
-
response = NewEntitiesGroupResponse()
|
|
231
|
-
async with self.driver.ro_transaction() as ro_txn:
|
|
232
|
-
kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
|
|
233
|
-
if kbobj is None:
|
|
234
|
-
response.status = NewEntitiesGroupResponse.Status.KB_NOT_FOUND
|
|
235
|
-
return response
|
|
236
|
-
|
|
237
|
-
async with self.driver.rw_transaction() as txn:
|
|
238
|
-
kbobj.txn = txn
|
|
239
|
-
entities_manager = EntitiesManager(kbobj, txn)
|
|
240
|
-
try:
|
|
241
|
-
await entities_manager.create_entities_group(request.group, request.entities)
|
|
242
|
-
except AlreadyExists:
|
|
243
|
-
response.status = NewEntitiesGroupResponse.Status.ALREADY_EXISTS
|
|
244
|
-
return response
|
|
245
|
-
|
|
246
|
-
await txn.commit()
|
|
247
|
-
response.status = NewEntitiesGroupResponse.Status.OK
|
|
248
|
-
return response
|
|
249
|
-
|
|
250
220
|
async def GetEntities( # type: ignore
|
|
251
221
|
self, request: GetEntitiesRequest, context=None
|
|
252
222
|
) -> GetEntitiesResponse:
|
|
@@ -320,81 +290,6 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
|
320
290
|
|
|
321
291
|
return response
|
|
322
292
|
|
|
323
|
-
async def SetEntities(self, request: SetEntitiesRequest, context=None) -> OpStatusWriter: # type: ignore
|
|
324
|
-
response = OpStatusWriter()
|
|
325
|
-
async with self.driver.ro_transaction() as ro_txn:
|
|
326
|
-
kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
|
|
327
|
-
if kbobj is None:
|
|
328
|
-
response.status = OpStatusWriter.Status.NOTFOUND
|
|
329
|
-
return response
|
|
330
|
-
|
|
331
|
-
async with self.driver.rw_transaction() as txn:
|
|
332
|
-
kbobj.txn = txn
|
|
333
|
-
entities_manager = EntitiesManager(kbobj, txn)
|
|
334
|
-
try:
|
|
335
|
-
await entities_manager.set_entities_group(request.group, request.entities)
|
|
336
|
-
except Exception as e:
|
|
337
|
-
errors.capture_exception(e)
|
|
338
|
-
logger.error("Error in ingest gRPC servicer", exc_info=True)
|
|
339
|
-
response.status = OpStatusWriter.Status.ERROR
|
|
340
|
-
else:
|
|
341
|
-
response.status = OpStatusWriter.Status.OK
|
|
342
|
-
await txn.commit()
|
|
343
|
-
return response
|
|
344
|
-
|
|
345
|
-
async def UpdateEntitiesGroup( # type: ignore
|
|
346
|
-
self, request: UpdateEntitiesGroupRequest, context=None
|
|
347
|
-
) -> UpdateEntitiesGroupResponse:
|
|
348
|
-
response = UpdateEntitiesGroupResponse()
|
|
349
|
-
async with self.driver.ro_transaction() as ro_txn:
|
|
350
|
-
kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
|
|
351
|
-
if kbobj is None:
|
|
352
|
-
response.status = UpdateEntitiesGroupResponse.Status.KB_NOT_FOUND
|
|
353
|
-
return response
|
|
354
|
-
|
|
355
|
-
async with self.driver.rw_transaction() as txn:
|
|
356
|
-
kbobj.txn = txn
|
|
357
|
-
entities_manager = EntitiesManager(kbobj, txn)
|
|
358
|
-
try:
|
|
359
|
-
await entities_manager.set_entities_group_metadata(
|
|
360
|
-
request.group,
|
|
361
|
-
title=request.title,
|
|
362
|
-
color=request.color,
|
|
363
|
-
)
|
|
364
|
-
updates = {**request.add, **request.update}
|
|
365
|
-
await entities_manager.update_entities(request.group, updates)
|
|
366
|
-
await entities_manager.delete_entities(request.group, request.delete) # type: ignore
|
|
367
|
-
except EntitiesGroupNotFound:
|
|
368
|
-
response.status = UpdateEntitiesGroupResponse.Status.ENTITIES_GROUP_NOT_FOUND
|
|
369
|
-
return response
|
|
370
|
-
|
|
371
|
-
await txn.commit()
|
|
372
|
-
response.status = UpdateEntitiesGroupResponse.Status.OK
|
|
373
|
-
return response
|
|
374
|
-
|
|
375
|
-
async def DelEntities(self, request: DelEntitiesRequest, context=None) -> OpStatusWriter: # type: ignore
|
|
376
|
-
response = OpStatusWriter()
|
|
377
|
-
|
|
378
|
-
async with self.driver.ro_transaction() as ro_txn:
|
|
379
|
-
kbobj = await self.proc.get_kb_obj(ro_txn, request.kb)
|
|
380
|
-
if kbobj is None:
|
|
381
|
-
response.status = OpStatusWriter.Status.NOTFOUND
|
|
382
|
-
return response
|
|
383
|
-
|
|
384
|
-
async with self.driver.rw_transaction() as txn:
|
|
385
|
-
kbobj.txn = txn
|
|
386
|
-
entities_manager = EntitiesManager(kbobj, txn)
|
|
387
|
-
try:
|
|
388
|
-
await entities_manager.delete_entities_group(request.group)
|
|
389
|
-
except Exception as e:
|
|
390
|
-
errors.capture_exception(e)
|
|
391
|
-
logger.error("Error in ingest gRPC servicer", exc_info=True)
|
|
392
|
-
response.status = OpStatusWriter.Status.ERROR
|
|
393
|
-
else:
|
|
394
|
-
await txn.commit()
|
|
395
|
-
response.status = OpStatusWriter.Status.OK
|
|
396
|
-
return response
|
|
397
|
-
|
|
398
293
|
async def Status( # type: ignore
|
|
399
294
|
self, request: WriterStatusRequest, context=None
|
|
400
295
|
) -> WriterStatusResponse:
|
|
@@ -413,8 +308,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
|
413
308
|
|
|
414
309
|
async def Index(self, request: IndexResource, context=None) -> IndexStatus: # type: ignore
|
|
415
310
|
async with self.driver.ro_transaction() as txn:
|
|
416
|
-
|
|
417
|
-
resobj = ResourceORM(txn, self.storage, kbobj, request.rid)
|
|
311
|
+
resobj = ResourceORM(txn, self.storage, request.kbid, request.rid)
|
|
418
312
|
bm = await generate_broker_message(resobj)
|
|
419
313
|
transaction = get_transaction_utility()
|
|
420
314
|
partitioning = get_partitioning()
|
|
@@ -428,7 +322,7 @@ class WriterServicer(writer_pb2_grpc.WriterServicer):
|
|
|
428
322
|
try:
|
|
429
323
|
async with self.driver.rw_transaction() as txn:
|
|
430
324
|
kbobj = KnowledgeBoxORM(txn, self.storage, request.kbid)
|
|
431
|
-
resobj = ResourceORM(txn, self.storage,
|
|
325
|
+
resobj = ResourceORM(txn, self.storage, request.kbid, request.rid)
|
|
432
326
|
resobj.disable_vectors = not request.reindex_vectors
|
|
433
327
|
index_message = await get_resource_index_message(resobj, reindex=True)
|
|
434
328
|
shard = await self.proc.get_or_assign_resource_shard(txn, kbobj, request.rid)
|
nucliadb/ingest/settings.py
CHANGED
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
from enum import Enum
|
|
21
|
-
from typing import Optional
|
|
22
21
|
|
|
23
22
|
from pydantic import Field
|
|
24
23
|
from pydantic_settings import BaseSettings
|
|
@@ -41,13 +40,13 @@ class DriverConfig(Enum):
|
|
|
41
40
|
|
|
42
41
|
class DriverSettings(BaseSettings):
|
|
43
42
|
driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
|
|
44
|
-
driver_local_url:
|
|
43
|
+
driver_local_url: str | None = Field(
|
|
45
44
|
default=None,
|
|
46
45
|
description="Local path to store data on file system. Example: /nucliadb/data/main",
|
|
47
46
|
)
|
|
48
|
-
driver_pg_url:
|
|
47
|
+
driver_pg_url: str | None = Field(
|
|
49
48
|
default=None,
|
|
50
|
-
description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.",
|
|
49
|
+
description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.",
|
|
51
50
|
)
|
|
52
51
|
driver_pg_connection_pool_min_size: int = Field(
|
|
53
52
|
default=10,
|
nucliadb/ingest/utils.py
CHANGED
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Optional
|
|
21
20
|
|
|
22
21
|
from nucliadb.common.maindb.utils import setup_driver
|
|
23
22
|
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
|
@@ -26,7 +25,7 @@ from nucliadb_utils.settings import nucliadb_settings
|
|
|
26
25
|
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
|
27
26
|
|
|
28
27
|
|
|
29
|
-
async def start_ingest(service_name:
|
|
28
|
+
async def start_ingest(service_name: str | None = None):
|
|
30
29
|
await setup_driver()
|
|
31
30
|
|
|
32
31
|
actual_service = get_utility(Utility.INGEST)
|
nucliadb/learning_proxy.py
CHANGED
|
@@ -24,7 +24,7 @@ import os
|
|
|
24
24
|
from abc import ABC, abstractmethod
|
|
25
25
|
from collections.abc import AsyncIterator
|
|
26
26
|
from enum import Enum, IntEnum
|
|
27
|
-
from typing import Any
|
|
27
|
+
from typing import Any
|
|
28
28
|
|
|
29
29
|
import backoff
|
|
30
30
|
import httpx
|
|
@@ -89,12 +89,12 @@ class LearningConfiguration(BaseModel):
|
|
|
89
89
|
# aka similarity function
|
|
90
90
|
semantic_vector_similarity: str
|
|
91
91
|
# aka vector_dimension
|
|
92
|
-
semantic_vector_size:
|
|
92
|
+
semantic_vector_size: int | None = None
|
|
93
93
|
# aka min_score
|
|
94
|
-
semantic_threshold:
|
|
94
|
+
semantic_threshold: float | None = None
|
|
95
95
|
# List of possible subdivisions of the matryoshka embeddings (if the model
|
|
96
96
|
# supports it)
|
|
97
|
-
semantic_matryoshka_dimensions:
|
|
97
|
+
semantic_matryoshka_dimensions: list[int] | None = Field(
|
|
98
98
|
default=None, alias="semantic_matryoshka_dims"
|
|
99
99
|
)
|
|
100
100
|
|
|
@@ -154,7 +154,7 @@ class LearningConfiguration(BaseModel):
|
|
|
154
154
|
|
|
155
155
|
|
|
156
156
|
class ProxiedLearningConfigError(Exception):
|
|
157
|
-
def __init__(self, status_code: int, content:
|
|
157
|
+
def __init__(self, status_code: int, content: str | dict[str, Any]):
|
|
158
158
|
self.status_code = status_code
|
|
159
159
|
self.content = content
|
|
160
160
|
|
|
@@ -176,7 +176,7 @@ def raise_for_status(response: httpx.Response) -> None:
|
|
|
176
176
|
|
|
177
177
|
async def get_configuration(
|
|
178
178
|
kbid: str,
|
|
179
|
-
) ->
|
|
179
|
+
) -> LearningConfiguration | None:
|
|
180
180
|
return await learning_config_service().get_configuration(kbid)
|
|
181
181
|
|
|
182
182
|
|
|
@@ -205,7 +205,7 @@ async def learning_config_proxy(
|
|
|
205
205
|
method: str,
|
|
206
206
|
url: str,
|
|
207
207
|
headers: dict[str, str] = {},
|
|
208
|
-
) ->
|
|
208
|
+
) -> Response | StreamingResponse:
|
|
209
209
|
return await proxy(
|
|
210
210
|
service=LearningService.CONFIG,
|
|
211
211
|
request=request,
|
|
@@ -245,7 +245,7 @@ async def proxy(
|
|
|
245
245
|
method: str,
|
|
246
246
|
url: str,
|
|
247
247
|
headers: dict[str, str] = {},
|
|
248
|
-
) ->
|
|
248
|
+
) -> Response | StreamingResponse:
|
|
249
249
|
"""
|
|
250
250
|
Proxy the request to a learning API.
|
|
251
251
|
|
|
@@ -426,7 +426,7 @@ class DummyClient(httpx.AsyncClient):
|
|
|
426
426
|
|
|
427
427
|
class LearningConfigService(ABC):
|
|
428
428
|
@abstractmethod
|
|
429
|
-
async def get_configuration(self, kbid: str) ->
|
|
429
|
+
async def get_configuration(self, kbid: str) -> LearningConfiguration | None: ...
|
|
430
430
|
|
|
431
431
|
@abstractmethod
|
|
432
432
|
async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration: ...
|
|
@@ -439,7 +439,7 @@ class LearningConfigService(ABC):
|
|
|
439
439
|
|
|
440
440
|
|
|
441
441
|
class ProxiedLearningConfig(LearningConfigService):
|
|
442
|
-
async def get_configuration(self, kbid: str) ->
|
|
442
|
+
async def get_configuration(self, kbid: str) -> LearningConfiguration | None:
|
|
443
443
|
async with self._client() as client:
|
|
444
444
|
resp = await client.get(f"config/{kbid}")
|
|
445
445
|
try:
|
|
@@ -483,7 +483,7 @@ class InMemoryLearningConfig(LearningConfigService):
|
|
|
483
483
|
def __init__(self):
|
|
484
484
|
self.in_memory_configs = {}
|
|
485
485
|
|
|
486
|
-
async def get_configuration(self, kbid: str) ->
|
|
486
|
+
async def get_configuration(self, kbid: str) -> LearningConfiguration | None:
|
|
487
487
|
return _IN_MEMORY_CONFIGS.get(kbid, None)
|
|
488
488
|
|
|
489
489
|
async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
|
nucliadb/metrics_exporter.py
CHANGED
|
@@ -20,7 +20,8 @@
|
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
22
|
import asyncio
|
|
23
|
-
from
|
|
23
|
+
from collections.abc import AsyncGenerator, Callable
|
|
24
|
+
from typing import cast
|
|
24
25
|
|
|
25
26
|
from nucliadb import logger
|
|
26
27
|
from nucliadb.common import datamanagers
|
|
@@ -41,7 +42,7 @@ KBS_COUNT = metrics.Gauge("nucliadb_kbs_count")
|
|
|
41
42
|
RESOURCES_COUNT = metrics.Gauge("nucliadb_resources_count")
|
|
42
43
|
|
|
43
44
|
|
|
44
|
-
async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str
|
|
45
|
+
async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str]:
|
|
45
46
|
"""
|
|
46
47
|
Return a list of all KB ids.
|
|
47
48
|
"""
|
|
@@ -94,11 +95,11 @@ async def update_resource_metrics(context: ApplicationContext):
|
|
|
94
95
|
"WHERE labels @> '{/n/s/PENDING}' "
|
|
95
96
|
"AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
|
|
96
97
|
)
|
|
97
|
-
count = cast(
|
|
98
|
+
count = cast(tuple[int], await cur.fetchone())[0]
|
|
98
99
|
PENDING_RESOURCE_COUNT.set(count)
|
|
99
100
|
|
|
100
101
|
await cur.execute("SELECT COUNT(*) FROM catalog")
|
|
101
|
-
count = cast(
|
|
102
|
+
count = cast(tuple[int], await cur.fetchone())[0]
|
|
102
103
|
RESOURCES_COUNT.set(count)
|
|
103
104
|
|
|
104
105
|
|
nucliadb/middleware/__init__.py
CHANGED
|
@@ -17,7 +17,10 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
|
|
20
|
+
import logging
|
|
20
21
|
import time
|
|
22
|
+
from collections import deque
|
|
23
|
+
from typing import ClassVar
|
|
21
24
|
|
|
22
25
|
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
|
23
26
|
from starlette.requests import Request
|
|
@@ -27,6 +30,9 @@ PROCESS_TIME_HEADER = "X-PROCESS-TIME"
|
|
|
27
30
|
ACCESS_CONTROL_EXPOSE_HEADER = "Access-Control-Expose-Headers"
|
|
28
31
|
|
|
29
32
|
|
|
33
|
+
logger = logging.getLogger("nucliadb.middleware")
|
|
34
|
+
|
|
35
|
+
|
|
30
36
|
class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
|
|
31
37
|
def capture_process_time(self, response, duration: float):
|
|
32
38
|
response.headers[PROCESS_TIME_HEADER] = str(duration)
|
|
@@ -44,9 +50,84 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
|
|
|
44
50
|
start = time.perf_counter()
|
|
45
51
|
try:
|
|
46
52
|
response = await call_next(request)
|
|
53
|
+
return response
|
|
47
54
|
finally:
|
|
48
55
|
if response is not None:
|
|
49
56
|
duration = time.perf_counter() - start
|
|
50
57
|
self.capture_process_time(response, duration)
|
|
51
58
|
self.expose_process_time_header(response)
|
|
52
|
-
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ClientErrorPayloadLoggerMiddleware(BaseHTTPMiddleware):
    """
    Middleware that logs the payload of client error responses (HTTP 412 and 422).
    This helps supporting clients by providing more context about the errors they
    encounter which otherwise we don't have much visibility on.

    There is a limit of logs per IP to avoid flooding the logs in case of
    misbehaving clients.
    """

    # Shared by all instances: one sliding-window counter per client host.
    log_counters: ClassVar[dict[str, "HourlyLogCounter"]] = {}
    # Maximum number of payloads logged per host within the counter window.
    max_logs: int = 200

    def get_request_host(self, request: Request) -> str:
        """Return the client host of the request, or "unknown" if it has no client info."""
        return request.client.host if request.client else "unknown"

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """
        Forward the request and, for 412/422 responses, log the response payload.

        Responses with any other status code are returned untouched. Logged
        responses are rebuilt, because reading the body consumes the iterator.
        """
        response = await call_next(request)
        if response.status_code not in (412, 422):
            # Nothing to log: return early so that no per-host counter is
            # allocated for clients that never trigger these errors.
            return response

        host = self.get_request_host(request)
        counter = self.log_counters.get(host)
        if counter is None:
            counter = self.log_counters[host] = HourlyLogCounter()
        if counter.get_count() >= self.max_logs:
            # Per-host log budget exhausted for the current window
            return response
        counter.log_event()

        chunks: list[bytes] = [chunk async for chunk in response.body_iterator]  # type: ignore
        response_body = b"".join(chunks)

        logger.info(
            "Client payload validation error",
            extra={
                "request_method": request.method,
                "request_path": request.url.path,
                "response_status_code": response.status_code,
                "response_payload": response_body.decode("utf-8", errors="replace"),
            },
        )
        # Recreate the response since its body iterator has been consumed
        return Response(
            content=response_body,
            status_code=response.status_code,
            headers=dict(response.headers),
            media_type=response.media_type,
            background=response.background,
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class EventCounter:
    """
    Sliding-window event counter.

    Event timestamps are kept in a deque, oldest first. Events older than
    `window_seconds` are discarded lazily, every time the counter is updated
    or read.
    """

    def __init__(self, window_seconds: int = 3600):
        self.window_seconds = window_seconds
        # Monotonic timestamps of the events still inside the window
        self.events: deque[float] = deque()

    def _prune(self, now: float) -> None:
        """Drop the events that fell out of the window ending at `now`."""
        cutoff = now - self.window_seconds
        events = self.events
        while events and events[0] < cutoff:
            events.popleft()

    def log_event(self):
        """Record a new event happening now."""
        # Monotonic clock: the window must not be affected by wall-clock
        # adjustments (NTP corrections, manual changes...)
        now = time.monotonic()
        self._prune(now)
        self.events.append(now)

    def get_count(self) -> int:
        """Return the number of events recorded within the last `window_seconds`."""
        self._prune(time.monotonic())
        return len(self.events)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class HourlyLogCounter(EventCounter):
    """Event counter whose window is fixed to one hour (3600 seconds)."""

    def __init__(self):
        one_hour_in_seconds = 3600
        super().__init__(window_seconds=one_hour_in_seconds)
|
nucliadb/migrator/datamanager.py
CHANGED
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Optional, Union
|
|
21
20
|
|
|
22
21
|
from nucliadb.common import datamanagers
|
|
23
22
|
from nucliadb.common.maindb.driver import Driver
|
|
@@ -69,7 +68,7 @@ class MigrationsDataManager:
|
|
|
69
68
|
await txn.delete(MIGRATIONS_KEY.format(kbid=kbid))
|
|
70
69
|
await txn.commit()
|
|
71
70
|
|
|
72
|
-
async def get_kb_info(self, kbid: str) ->
|
|
71
|
+
async def get_kb_info(self, kbid: str) -> KnowledgeBoxInfo | None:
|
|
73
72
|
async with self.driver.ro_transaction() as txn:
|
|
74
73
|
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
|
75
74
|
if kb_config is None:
|
|
@@ -91,8 +90,8 @@ class MigrationsDataManager:
|
|
|
91
90
|
async def update_global_info(
|
|
92
91
|
self,
|
|
93
92
|
*,
|
|
94
|
-
current_version:
|
|
95
|
-
target_version:
|
|
93
|
+
current_version: int | _Unset = _UNSET,
|
|
94
|
+
target_version: int | None | _Unset = _UNSET,
|
|
96
95
|
) -> None:
|
|
97
96
|
async with self.driver.rw_transaction() as txn:
|
|
98
97
|
raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
|
nucliadb/migrator/migrator.py
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import asyncio
|
|
21
21
|
import logging
|
|
22
|
-
from typing import Optional
|
|
23
22
|
|
|
24
23
|
from nucliadb.common import locking
|
|
25
24
|
from nucliadb.common.cluster.rollover import rollover_kb_index
|
|
@@ -233,7 +232,7 @@ async def run_pg_schema_migrations(driver: PGDriver):
|
|
|
233
232
|
await tx.commit()
|
|
234
233
|
|
|
235
234
|
|
|
236
|
-
async def run(context: ExecutionContext, target_version:
|
|
235
|
+
async def run(context: ExecutionContext, target_version: int | None = None) -> None:
|
|
237
236
|
# Run schema migrations first, since they create the `resources` table needed for the lock below
|
|
238
237
|
# Schema migrations use their own locking system
|
|
239
238
|
if isinstance(context.kv_driver, PGDriver):
|
nucliadb/migrator/models.py
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
import types
|
|
21
21
|
from dataclasses import dataclass
|
|
22
|
-
from typing import Optional
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
@dataclass
|
|
@@ -36,4 +35,4 @@ class KnowledgeBoxInfo:
|
|
|
36
35
|
@dataclass
|
|
37
36
|
class GlobalInfo:
|
|
38
37
|
current_version: int
|
|
39
|
-
target_version:
|
|
38
|
+
target_version: int | None = None
|
nucliadb/migrator/settings.py
CHANGED
|
@@ -17,14 +17,13 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Optional
|
|
21
20
|
|
|
22
21
|
import pydantic
|
|
23
22
|
import pydantic_settings
|
|
24
23
|
|
|
25
24
|
|
|
26
25
|
class Settings(pydantic_settings.BaseSettings):
|
|
27
|
-
redis_url:
|
|
26
|
+
redis_url: str | None = None
|
|
28
27
|
max_concurrent_migrations: int = pydantic.Field(
|
|
29
28
|
default=5,
|
|
30
29
|
description="Maximum number of concurrent KB migrations allowed.",
|