nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -18,37 +18,53 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
from datetime import datetime
|
21
|
-
from
|
21
|
+
from functools import partial
|
22
|
+
from typing import Any, AsyncGenerator, Callable, Coroutine, Optional, Sequence
|
22
23
|
from uuid import uuid4
|
23
24
|
|
24
25
|
from grpc import StatusCode
|
25
26
|
from grpc.aio import AioRpcError
|
26
|
-
from nucliadb_protos.knowledgebox_pb2 import (
|
27
|
-
KnowledgeBoxConfig,
|
28
|
-
Labels,
|
29
|
-
LabelSet,
|
30
|
-
SemanticModelMetadata,
|
31
|
-
)
|
32
|
-
from nucliadb_protos.resources_pb2 import Basic
|
33
|
-
from nucliadb_protos.utils_pb2 import ReleaseChannel
|
34
27
|
|
35
28
|
from nucliadb.common import datamanagers
|
36
29
|
from nucliadb.common.cluster.exceptions import ShardNotFound
|
37
30
|
from nucliadb.common.cluster.manager import get_index_node
|
38
31
|
from nucliadb.common.cluster.utils import get_shard_manager
|
39
|
-
|
40
|
-
|
41
|
-
from nucliadb.
|
42
|
-
from nucliadb.ingest.orm.resource import (
|
32
|
+
|
33
|
+
# XXX: this keys shouldn't be exposed outside datamanagers
|
34
|
+
from nucliadb.common.datamanagers.resources import (
|
43
35
|
KB_RESOURCE_SLUG,
|
44
36
|
KB_RESOURCE_SLUG_BASE,
|
45
|
-
Resource,
|
46
37
|
)
|
38
|
+
from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
|
39
|
+
from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
|
40
|
+
from nucliadb.common.maindb.driver import Driver, Transaction
|
41
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
42
|
+
from nucliadb.common.nidx import get_nidx_api_client
|
43
|
+
from nucliadb.ingest import SERVICE_NAME, logger
|
44
|
+
from nucliadb.ingest.orm.exceptions import (
|
45
|
+
KnowledgeBoxConflict,
|
46
|
+
KnowledgeBoxCreationError,
|
47
|
+
VectorSetConflict,
|
48
|
+
)
|
49
|
+
from nucliadb.ingest.orm.metrics import processor_observer
|
50
|
+
from nucliadb.ingest.orm.resource import Resource
|
47
51
|
from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
|
48
52
|
from nucliadb.migrator.utils import get_latest_version
|
49
|
-
from nucliadb_protos import writer_pb2
|
53
|
+
from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
|
54
|
+
from nucliadb_protos.knowledgebox_pb2 import (
|
55
|
+
CreateExternalIndexProviderMetadata,
|
56
|
+
ExternalIndexProviderType,
|
57
|
+
KnowledgeBoxConfig,
|
58
|
+
SemanticModelMetadata,
|
59
|
+
StoredExternalIndexProviderMetadata,
|
60
|
+
)
|
61
|
+
from nucliadb_protos.resources_pb2 import Basic
|
62
|
+
from nucliadb_utils.settings import is_onprem_nucliadb
|
50
63
|
from nucliadb_utils.storages.storage import Storage
|
51
|
-
from nucliadb_utils.utilities import
|
64
|
+
from nucliadb_utils.utilities import (
|
65
|
+
get_audit,
|
66
|
+
get_storage,
|
67
|
+
)
|
52
68
|
|
53
69
|
# XXX Eventually all these keys should be moved to datamanagers.kb
|
54
70
|
KB_RESOURCE = "/kbs/{kbid}/r/{uuid}"
|
@@ -58,9 +74,15 @@ KB_KEYS = "/kbs/{kbid}/"
|
|
58
74
|
KB_TO_DELETE_BASE = "/kbtodelete/"
|
59
75
|
KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
|
60
76
|
|
77
|
+
RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
|
78
|
+
RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
|
79
|
+
|
61
80
|
KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
|
62
81
|
KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
|
63
82
|
|
83
|
+
KB_VECTORSET_TO_DELETE_BASE = "/vectorsettodelete"
|
84
|
+
KB_VECTORSET_TO_DELETE = f"{KB_VECTORSET_TO_DELETE_BASE}/{{kbid}}/{{vectorset}}"
|
85
|
+
|
64
86
|
|
65
87
|
class KnowledgeBox:
|
66
88
|
def __init__(self, txn: Transaction, storage: Storage, kbid: str):
|
@@ -69,119 +91,153 @@ class KnowledgeBox:
|
|
69
91
|
self.kbid = kbid
|
70
92
|
self._config: Optional[KnowledgeBoxConfig] = None
|
71
93
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
config = await datamanagers.kb.get_config(txn, kbid=self.kbid)
|
76
|
-
if config is not None:
|
77
|
-
self._config = config
|
78
|
-
return config
|
79
|
-
else:
|
80
|
-
return None
|
81
|
-
else:
|
82
|
-
return self._config
|
83
|
-
|
84
|
-
@classmethod
|
85
|
-
async def delete_kb(cls, txn: Transaction, kbid: str):
|
86
|
-
# Mark storage to be deleted
|
87
|
-
# Mark keys to be deleted
|
88
|
-
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
89
|
-
if kb_config is None:
|
90
|
-
# consider KB as deleted
|
91
|
-
return
|
92
|
-
slug = kb_config.slug
|
93
|
-
|
94
|
-
# Delete main anchor
|
95
|
-
async with txn.driver.transaction() as subtxn:
|
96
|
-
key_match = datamanagers.kb.KB_SLUGS.format(slug=slug)
|
97
|
-
await subtxn.delete(key_match)
|
98
|
-
|
99
|
-
when = datetime.now().isoformat()
|
100
|
-
await subtxn.set(KB_TO_DELETE.format(kbid=kbid), when.encode())
|
101
|
-
await subtxn.commit()
|
102
|
-
|
103
|
-
audit_util = get_audit()
|
104
|
-
if audit_util is not None:
|
105
|
-
await audit_util.delete_kb(kbid)
|
106
|
-
return kbid
|
94
|
+
@staticmethod
|
95
|
+
def new_unique_kbid() -> str:
|
96
|
+
return str(uuid4())
|
107
97
|
|
108
98
|
@classmethod
|
99
|
+
@processor_observer.wrap({"type": "create_kb"})
|
109
100
|
async def create(
|
110
101
|
cls,
|
111
|
-
|
102
|
+
driver: Driver,
|
103
|
+
*,
|
104
|
+
kbid: str,
|
112
105
|
slug: str,
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
if
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
datamanagers.kb.KB_SLUGS.format(slug=slug),
|
130
|
-
uuid.encode(),
|
131
|
-
)
|
132
|
-
if config is None:
|
133
|
-
config = KnowledgeBoxConfig()
|
134
|
-
|
135
|
-
config.migration_version = get_latest_version()
|
136
|
-
config.slug = slug
|
137
|
-
await txn.set(
|
138
|
-
datamanagers.kb.KB_UUID.format(kbid=uuid),
|
139
|
-
config.SerializeToString(),
|
140
|
-
)
|
141
|
-
# Create Storage
|
142
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
143
|
-
|
144
|
-
created = await storage.create_kb(uuid)
|
145
|
-
if created is False:
|
146
|
-
logger.error(f"{uuid} KB could not be created")
|
147
|
-
failed = True
|
148
|
-
|
149
|
-
if failed is False:
|
150
|
-
kb_shards = writer_pb2.Shards()
|
151
|
-
kb_shards.kbid = uuid
|
152
|
-
# B/c with Shards.actual
|
153
|
-
kb_shards.actual = -1
|
154
|
-
# B/c with `Shards.similarity`, replaced by `model`
|
155
|
-
kb_shards.similarity = semantic_model.similarity_function
|
156
|
-
|
157
|
-
# if this KB uses a matryoshka model, we can choose a different
|
158
|
-
# dimension
|
159
|
-
if len(semantic_model.matryoshka_dimensions) > 0:
|
160
|
-
semantic_model.vector_dimension = choose_matryoshka_dimension(
|
161
|
-
semantic_model.matryoshka_dimensions # type: ignore
|
162
|
-
)
|
163
|
-
kb_shards.model.CopyFrom(semantic_model)
|
164
|
-
|
165
|
-
kb_shards.release_channel = release_channel
|
166
|
-
|
167
|
-
await datamanagers.cluster.update_kb_shards(
|
168
|
-
txn, kbid=uuid, shards=kb_shards
|
106
|
+
title: str = "",
|
107
|
+
description: str = "",
|
108
|
+
semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
|
109
|
+
external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
|
110
|
+
hidden_resources_enabled: bool = False,
|
111
|
+
hidden_resources_hide_on_creation: bool = False,
|
112
|
+
) -> tuple[str, str]:
|
113
|
+
"""Creates a new knowledge box and return its id and slug."""
|
114
|
+
|
115
|
+
if not kbid:
|
116
|
+
raise KnowledgeBoxCreationError("A kbid must be provided to create a new KB")
|
117
|
+
if not slug:
|
118
|
+
raise KnowledgeBoxCreationError("A slug must be provided to create a new KB")
|
119
|
+
if hidden_resources_hide_on_creation and not hidden_resources_enabled:
|
120
|
+
raise KnowledgeBoxCreationError(
|
121
|
+
"Cannot hide new resources if the hidden resources feature is disabled"
|
169
122
|
)
|
123
|
+
if semantic_models is None or len(semantic_models) == 0:
|
124
|
+
raise KnowledgeBoxCreationError("KB must define at least one semantic model")
|
170
125
|
|
171
|
-
|
172
|
-
# uses this variable anymore
|
173
|
-
del kb_shards
|
174
|
-
shard_manager = get_shard_manager()
|
175
|
-
try:
|
176
|
-
await shard_manager.create_shard_by_kbid(txn, uuid)
|
177
|
-
except Exception as e:
|
178
|
-
await storage.delete_kb(uuid)
|
179
|
-
raise e
|
126
|
+
rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
|
180
127
|
|
181
|
-
|
182
|
-
|
128
|
+
try:
|
129
|
+
async with driver.transaction() as txn:
|
130
|
+
exists = await datamanagers.kb.get_kb_uuid(
|
131
|
+
txn, slug=slug
|
132
|
+
) or await datamanagers.kb.exists_kb(txn, kbid=kbid)
|
133
|
+
if exists:
|
134
|
+
raise KnowledgeBoxConflict()
|
135
|
+
|
136
|
+
# Create in maindb
|
137
|
+
await datamanagers.kb.set_kbid_for_slug(txn, slug=slug, kbid=kbid)
|
138
|
+
|
139
|
+
# all KBs have the vectorset key initialized, although (for
|
140
|
+
# now), not every KB will store vectorsets there
|
141
|
+
await datamanagers.vectorsets.initialize(txn, kbid=kbid)
|
142
|
+
|
143
|
+
kb_shards = writer_pb2.Shards()
|
144
|
+
kb_shards.kbid = kbid
|
145
|
+
# B/c with Shards.actual
|
146
|
+
kb_shards.actual = -1
|
147
|
+
|
148
|
+
vs_external_indexes = []
|
149
|
+
for vectorset_id, semantic_model in semantic_models.items(): # type: ignore
|
150
|
+
# if this KB uses a matryoshka model, we can choose a different
|
151
|
+
# dimension
|
152
|
+
if len(semantic_model.matryoshka_dimensions) > 0:
|
153
|
+
dimension = choose_matryoshka_dimension(semantic_model.matryoshka_dimensions)
|
154
|
+
else:
|
155
|
+
dimension = semantic_model.vector_dimension
|
156
|
+
|
157
|
+
vs_external_indexes.append(
|
158
|
+
VectorsetExternalIndex(
|
159
|
+
vectorset_id=vectorset_id,
|
160
|
+
dimension=dimension,
|
161
|
+
similarity=semantic_model.similarity_function,
|
162
|
+
)
|
163
|
+
)
|
164
|
+
|
165
|
+
vectorset_config = knowledgebox_pb2.VectorSetConfig(
|
166
|
+
vectorset_id=vectorset_id,
|
167
|
+
vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
|
168
|
+
similarity=semantic_model.similarity_function,
|
169
|
+
# XXX: hardcoded value
|
170
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32,
|
171
|
+
normalize_vectors=len(semantic_model.matryoshka_dimensions) > 0,
|
172
|
+
vector_dimension=dimension,
|
173
|
+
),
|
174
|
+
matryoshka_dimensions=semantic_model.matryoshka_dimensions,
|
175
|
+
)
|
176
|
+
await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset_config)
|
177
|
+
|
178
|
+
stored_external_index_provider = await cls._maybe_create_external_indexes(
|
179
|
+
kbid, request=external_index_provider, indexes=vs_external_indexes
|
180
|
+
)
|
181
|
+
rollback_ops.append(
|
182
|
+
partial(
|
183
|
+
cls._maybe_delete_external_indexes,
|
184
|
+
kbid,
|
185
|
+
stored_external_index_provider,
|
186
|
+
)
|
187
|
+
)
|
183
188
|
|
184
|
-
|
189
|
+
config = KnowledgeBoxConfig(
|
190
|
+
title=title,
|
191
|
+
description=description,
|
192
|
+
slug=slug,
|
193
|
+
migration_version=get_latest_version(),
|
194
|
+
hidden_resources_enabled=hidden_resources_enabled,
|
195
|
+
hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
|
196
|
+
)
|
197
|
+
config.external_index_provider.CopyFrom(stored_external_index_provider)
|
198
|
+
await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
|
199
|
+
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
|
200
|
+
|
201
|
+
# shard creation will alter this value on maindb, make sure nobody
|
202
|
+
# uses this variable anymore
|
203
|
+
del kb_shards
|
204
|
+
|
205
|
+
# Create in storage
|
206
|
+
|
207
|
+
storage = await get_storage(service_name=SERVICE_NAME)
|
208
|
+
|
209
|
+
created = await storage.create_kb(kbid)
|
210
|
+
if not created:
|
211
|
+
logger.error(f"KB {kbid} could not be created")
|
212
|
+
raise KnowledgeBoxCreationError(
|
213
|
+
f"KB blob storage could not be created (slug={slug})"
|
214
|
+
)
|
215
|
+
rollback_ops.append(partial(storage.delete_kb, kbid))
|
216
|
+
|
217
|
+
# Create shards in index nodes
|
218
|
+
|
219
|
+
shard_manager = get_shard_manager()
|
220
|
+
# XXX creating a shard is a slow IO operation that requires a write
|
221
|
+
# txn to be open!
|
222
|
+
await shard_manager.create_shard_by_kbid(txn, kbid)
|
223
|
+
# shards don't need a rollback as they will be eventually purged
|
224
|
+
|
225
|
+
await txn.commit()
|
226
|
+
|
227
|
+
except Exception as exc:
|
228
|
+
# rollback all changes on the db and raise the exception
|
229
|
+
for op in reversed(rollback_ops):
|
230
|
+
try:
|
231
|
+
await op()
|
232
|
+
except Exception:
|
233
|
+
if isinstance(op, partial):
|
234
|
+
name: str = op.func.__name__
|
235
|
+
else:
|
236
|
+
getattr(op, "__name__", "unknown?")
|
237
|
+
logger.exception(f"Unexpected error rolling back {name}. Keep rolling back")
|
238
|
+
raise exc
|
239
|
+
|
240
|
+
return (kbid, slug)
|
185
241
|
|
186
242
|
@classmethod
|
187
243
|
async def update(
|
@@ -191,7 +247,7 @@ class KnowledgeBox:
|
|
191
247
|
slug: Optional[str] = None,
|
192
248
|
config: Optional[KnowledgeBoxConfig] = None,
|
193
249
|
) -> str:
|
194
|
-
exist = await datamanagers.kb.get_config(txn, kbid=uuid)
|
250
|
+
exist = await datamanagers.kb.get_config(txn, kbid=uuid, for_update=True)
|
195
251
|
if not exist:
|
196
252
|
raise datamanagers.exceptions.KnowledgeBoxNotFound()
|
197
253
|
|
@@ -208,35 +264,61 @@ class KnowledgeBox:
|
|
208
264
|
|
209
265
|
if config and exist != config:
|
210
266
|
exist.MergeFrom(config)
|
267
|
+
exist.hidden_resources_enabled = config.hidden_resources_enabled
|
268
|
+
exist.hidden_resources_hide_on_creation = config.hidden_resources_hide_on_creation
|
269
|
+
|
270
|
+
if exist.hidden_resources_hide_on_creation and not exist.hidden_resources_enabled:
|
271
|
+
raise KnowledgeBoxCreationError(
|
272
|
+
"Cannot hide new resources if the hidden resources feature is disabled"
|
273
|
+
)
|
211
274
|
|
212
275
|
await datamanagers.kb.set_config(txn, kbid=uuid, config=exist)
|
213
276
|
|
214
277
|
return uuid
|
215
278
|
|
216
|
-
|
217
|
-
async def
|
218
|
-
|
219
|
-
|
220
|
-
|
279
|
+
@classmethod
|
280
|
+
async def delete(cls, driver: Driver, kbid: str):
|
281
|
+
async with driver.transaction() as txn:
|
282
|
+
exists = await datamanagers.kb.exists_kb(txn, kbid=kbid)
|
283
|
+
if not exists:
|
284
|
+
return
|
221
285
|
|
222
|
-
|
223
|
-
|
286
|
+
# Delete main anchor
|
287
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
288
|
+
if kb_config is not None:
|
289
|
+
slug = kb_config.slug
|
290
|
+
await datamanagers.kb.delete_kb_slug(txn, slug=slug)
|
224
291
|
|
225
|
-
|
226
|
-
self, labelset: str, labelset_response: writer_pb2.GetLabelSetResponse
|
227
|
-
):
|
228
|
-
ls = await datamanagers.labels.get_labelset(
|
229
|
-
self.txn,
|
230
|
-
kbid=self.kbid,
|
231
|
-
labelset_id=labelset,
|
232
|
-
)
|
233
|
-
if ls is not None:
|
234
|
-
labelset_response.labelset.CopyFrom(ls)
|
292
|
+
await datamanagers.kb.delete_config(txn, kbid=kbid)
|
235
293
|
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
294
|
+
# Mark KB to purge. This will eventually delete all KB keys, storage
|
295
|
+
# and index data (for the old index nodes)
|
296
|
+
when = datetime.now().isoformat()
|
297
|
+
await txn.set(KB_TO_DELETE.format(kbid=kbid), when.encode())
|
298
|
+
|
299
|
+
shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
300
|
+
|
301
|
+
await txn.commit()
|
302
|
+
|
303
|
+
if shards_obj is None:
|
304
|
+
logger.warning(f"Shards not found for KB while deleting it", extra={"kbid": kbid})
|
305
|
+
else:
|
306
|
+
nidx_api = get_nidx_api_client()
|
307
|
+
# Delete shards from nidx. They'll be marked for eventual deletion,
|
308
|
+
# so this call shouldn't be costly
|
309
|
+
if nidx_api is not None:
|
310
|
+
for shard in shards_obj.shards:
|
311
|
+
if shard.nidx_shard_id:
|
312
|
+
await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
|
313
|
+
|
314
|
+
if kb_config is not None:
|
315
|
+
await cls._maybe_delete_external_indexes(kbid, kb_config.external_index_provider)
|
316
|
+
|
317
|
+
audit = get_audit()
|
318
|
+
if audit is not None:
|
319
|
+
audit.delete_kb(kbid=kbid)
|
320
|
+
|
321
|
+
return kbid
|
240
322
|
|
241
323
|
@classmethod
|
242
324
|
async def purge(cls, driver: Driver, kbid: str):
|
@@ -247,6 +329,8 @@ class KnowledgeBox:
|
|
247
329
|
need to delete the kb shards and also deletes the related storage
|
248
330
|
buckets.
|
249
331
|
|
332
|
+
Removes all catalog entries related to the kb.
|
333
|
+
|
250
334
|
As non-empty buckets cannot be deleted, they are scheduled to be
|
251
335
|
deleted instead. Actually, this empties the bucket asynchronouysly
|
252
336
|
but it doesn't delete it. To do it, we save a marker using the
|
@@ -262,16 +346,13 @@ class KnowledgeBox:
|
|
262
346
|
storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
|
263
347
|
await txn.set(storage_to_delete, b"")
|
264
348
|
|
265
|
-
|
266
|
-
shards_match = datamanagers.cluster.KB_SHARDS.format(kbid=kbid)
|
267
|
-
payload = await txn.get(shards_match)
|
349
|
+
await catalog_delete_kb(txn, kbid)
|
268
350
|
|
269
|
-
|
270
|
-
|
351
|
+
# Delete KB Shards
|
352
|
+
shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
353
|
+
if shards_obj is None:
|
354
|
+
logger.warning(f"Shards not found for KB while purging it", extra={"kbid": kbid})
|
271
355
|
else:
|
272
|
-
shards_obj = writer_pb2.Shards()
|
273
|
-
shards_obj.ParseFromString(payload) # type: ignore
|
274
|
-
|
275
356
|
for shard in shards_obj.shards:
|
276
357
|
# Delete the shard on nodes
|
277
358
|
for replica in shard.replicas:
|
@@ -297,29 +378,14 @@ class KnowledgeBox:
|
|
297
378
|
await cls.delete_all_kb_keys(driver, kbid)
|
298
379
|
|
299
380
|
@classmethod
|
300
|
-
async def delete_all_kb_keys(
|
301
|
-
cls, driver: Driver, kbid: str, chunk_size: int = 1_000
|
302
|
-
):
|
381
|
+
async def delete_all_kb_keys(cls, driver: Driver, kbid: str, chunk_size: int = 1_000):
|
303
382
|
prefix = KB_KEYS.format(kbid=kbid)
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
# We commit deletions in chunks because otherwise
|
312
|
-
# tikv complains if there is too much data to commit
|
313
|
-
for chunk_of_keys in chunker(all_keys, chunk_size):
|
314
|
-
async with driver.transaction() as txn:
|
315
|
-
for key in chunk_of_keys:
|
316
|
-
await txn.delete(key)
|
317
|
-
await txn.commit()
|
318
|
-
|
319
|
-
async def get_resource_shard(
|
320
|
-
self, shard_id: str
|
321
|
-
) -> Optional[writer_pb2.ShardObject]:
|
322
|
-
async with datamanagers.with_transaction() as txn:
|
383
|
+
async with driver.transaction() as txn:
|
384
|
+
await txn.delete_by_prefix(prefix)
|
385
|
+
await txn.commit()
|
386
|
+
|
387
|
+
async def get_resource_shard(self, shard_id: str) -> Optional[writer_pb2.ShardObject]:
|
388
|
+
async with datamanagers.with_ro_transaction() as txn:
|
323
389
|
pb = await datamanagers.cluster.get_kb_shards(txn, kbid=self.kbid)
|
324
390
|
if pb is None:
|
325
391
|
logger.warning("Shards not found for kbid", extra={"kbid": self.kbid})
|
@@ -330,9 +396,7 @@ class KnowledgeBox:
|
|
330
396
|
return None
|
331
397
|
|
332
398
|
async def get(self, uuid: str) -> Optional[Resource]:
|
333
|
-
basic = await datamanagers.resources.get_basic(
|
334
|
-
self.txn, kbid=self.kbid, rid=uuid
|
335
|
-
)
|
399
|
+
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
|
336
400
|
if basic is None:
|
337
401
|
return None
|
338
402
|
return Resource(
|
@@ -344,24 +408,32 @@ class KnowledgeBox:
|
|
344
408
|
disable_vectors=False,
|
345
409
|
)
|
346
410
|
|
347
|
-
async def
|
348
|
-
basic = await datamanagers.resources.get_basic(
|
349
|
-
|
350
|
-
)
|
351
|
-
|
352
|
-
async for key in self.txn.keys(
|
353
|
-
KB_RESOURCE.format(kbid=self.kbid, uuid=uuid), count=-1
|
354
|
-
):
|
355
|
-
await self.txn.delete(key)
|
356
|
-
|
411
|
+
async def maindb_delete_resource(self, uuid: str):
|
412
|
+
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
|
413
|
+
await self.txn.delete_by_prefix(KB_RESOURCE.format(kbid=self.kbid, uuid=uuid))
|
357
414
|
if basic and basic.slug:
|
358
|
-
slug_key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug)
|
359
415
|
try:
|
360
|
-
await self.txn.delete(
|
416
|
+
await self.txn.delete(KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug))
|
361
417
|
except Exception:
|
362
|
-
|
418
|
+
logger.exception("Error deleting slug")
|
363
419
|
|
364
|
-
|
420
|
+
async def storage_delete_resource(self, uuid: str):
|
421
|
+
if is_onprem_nucliadb():
|
422
|
+
await self.storage.delete_resource(self.kbid, uuid)
|
423
|
+
else:
|
424
|
+
# Deleting from storage can be slow, so we schedule its deletion and the purge cronjob
|
425
|
+
# will take care of it
|
426
|
+
await self.schedule_delete_resource(self.kbid, uuid)
|
427
|
+
|
428
|
+
async def schedule_delete_resource(self, kbid: str, uuid: str):
|
429
|
+
key = RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid)
|
430
|
+
await self.txn.set(key, b"")
|
431
|
+
|
432
|
+
async def delete_resource(self, uuid: str):
|
433
|
+
with processor_observer({"type": "delete_resource_maindb"}):
|
434
|
+
await self.maindb_delete_resource(uuid)
|
435
|
+
with processor_observer({"type": "delete_resource_storage"}):
|
436
|
+
await self.storage_delete_resource(uuid)
|
365
437
|
|
366
438
|
async def get_resource_uuid_by_slug(self, slug: str) -> Optional[str]:
|
367
439
|
return await datamanagers.resources.get_resource_uuid_from_slug(
|
@@ -372,7 +444,7 @@ class KnowledgeBox:
|
|
372
444
|
key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
|
373
445
|
key_ok = False
|
374
446
|
while key_ok is False:
|
375
|
-
found = await self.txn.get(key)
|
447
|
+
found = await self.txn.get(key, for_update=False)
|
376
448
|
if found and found.decode() != uuid:
|
377
449
|
slug += ".c"
|
378
450
|
key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
|
@@ -380,9 +452,7 @@ class KnowledgeBox:
|
|
380
452
|
key_ok = True
|
381
453
|
return slug
|
382
454
|
|
383
|
-
async def add_resource(
|
384
|
-
self, uuid: str, slug: str, basic: Optional[Basic] = None
|
385
|
-
) -> Resource:
|
455
|
+
async def add_resource(self, uuid: str, slug: str, basic: Optional[Basic] = None) -> Resource:
|
386
456
|
if basic is None:
|
387
457
|
basic = Basic()
|
388
458
|
if slug == "":
|
@@ -390,9 +460,7 @@ class KnowledgeBox:
|
|
390
460
|
slug = await self.get_unique_slug(uuid, slug)
|
391
461
|
basic.slug = slug
|
392
462
|
fix_paragraph_annotation_keys(uuid, basic)
|
393
|
-
await datamanagers.resources.set_basic(
|
394
|
-
self.txn, kbid=self.kbid, rid=uuid, basic=basic
|
395
|
-
)
|
463
|
+
await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=uuid, basic=basic)
|
396
464
|
return Resource(
|
397
465
|
storage=self.storage,
|
398
466
|
txn=self.txn,
|
@@ -404,7 +472,7 @@ class KnowledgeBox:
|
|
404
472
|
|
405
473
|
async def iterate_resources(self) -> AsyncGenerator[Resource, None]:
|
406
474
|
base = KB_RESOURCE_SLUG_BASE.format(kbid=self.kbid)
|
407
|
-
async for key in self.txn.keys(match=base
|
475
|
+
async for key in self.txn.keys(match=base):
|
408
476
|
slug = key.split("/")[-1]
|
409
477
|
uuid = await self.get_resource_uuid_by_slug(slug)
|
410
478
|
if uuid is not None:
|
@@ -416,6 +484,55 @@ class KnowledgeBox:
|
|
416
484
|
disable_vectors=False,
|
417
485
|
)
|
418
486
|
|
487
|
+
async def create_vectorset(self, config: knowledgebox_pb2.VectorSetConfig):
|
488
|
+
if await datamanagers.vectorsets.exists(
|
489
|
+
self.txn, kbid=self.kbid, vectorset_id=config.vectorset_id
|
490
|
+
):
|
491
|
+
raise VectorSetConflict(f"Vectorset {config.vectorset_id} already exists")
|
492
|
+
await datamanagers.vectorsets.set(self.txn, kbid=self.kbid, config=config)
|
493
|
+
|
494
|
+
# Remove the async deletion mark if it exists, just in case there was a previous deletion
|
495
|
+
deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=config.vectorset_id)
|
496
|
+
deletion_mark = await self.txn.get(deletion_mark_key, for_update=True)
|
497
|
+
if deletion_mark is not None:
|
498
|
+
await self.txn.delete(deletion_mark_key)
|
499
|
+
|
500
|
+
shard_manager = get_shard_manager()
|
501
|
+
await shard_manager.create_vectorset(self.kbid, config)
|
502
|
+
|
503
|
+
async def delete_vectorset(self, vectorset_id: str):
|
504
|
+
await datamanagers.vectorsets.delete(self.txn, kbid=self.kbid, vectorset_id=vectorset_id)
|
505
|
+
|
506
|
+
# mark vectorset for async deletion
|
507
|
+
deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
|
508
|
+
await self.txn.set(deletion_mark_key, b"")
|
509
|
+
|
510
|
+
shard_manager = get_shard_manager()
|
511
|
+
await shard_manager.delete_vectorset(self.kbid, vectorset_id)
|
512
|
+
|
513
|
+
@classmethod
|
514
|
+
async def _maybe_create_external_indexes(
|
515
|
+
cls,
|
516
|
+
kbid: str,
|
517
|
+
request: CreateExternalIndexProviderMetadata,
|
518
|
+
indexes: list[VectorsetExternalIndex],
|
519
|
+
) -> StoredExternalIndexProviderMetadata:
|
520
|
+
if request.type != ExternalIndexProviderType.PINECONE:
|
521
|
+
return StoredExternalIndexProviderMetadata(type=request.type)
|
522
|
+
# Only pinecone is supported for now
|
523
|
+
return await PineconeIndexManager.create_indexes(kbid, request, indexes)
|
524
|
+
|
525
|
+
@classmethod
|
526
|
+
async def _maybe_delete_external_indexes(
|
527
|
+
cls,
|
528
|
+
kbid: str,
|
529
|
+
stored: StoredExternalIndexProviderMetadata,
|
530
|
+
) -> None:
|
531
|
+
if stored.type != ExternalIndexProviderType.PINECONE:
|
532
|
+
return
|
533
|
+
# Only pinecone is supported for now
|
534
|
+
await PineconeIndexManager.delete_indexes(kbid, stored)
|
535
|
+
|
419
536
|
|
420
537
|
def chunker(seq: Sequence, size: int):
|
421
538
|
return (seq[pos : pos + size] for pos in range(0, len(seq), size))
|
@@ -429,3 +546,11 @@ def fix_paragraph_annotation_keys(uuid: str, basic: Basic) -> None:
|
|
429
546
|
for paragraph_annotation in ufm.paragraphs:
|
430
547
|
key = compute_paragraph_key(uuid, paragraph_annotation.key)
|
431
548
|
paragraph_annotation.key = key
|
549
|
+
|
550
|
+
|
551
|
+
@processor_observer.wrap({"type": "catalog_delete_kb"})
|
552
|
+
async def catalog_delete_kb(txn: Transaction, kbid: str):
|
553
|
+
if not isinstance(txn, PGTransaction):
|
554
|
+
return
|
555
|
+
async with txn.connection.cursor() as cur:
|
556
|
+
await cur.execute("DELETE FROM catalog where kbid = %(kbid)s", {"kbid": kbid})
|