nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,55 +18,71 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
from datetime import datetime
|
21
|
-
from
|
21
|
+
from functools import partial
|
22
|
+
from typing import Any, AsyncGenerator, Callable, Coroutine, Optional, Sequence
|
22
23
|
from uuid import uuid4
|
23
24
|
|
24
25
|
from grpc import StatusCode
|
25
26
|
from grpc.aio import AioRpcError
|
26
|
-
from nucliadb_protos.knowledgebox_pb2 import (
|
27
|
-
KnowledgeBoxConfig,
|
28
|
-
Labels,
|
29
|
-
LabelSet,
|
30
|
-
SemanticModelMetadata,
|
31
|
-
)
|
32
|
-
from nucliadb_protos.knowledgebox_pb2 import Synonyms as PBSynonyms
|
33
|
-
from nucliadb_protos.knowledgebox_pb2 import VectorSet, VectorSets
|
34
|
-
from nucliadb_protos.resources_pb2 import Basic
|
35
|
-
from nucliadb_protos.utils_pb2 import ReleaseChannel
|
36
27
|
|
37
28
|
from nucliadb.common import datamanagers
|
38
|
-
from nucliadb.common.cluster.
|
39
|
-
from nucliadb.common.cluster.exceptions import ShardNotFound, ShardsNotFound
|
29
|
+
from nucliadb.common.cluster.exceptions import ShardNotFound
|
40
30
|
from nucliadb.common.cluster.manager import get_index_node
|
41
31
|
from nucliadb.common.cluster.utils import get_shard_manager
|
42
|
-
|
43
|
-
|
44
|
-
from nucliadb.
|
45
|
-
from nucliadb.ingest.orm.resource import (
|
32
|
+
|
33
|
+
# XXX: this keys shouldn't be exposed outside datamanagers
|
34
|
+
from nucliadb.common.datamanagers.resources import (
|
46
35
|
KB_RESOURCE_SLUG,
|
47
36
|
KB_RESOURCE_SLUG_BASE,
|
48
|
-
Resource,
|
49
37
|
)
|
50
|
-
from nucliadb.
|
51
|
-
from nucliadb.
|
38
|
+
from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
|
39
|
+
from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
|
40
|
+
from nucliadb.common.maindb.driver import Driver, Transaction
|
41
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
42
|
+
from nucliadb.common.nidx import get_nidx_api_client
|
43
|
+
from nucliadb.ingest import SERVICE_NAME, logger
|
44
|
+
from nucliadb.ingest.orm.exceptions import (
|
45
|
+
KnowledgeBoxConflict,
|
46
|
+
KnowledgeBoxCreationError,
|
47
|
+
VectorSetConflict,
|
48
|
+
)
|
49
|
+
from nucliadb.ingest.orm.metrics import processor_observer
|
50
|
+
from nucliadb.ingest.orm.resource import Resource
|
51
|
+
from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
|
52
52
|
from nucliadb.migrator.utils import get_latest_version
|
53
|
-
from nucliadb_protos import writer_pb2
|
53
|
+
from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
|
54
|
+
from nucliadb_protos.knowledgebox_pb2 import (
|
55
|
+
CreateExternalIndexProviderMetadata,
|
56
|
+
ExternalIndexProviderType,
|
57
|
+
KnowledgeBoxConfig,
|
58
|
+
SemanticModelMetadata,
|
59
|
+
StoredExternalIndexProviderMetadata,
|
60
|
+
)
|
61
|
+
from nucliadb_protos.resources_pb2 import Basic
|
62
|
+
from nucliadb_utils.settings import is_onprem_nucliadb
|
54
63
|
from nucliadb_utils.storages.storage import Storage
|
55
|
-
from nucliadb_utils.utilities import
|
64
|
+
from nucliadb_utils.utilities import (
|
65
|
+
get_audit,
|
66
|
+
get_storage,
|
67
|
+
)
|
56
68
|
|
57
69
|
# XXX Eventually all these keys should be moved to datamanagers.kb
|
58
70
|
KB_RESOURCE = "/kbs/{kbid}/r/{uuid}"
|
59
71
|
|
60
72
|
KB_KEYS = "/kbs/{kbid}/"
|
61
73
|
|
62
|
-
KB_VECTORSET = "/kbs/{kbid}/vectorsets"
|
63
|
-
|
64
74
|
KB_TO_DELETE_BASE = "/kbtodelete/"
|
65
75
|
KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
|
66
76
|
|
77
|
+
RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
|
78
|
+
RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
|
79
|
+
|
67
80
|
KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
|
68
81
|
KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
|
69
82
|
|
83
|
+
KB_VECTORSET_TO_DELETE_BASE = "/vectorsettodelete"
|
84
|
+
KB_VECTORSET_TO_DELETE = f"{KB_VECTORSET_TO_DELETE_BASE}/{{kbid}}/{{vectorset}}"
|
85
|
+
|
70
86
|
|
71
87
|
class KnowledgeBox:
|
72
88
|
def __init__(self, txn: Transaction, storage: Storage, kbid: str):
|
@@ -74,115 +90,154 @@ class KnowledgeBox:
|
|
74
90
|
self.storage = storage
|
75
91
|
self.kbid = kbid
|
76
92
|
self._config: Optional[KnowledgeBoxConfig] = None
|
77
|
-
self.synonyms = Synonyms(self.txn, self.kbid)
|
78
|
-
|
79
|
-
async def get_config(self) -> Optional[KnowledgeBoxConfig]:
|
80
|
-
if self._config is None:
|
81
|
-
async with datamanagers.with_transaction() as txn:
|
82
|
-
config = await datamanagers.kb.get_config(txn, kbid=self.kbid)
|
83
|
-
if config is not None:
|
84
|
-
self._config = config
|
85
|
-
return config
|
86
|
-
else:
|
87
|
-
return None
|
88
|
-
else:
|
89
|
-
return self._config
|
90
|
-
|
91
|
-
@classmethod
|
92
|
-
async def delete_kb(cls, txn: Transaction, slug: str = "", kbid: str = ""):
|
93
|
-
# Mark storage to be deleted
|
94
|
-
# Mark keys to be deleted
|
95
|
-
logger.info(f"Deleting KB kbid={kbid} slug={slug}")
|
96
|
-
if not kbid and not slug:
|
97
|
-
raise AttributeError()
|
98
|
-
|
99
|
-
if slug and not kbid:
|
100
|
-
kbid_bytes = await txn.get(datamanagers.kb.KB_SLUGS.format(slug=slug))
|
101
|
-
if kbid_bytes is None:
|
102
|
-
raise datamanagers.exceptions.KnowledgeBoxNotFound()
|
103
|
-
kbid = kbid_bytes.decode()
|
104
|
-
|
105
|
-
if kbid and not slug:
|
106
|
-
kbconfig_bytes = await txn.get(datamanagers.kb.KB_UUID.format(kbid=kbid))
|
107
|
-
if kbconfig_bytes is None:
|
108
|
-
raise datamanagers.exceptions.KnowledgeBoxNotFound()
|
109
|
-
pbconfig = KnowledgeBoxConfig()
|
110
|
-
pbconfig.ParseFromString(kbconfig_bytes)
|
111
|
-
slug = pbconfig.slug
|
112
|
-
|
113
|
-
# Delete main anchor
|
114
|
-
async with txn.driver.transaction() as subtxn:
|
115
|
-
key_match = datamanagers.kb.KB_SLUGS.format(slug=slug)
|
116
|
-
logger.info(f"Deleting KB with slug: {slug}")
|
117
|
-
await subtxn.delete(key_match)
|
118
93
|
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
audit_util = get_audit()
|
124
|
-
if audit_util is not None:
|
125
|
-
await audit_util.delete_kb(kbid)
|
126
|
-
return kbid
|
94
|
+
@staticmethod
|
95
|
+
def new_unique_kbid() -> str:
|
96
|
+
return str(uuid4())
|
127
97
|
|
128
98
|
@classmethod
|
99
|
+
@processor_observer.wrap({"type": "create_kb"})
|
129
100
|
async def create(
|
130
101
|
cls,
|
131
|
-
|
102
|
+
driver: Driver,
|
103
|
+
*,
|
104
|
+
kbid: str,
|
132
105
|
slug: str,
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
if
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
if config is None:
|
153
|
-
config = KnowledgeBoxConfig()
|
154
|
-
|
155
|
-
config.migration_version = get_latest_version()
|
156
|
-
config.slug = slug
|
157
|
-
await txn.set(
|
158
|
-
datamanagers.kb.KB_UUID.format(kbid=uuid),
|
159
|
-
config.SerializeToString(),
|
160
|
-
)
|
161
|
-
# Create Storage
|
162
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
106
|
+
title: str = "",
|
107
|
+
description: str = "",
|
108
|
+
semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
|
109
|
+
external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
|
110
|
+
hidden_resources_enabled: bool = False,
|
111
|
+
hidden_resources_hide_on_creation: bool = False,
|
112
|
+
) -> tuple[str, str]:
|
113
|
+
"""Creates a new knowledge box and return its id and slug."""
|
114
|
+
|
115
|
+
if not kbid:
|
116
|
+
raise KnowledgeBoxCreationError("A kbid must be provided to create a new KB")
|
117
|
+
if not slug:
|
118
|
+
raise KnowledgeBoxCreationError("A slug must be provided to create a new KB")
|
119
|
+
if hidden_resources_hide_on_creation and not hidden_resources_enabled:
|
120
|
+
raise KnowledgeBoxCreationError(
|
121
|
+
"Cannot hide new resources if the hidden resources feature is disabled"
|
122
|
+
)
|
123
|
+
if semantic_models is None or len(semantic_models) == 0:
|
124
|
+
raise KnowledgeBoxCreationError("KB must define at least one semantic model")
|
163
125
|
|
164
|
-
|
165
|
-
if created is False:
|
166
|
-
logger.error(f"{uuid} KB could not be created")
|
167
|
-
failed = True
|
126
|
+
rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
|
168
127
|
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
128
|
+
try:
|
129
|
+
async with driver.transaction() as txn:
|
130
|
+
exists = await datamanagers.kb.get_kb_uuid(
|
131
|
+
txn, slug=slug
|
132
|
+
) or await datamanagers.kb.exists_kb(txn, kbid=kbid)
|
133
|
+
if exists:
|
134
|
+
raise KnowledgeBoxConflict()
|
135
|
+
|
136
|
+
# Create in maindb
|
137
|
+
await datamanagers.kb.set_kbid_for_slug(txn, slug=slug, kbid=kbid)
|
138
|
+
|
139
|
+
# all KBs have the vectorset key initialized, although (for
|
140
|
+
# now), not every KB will store vectorsets there
|
141
|
+
await datamanagers.vectorsets.initialize(txn, kbid=kbid)
|
142
|
+
|
143
|
+
kb_shards = writer_pb2.Shards()
|
144
|
+
kb_shards.kbid = kbid
|
145
|
+
# B/c with Shards.actual
|
146
|
+
kb_shards.actual = -1
|
147
|
+
|
148
|
+
vs_external_indexes = []
|
149
|
+
for vectorset_id, semantic_model in semantic_models.items(): # type: ignore
|
150
|
+
# if this KB uses a matryoshka model, we can choose a different
|
151
|
+
# dimension
|
152
|
+
if len(semantic_model.matryoshka_dimensions) > 0:
|
153
|
+
dimension = choose_matryoshka_dimension(semantic_model.matryoshka_dimensions)
|
154
|
+
else:
|
155
|
+
dimension = semantic_model.vector_dimension
|
156
|
+
|
157
|
+
vs_external_indexes.append(
|
158
|
+
VectorsetExternalIndex(
|
159
|
+
vectorset_id=vectorset_id,
|
160
|
+
dimension=dimension,
|
161
|
+
similarity=semantic_model.similarity_function,
|
162
|
+
)
|
163
|
+
)
|
164
|
+
|
165
|
+
vectorset_config = knowledgebox_pb2.VectorSetConfig(
|
166
|
+
vectorset_id=vectorset_id,
|
167
|
+
vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
|
168
|
+
similarity=semantic_model.similarity_function,
|
169
|
+
# XXX: hardcoded value
|
170
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32,
|
171
|
+
normalize_vectors=len(semantic_model.matryoshka_dimensions) > 0,
|
172
|
+
vector_dimension=dimension,
|
173
|
+
),
|
174
|
+
matryoshka_dimensions=semantic_model.matryoshka_dimensions,
|
175
|
+
)
|
176
|
+
await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset_config)
|
177
|
+
|
178
|
+
stored_external_index_provider = await cls._maybe_create_external_indexes(
|
179
|
+
kbid, request=external_index_provider, indexes=vs_external_indexes
|
180
|
+
)
|
181
|
+
rollback_ops.append(
|
182
|
+
partial(
|
183
|
+
cls._maybe_delete_external_indexes,
|
184
|
+
kbid,
|
185
|
+
stored_external_index_provider,
|
186
|
+
)
|
177
187
|
)
|
178
|
-
except Exception as e:
|
179
|
-
await storage.delete_kb(uuid)
|
180
|
-
raise e
|
181
|
-
|
182
|
-
if failed:
|
183
|
-
await storage.delete_kb(uuid)
|
184
188
|
|
185
|
-
|
189
|
+
config = KnowledgeBoxConfig(
|
190
|
+
title=title,
|
191
|
+
description=description,
|
192
|
+
slug=slug,
|
193
|
+
migration_version=get_latest_version(),
|
194
|
+
hidden_resources_enabled=hidden_resources_enabled,
|
195
|
+
hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
|
196
|
+
)
|
197
|
+
config.external_index_provider.CopyFrom(stored_external_index_provider)
|
198
|
+
await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
|
199
|
+
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
|
200
|
+
|
201
|
+
# shard creation will alter this value on maindb, make sure nobody
|
202
|
+
# uses this variable anymore
|
203
|
+
del kb_shards
|
204
|
+
|
205
|
+
# Create in storage
|
206
|
+
|
207
|
+
storage = await get_storage(service_name=SERVICE_NAME)
|
208
|
+
|
209
|
+
created = await storage.create_kb(kbid)
|
210
|
+
if not created:
|
211
|
+
logger.error(f"KB {kbid} could not be created")
|
212
|
+
raise KnowledgeBoxCreationError(
|
213
|
+
f"KB blob storage could not be created (slug={slug})"
|
214
|
+
)
|
215
|
+
rollback_ops.append(partial(storage.delete_kb, kbid))
|
216
|
+
|
217
|
+
# Create shards in index nodes
|
218
|
+
|
219
|
+
shard_manager = get_shard_manager()
|
220
|
+
# XXX creating a shard is a slow IO operation that requires a write
|
221
|
+
# txn to be open!
|
222
|
+
await shard_manager.create_shard_by_kbid(txn, kbid)
|
223
|
+
# shards don't need a rollback as they will be eventually purged
|
224
|
+
|
225
|
+
await txn.commit()
|
226
|
+
|
227
|
+
except Exception as exc:
|
228
|
+
# rollback all changes on the db and raise the exception
|
229
|
+
for op in reversed(rollback_ops):
|
230
|
+
try:
|
231
|
+
await op()
|
232
|
+
except Exception:
|
233
|
+
if isinstance(op, partial):
|
234
|
+
name: str = op.func.__name__
|
235
|
+
else:
|
236
|
+
getattr(op, "__name__", "unknown?")
|
237
|
+
logger.exception(f"Unexpected error rolling back {name}. Keep rolling back")
|
238
|
+
raise exc
|
239
|
+
|
240
|
+
return (kbid, slug)
|
186
241
|
|
187
242
|
@classmethod
|
188
243
|
async def update(
|
@@ -192,7 +247,7 @@ class KnowledgeBox:
|
|
192
247
|
slug: Optional[str] = None,
|
193
248
|
config: Optional[KnowledgeBoxConfig] = None,
|
194
249
|
) -> str:
|
195
|
-
exist = await datamanagers.kb.get_config(txn, kbid=uuid)
|
250
|
+
exist = await datamanagers.kb.get_config(txn, kbid=uuid, for_update=True)
|
196
251
|
if not exist:
|
197
252
|
raise datamanagers.exceptions.KnowledgeBoxNotFound()
|
198
253
|
|
@@ -209,94 +264,61 @@ class KnowledgeBox:
|
|
209
264
|
|
210
265
|
if config and exist != config:
|
211
266
|
exist.MergeFrom(config)
|
267
|
+
exist.hidden_resources_enabled = config.hidden_resources_enabled
|
268
|
+
exist.hidden_resources_hide_on_creation = config.hidden_resources_hide_on_creation
|
212
269
|
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
270
|
+
if exist.hidden_resources_hide_on_creation and not exist.hidden_resources_enabled:
|
271
|
+
raise KnowledgeBoxCreationError(
|
272
|
+
"Cannot hide new resources if the hidden resources feature is disabled"
|
273
|
+
)
|
274
|
+
|
275
|
+
await datamanagers.kb.set_config(txn, kbid=uuid, config=exist)
|
217
276
|
|
218
277
|
return uuid
|
219
278
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
for shard in shards_obj.shards:
|
227
|
-
for replica in shard.replicas:
|
228
|
-
node = get_index_node(replica.node)
|
229
|
-
if node is not None:
|
230
|
-
yield node, replica.shard.id
|
231
|
-
|
232
|
-
# Vectorset
|
233
|
-
async def get_vectorsets(self, response: writer_pb2.GetVectorSetsResponse):
|
234
|
-
vectorset_key = KB_VECTORSET.format(kbid=self.kbid)
|
235
|
-
payload = await self.txn.get(vectorset_key)
|
236
|
-
if payload is not None:
|
237
|
-
response.vectorsets.ParseFromString(payload)
|
238
|
-
|
239
|
-
async def del_vectorset(self, id: str):
|
240
|
-
vectorset_key = KB_VECTORSET.format(kbid=self.kbid)
|
241
|
-
payload = await self.txn.get(vectorset_key)
|
242
|
-
vts = VectorSets()
|
243
|
-
if payload is not None:
|
244
|
-
vts.ParseFromString(payload)
|
245
|
-
del vts.vectorsets[id]
|
246
|
-
# For each Node on the KB delete the vectorset
|
247
|
-
async for node, shard in self.iterate_kb_nodes():
|
248
|
-
await node.del_vectorset(shard, id)
|
249
|
-
payload = vts.SerializeToString()
|
250
|
-
await self.txn.set(vectorset_key, payload)
|
251
|
-
|
252
|
-
async def set_vectorset(self, id: str, vs: VectorSet):
|
253
|
-
vectorset_key = KB_VECTORSET.format(kbid=self.kbid)
|
254
|
-
payload = await self.txn.get(vectorset_key)
|
255
|
-
vts = VectorSets()
|
256
|
-
if payload is not None:
|
257
|
-
vts.ParseFromString(payload)
|
258
|
-
vts.vectorsets[id].CopyFrom(vs)
|
259
|
-
# For each Node on the KB add the vectorset
|
260
|
-
async for node, shard in self.iterate_kb_nodes():
|
261
|
-
await node.set_vectorset(shard, id, similarity=vs.similarity)
|
262
|
-
payload = vts.SerializeToString()
|
263
|
-
await self.txn.set(vectorset_key, payload)
|
264
|
-
|
265
|
-
# Labels
|
266
|
-
async def set_labelset(self, id: str, labelset: LabelSet):
|
267
|
-
await datamanagers.labels.set_labelset(
|
268
|
-
self.txn, kbid=self.kbid, labelset_id=id, labelset=labelset
|
269
|
-
)
|
279
|
+
@classmethod
|
280
|
+
async def delete(cls, driver: Driver, kbid: str):
|
281
|
+
async with driver.transaction() as txn:
|
282
|
+
exists = await datamanagers.kb.exists_kb(txn, kbid=kbid)
|
283
|
+
if not exists:
|
284
|
+
return
|
270
285
|
|
271
|
-
|
272
|
-
|
286
|
+
# Delete main anchor
|
287
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
288
|
+
if kb_config is not None:
|
289
|
+
slug = kb_config.slug
|
290
|
+
await datamanagers.kb.delete_kb_slug(txn, slug=slug)
|
273
291
|
|
274
|
-
|
275
|
-
self, labelset: str, labelset_response: writer_pb2.GetLabelSetResponse
|
276
|
-
):
|
277
|
-
ls = await datamanagers.labels.get_labelset(
|
278
|
-
self.txn,
|
279
|
-
kbid=self.kbid,
|
280
|
-
labelset_id=labelset,
|
281
|
-
)
|
282
|
-
if ls is not None:
|
283
|
-
labelset_response.labelset.CopyFrom(ls)
|
292
|
+
await datamanagers.kb.delete_config(txn, kbid=kbid)
|
284
293
|
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
294
|
+
# Mark KB to purge. This will eventually delete all KB keys, storage
|
295
|
+
# and index data (for the old index nodes)
|
296
|
+
when = datetime.now().isoformat()
|
297
|
+
await txn.set(KB_TO_DELETE.format(kbid=kbid), when.encode())
|
289
298
|
|
290
|
-
|
291
|
-
pbsyn = await self.synonyms.get()
|
292
|
-
if pbsyn is not None:
|
293
|
-
synonyms.CopyFrom(pbsyn)
|
299
|
+
shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
294
300
|
|
295
|
-
|
296
|
-
await self.synonyms.set(synonyms)
|
301
|
+
await txn.commit()
|
297
302
|
|
298
|
-
|
299
|
-
|
303
|
+
if shards_obj is None:
|
304
|
+
logger.warning(f"Shards not found for KB while deleting it", extra={"kbid": kbid})
|
305
|
+
else:
|
306
|
+
nidx_api = get_nidx_api_client()
|
307
|
+
# Delete shards from nidx. They'll be marked for eventual deletion,
|
308
|
+
# so this call shouldn't be costly
|
309
|
+
if nidx_api is not None:
|
310
|
+
for shard in shards_obj.shards:
|
311
|
+
if shard.nidx_shard_id:
|
312
|
+
await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
|
313
|
+
|
314
|
+
if kb_config is not None:
|
315
|
+
await cls._maybe_delete_external_indexes(kbid, kb_config.external_index_provider)
|
316
|
+
|
317
|
+
audit = get_audit()
|
318
|
+
if audit is not None:
|
319
|
+
audit.delete_kb(kbid=kbid)
|
320
|
+
|
321
|
+
return kbid
|
300
322
|
|
301
323
|
@classmethod
|
302
324
|
async def purge(cls, driver: Driver, kbid: str):
|
@@ -307,6 +329,8 @@ class KnowledgeBox:
|
|
307
329
|
need to delete the kb shards and also deletes the related storage
|
308
330
|
buckets.
|
309
331
|
|
332
|
+
Removes all catalog entries related to the kb.
|
333
|
+
|
310
334
|
As non-empty buckets cannot be deleted, they are scheduled to be
|
311
335
|
deleted instead. Actually, this empties the bucket asynchronouysly
|
312
336
|
but it doesn't delete it. To do it, we save a marker using the
|
@@ -322,16 +346,13 @@ class KnowledgeBox:
|
|
322
346
|
storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
|
323
347
|
await txn.set(storage_to_delete, b"")
|
324
348
|
|
325
|
-
|
326
|
-
shards_match = datamanagers.cluster.KB_SHARDS.format(kbid=kbid)
|
327
|
-
payload = await txn.get(shards_match)
|
349
|
+
await catalog_delete_kb(txn, kbid)
|
328
350
|
|
329
|
-
|
330
|
-
|
351
|
+
# Delete KB Shards
|
352
|
+
shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
353
|
+
if shards_obj is None:
|
354
|
+
logger.warning(f"Shards not found for KB while purging it", extra={"kbid": kbid})
|
331
355
|
else:
|
332
|
-
shards_obj = writer_pb2.Shards()
|
333
|
-
shards_obj.ParseFromString(payload) # type: ignore
|
334
|
-
|
335
356
|
for shard in shards_obj.shards:
|
336
357
|
# Delete the shard on nodes
|
337
358
|
for replica in shard.replicas:
|
@@ -357,29 +378,14 @@ class KnowledgeBox:
|
|
357
378
|
await cls.delete_all_kb_keys(driver, kbid)
|
358
379
|
|
359
380
|
@classmethod
|
360
|
-
async def delete_all_kb_keys(
|
361
|
-
cls, driver: Driver, kbid: str, chunk_size: int = 1_000
|
362
|
-
):
|
381
|
+
async def delete_all_kb_keys(cls, driver: Driver, kbid: str, chunk_size: int = 1_000):
|
363
382
|
prefix = KB_KEYS.format(kbid=kbid)
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
# We commit deletions in chunks because otherwise
|
372
|
-
# tikv complains if there is too much data to commit
|
373
|
-
for chunk_of_keys in chunker(all_keys, chunk_size):
|
374
|
-
async with driver.transaction() as txn:
|
375
|
-
for key in chunk_of_keys:
|
376
|
-
await txn.delete(key)
|
377
|
-
await txn.commit()
|
378
|
-
|
379
|
-
async def get_resource_shard(
|
380
|
-
self, shard_id: str
|
381
|
-
) -> Optional[writer_pb2.ShardObject]:
|
382
|
-
async with datamanagers.with_transaction() as txn:
|
383
|
+
async with driver.transaction() as txn:
|
384
|
+
await txn.delete_by_prefix(prefix)
|
385
|
+
await txn.commit()
|
386
|
+
|
387
|
+
async def get_resource_shard(self, shard_id: str) -> Optional[writer_pb2.ShardObject]:
|
388
|
+
async with datamanagers.with_ro_transaction() as txn:
|
383
389
|
pb = await datamanagers.cluster.get_kb_shards(txn, kbid=self.kbid)
|
384
390
|
if pb is None:
|
385
391
|
logger.warning("Shards not found for kbid", extra={"kbid": self.kbid})
|
@@ -390,52 +396,55 @@ class KnowledgeBox:
|
|
390
396
|
return None
|
391
397
|
|
392
398
|
async def get(self, uuid: str) -> Optional[Resource]:
|
393
|
-
|
394
|
-
if
|
395
|
-
return Resource(
|
396
|
-
txn=self.txn,
|
397
|
-
storage=self.storage,
|
398
|
-
kb=self,
|
399
|
-
uuid=uuid,
|
400
|
-
basic=Resource.parse_basic(raw_basic),
|
401
|
-
disable_vectors=False,
|
402
|
-
)
|
403
|
-
else:
|
399
|
+
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
|
400
|
+
if basic is None:
|
404
401
|
return None
|
402
|
+
return Resource(
|
403
|
+
txn=self.txn,
|
404
|
+
storage=self.storage,
|
405
|
+
kb=self,
|
406
|
+
uuid=uuid,
|
407
|
+
basic=basic,
|
408
|
+
disable_vectors=False,
|
409
|
+
)
|
405
410
|
|
406
|
-
async def
|
407
|
-
|
408
|
-
|
409
|
-
basic = Resource.parse_basic(raw_basic)
|
410
|
-
else:
|
411
|
-
basic = None
|
412
|
-
|
413
|
-
async for key in self.txn.keys(
|
414
|
-
KB_RESOURCE.format(kbid=self.kbid, uuid=uuid), count=-1
|
415
|
-
):
|
416
|
-
await self.txn.delete(key)
|
417
|
-
|
411
|
+
async def maindb_delete_resource(self, uuid: str):
|
412
|
+
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
|
413
|
+
await self.txn.delete_by_prefix(KB_RESOURCE.format(kbid=self.kbid, uuid=uuid))
|
418
414
|
if basic and basic.slug:
|
419
|
-
slug_key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug)
|
420
415
|
try:
|
421
|
-
await self.txn.delete(
|
416
|
+
await self.txn.delete(KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug))
|
422
417
|
except Exception:
|
423
|
-
|
418
|
+
logger.exception("Error deleting slug")
|
419
|
+
|
420
|
+
async def storage_delete_resource(self, uuid: str):
|
421
|
+
if is_onprem_nucliadb():
|
422
|
+
await self.storage.delete_resource(self.kbid, uuid)
|
423
|
+
else:
|
424
|
+
# Deleting from storage can be slow, so we schedule its deletion and the purge cronjob
|
425
|
+
# will take care of it
|
426
|
+
await self.schedule_delete_resource(self.kbid, uuid)
|
427
|
+
|
428
|
+
async def schedule_delete_resource(self, kbid: str, uuid: str):
|
429
|
+
key = RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid)
|
430
|
+
await self.txn.set(key, b"")
|
424
431
|
|
425
|
-
|
432
|
+
async def delete_resource(self, uuid: str):
|
433
|
+
with processor_observer({"type": "delete_resource_maindb"}):
|
434
|
+
await self.maindb_delete_resource(uuid)
|
435
|
+
with processor_observer({"type": "delete_resource_storage"}):
|
436
|
+
await self.storage_delete_resource(uuid)
|
426
437
|
|
427
438
|
async def get_resource_uuid_by_slug(self, slug: str) -> Optional[str]:
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
else:
|
432
|
-
return None
|
439
|
+
return await datamanagers.resources.get_resource_uuid_from_slug(
|
440
|
+
self.txn, kbid=self.kbid, slug=slug
|
441
|
+
)
|
433
442
|
|
434
443
|
async def get_unique_slug(self, uuid: str, slug: str) -> str:
|
435
444
|
key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
|
436
445
|
key_ok = False
|
437
446
|
while key_ok is False:
|
438
|
-
found = await self.txn.get(key)
|
447
|
+
found = await self.txn.get(key, for_update=False)
|
439
448
|
if found and found.decode() != uuid:
|
440
449
|
slug += ".c"
|
441
450
|
key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
|
@@ -443,17 +452,7 @@ class KnowledgeBox:
|
|
443
452
|
key_ok = True
|
444
453
|
return slug
|
445
454
|
|
446
|
-
|
447
|
-
async def resource_slug_exists(
|
448
|
-
self, txn: Transaction, kbid: str, slug: str
|
449
|
-
) -> bool:
|
450
|
-
key = KB_RESOURCE_SLUG.format(kbid=kbid, slug=slug)
|
451
|
-
encoded_slug: Optional[bytes] = await txn.get(key)
|
452
|
-
return encoded_slug not in (None, b"")
|
453
|
-
|
454
|
-
async def add_resource(
|
455
|
-
self, uuid: str, slug: str, basic: Optional[Basic] = None
|
456
|
-
) -> Resource:
|
455
|
+
async def add_resource(self, uuid: str, slug: str, basic: Optional[Basic] = None) -> Resource:
|
457
456
|
if basic is None:
|
458
457
|
basic = Basic()
|
459
458
|
if slug == "":
|
@@ -461,7 +460,7 @@ class KnowledgeBox:
|
|
461
460
|
slug = await self.get_unique_slug(uuid, slug)
|
462
461
|
basic.slug = slug
|
463
462
|
fix_paragraph_annotation_keys(uuid, basic)
|
464
|
-
await set_basic(self.txn, self.kbid, uuid, basic)
|
463
|
+
await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=uuid, basic=basic)
|
465
464
|
return Resource(
|
466
465
|
storage=self.storage,
|
467
466
|
txn=self.txn,
|
@@ -473,7 +472,7 @@ class KnowledgeBox:
|
|
473
472
|
|
474
473
|
async def iterate_resources(self) -> AsyncGenerator[Resource, None]:
|
475
474
|
base = KB_RESOURCE_SLUG_BASE.format(kbid=self.kbid)
|
476
|
-
async for key in self.txn.keys(match=base
|
475
|
+
async for key in self.txn.keys(match=base):
|
477
476
|
slug = key.split("/")[-1]
|
478
477
|
uuid = await self.get_resource_uuid_by_slug(slug)
|
479
478
|
if uuid is not None:
|
@@ -485,6 +484,55 @@ class KnowledgeBox:
|
|
485
484
|
disable_vectors=False,
|
486
485
|
)
|
487
486
|
|
487
|
+
async def create_vectorset(self, config: knowledgebox_pb2.VectorSetConfig):
|
488
|
+
if await datamanagers.vectorsets.exists(
|
489
|
+
self.txn, kbid=self.kbid, vectorset_id=config.vectorset_id
|
490
|
+
):
|
491
|
+
raise VectorSetConflict(f"Vectorset {config.vectorset_id} already exists")
|
492
|
+
await datamanagers.vectorsets.set(self.txn, kbid=self.kbid, config=config)
|
493
|
+
|
494
|
+
# Remove the async deletion mark if it exists, just in case there was a previous deletion
|
495
|
+
deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=config.vectorset_id)
|
496
|
+
deletion_mark = await self.txn.get(deletion_mark_key, for_update=True)
|
497
|
+
if deletion_mark is not None:
|
498
|
+
await self.txn.delete(deletion_mark_key)
|
499
|
+
|
500
|
+
shard_manager = get_shard_manager()
|
501
|
+
await shard_manager.create_vectorset(self.kbid, config)
|
502
|
+
|
503
|
+
async def delete_vectorset(self, vectorset_id: str):
|
504
|
+
await datamanagers.vectorsets.delete(self.txn, kbid=self.kbid, vectorset_id=vectorset_id)
|
505
|
+
|
506
|
+
# mark vectorset for async deletion
|
507
|
+
deletion_mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
|
508
|
+
await self.txn.set(deletion_mark_key, b"")
|
509
|
+
|
510
|
+
shard_manager = get_shard_manager()
|
511
|
+
await shard_manager.delete_vectorset(self.kbid, vectorset_id)
|
512
|
+
|
513
|
+
@classmethod
|
514
|
+
async def _maybe_create_external_indexes(
|
515
|
+
cls,
|
516
|
+
kbid: str,
|
517
|
+
request: CreateExternalIndexProviderMetadata,
|
518
|
+
indexes: list[VectorsetExternalIndex],
|
519
|
+
) -> StoredExternalIndexProviderMetadata:
|
520
|
+
if request.type != ExternalIndexProviderType.PINECONE:
|
521
|
+
return StoredExternalIndexProviderMetadata(type=request.type)
|
522
|
+
# Only pinecone is supported for now
|
523
|
+
return await PineconeIndexManager.create_indexes(kbid, request, indexes)
|
524
|
+
|
525
|
+
@classmethod
|
526
|
+
async def _maybe_delete_external_indexes(
|
527
|
+
cls,
|
528
|
+
kbid: str,
|
529
|
+
stored: StoredExternalIndexProviderMetadata,
|
530
|
+
) -> None:
|
531
|
+
if stored.type != ExternalIndexProviderType.PINECONE:
|
532
|
+
return
|
533
|
+
# Only pinecone is supported for now
|
534
|
+
await PineconeIndexManager.delete_indexes(kbid, stored)
|
535
|
+
|
488
536
|
|
489
537
|
def chunker(seq: Sequence, size: int):
|
490
538
|
return (seq[pos : pos + size] for pos in range(0, len(seq), size))
|
@@ -498,3 +546,11 @@ def fix_paragraph_annotation_keys(uuid: str, basic: Basic) -> None:
|
|
498
546
|
for paragraph_annotation in ufm.paragraphs:
|
499
547
|
key = compute_paragraph_key(uuid, paragraph_annotation.key)
|
500
548
|
paragraph_annotation.key = key
|
549
|
+
|
550
|
+
|
551
|
+
@processor_observer.wrap({"type": "catalog_delete_kb"})
|
552
|
+
async def catalog_delete_kb(txn: Transaction, kbid: str):
|
553
|
+
if not isinstance(txn, PGTransaction):
|
554
|
+
return
|
555
|
+
async with txn.connection.cursor() as cur:
|
556
|
+
await cur.execute("DELETE FROM catalog where kbid = %(kbid)s", {"kbid": kbid})
|