nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/migrator/datamanager.py
CHANGED
@@ -47,17 +47,19 @@ class MigrationsDataManager:
|
|
47
47
|
self.driver = driver
|
48
48
|
|
49
49
|
async def schedule_all_kbs(self, target_version: int) -> None:
|
50
|
+
# Get all kb ids
|
51
|
+
async with self.driver.transaction(read_only=True) as txn:
|
52
|
+
kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
|
53
|
+
# Schedule the migrations
|
50
54
|
async with self.driver.transaction() as txn:
|
51
|
-
|
52
|
-
await txn.set(
|
53
|
-
MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode()
|
54
|
-
)
|
55
|
+
for kbid in kbids:
|
56
|
+
await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
|
55
57
|
await txn.commit()
|
56
58
|
|
57
|
-
async def get_kb_migrations(self
|
59
|
+
async def get_kb_migrations(self) -> list[str]:
|
58
60
|
keys = []
|
59
61
|
async with self.driver.transaction() as txn:
|
60
|
-
async for key in txn.keys(MIGRATIONS_CONTAINER_KEY
|
62
|
+
async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
|
61
63
|
keys.append(key.split("/")[-1])
|
62
64
|
|
63
65
|
return keys
|
@@ -68,7 +70,7 @@ class MigrationsDataManager:
|
|
68
70
|
await txn.commit()
|
69
71
|
|
70
72
|
async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
|
71
|
-
async with self.driver.transaction() as txn:
|
73
|
+
async with self.driver.transaction(read_only=True) as txn:
|
72
74
|
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
73
75
|
if kb_config is None:
|
74
76
|
return None
|
@@ -76,7 +78,7 @@ class MigrationsDataManager:
|
|
76
78
|
|
77
79
|
async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
|
78
80
|
async with self.driver.transaction() as txn:
|
79
|
-
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
81
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
|
80
82
|
if kb_config is None:
|
81
83
|
raise Exception(f"KB {kbid} does not exist")
|
82
84
|
kb_config.migration_version = current_version
|
@@ -84,15 +86,13 @@ class MigrationsDataManager:
|
|
84
86
|
await txn.commit()
|
85
87
|
|
86
88
|
async def get_global_info(self) -> GlobalInfo:
|
87
|
-
async with self.driver.transaction() as txn:
|
89
|
+
async with self.driver.transaction(read_only=True) as txn:
|
88
90
|
raw_pb = await txn.get(MIGRATION_INFO_KEY)
|
89
91
|
if raw_pb is None:
|
90
92
|
return GlobalInfo(current_version=0, target_version=None)
|
91
93
|
pb = migrations_pb2.MigrationInfo()
|
92
94
|
pb.ParseFromString(raw_pb)
|
93
|
-
return GlobalInfo(
|
94
|
-
current_version=pb.current_version, target_version=pb.target_version
|
95
|
-
)
|
95
|
+
return GlobalInfo(current_version=pb.current_version, target_version=pb.target_version)
|
96
96
|
|
97
97
|
async def update_global_info(
|
98
98
|
self,
|
@@ -101,7 +101,7 @@ class MigrationsDataManager:
|
|
101
101
|
target_version: Union[int, None, _Unset] = _UNSET,
|
102
102
|
) -> None:
|
103
103
|
async with self.driver.transaction() as txn:
|
104
|
-
raw_pb = await txn.get(MIGRATION_INFO_KEY)
|
104
|
+
raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
|
105
105
|
pb = migrations_pb2.MigrationInfo()
|
106
106
|
if raw_pb is not None:
|
107
107
|
pb.ParseFromString(raw_pb)
|
nucliadb/migrator/migrator.py
CHANGED
@@ -22,22 +22,20 @@ import logging
|
|
22
22
|
from typing import Optional
|
23
23
|
|
24
24
|
from nucliadb.common import locking
|
25
|
-
from nucliadb.common.cluster.rollover import
|
25
|
+
from nucliadb.common.cluster.rollover import rollover_kb_index
|
26
|
+
from nucliadb.common.cluster.settings import in_standalone_mode
|
27
|
+
from nucliadb.common.maindb.pg import PGDriver
|
26
28
|
from nucliadb.migrator.context import ExecutionContext
|
27
|
-
from nucliadb.migrator.utils import get_migrations
|
29
|
+
from nucliadb.migrator.utils import get_migrations, get_pg_migrations
|
28
30
|
from nucliadb_telemetry import errors, metrics
|
29
31
|
|
30
|
-
migration_observer = metrics.Observer(
|
31
|
-
"nucliadb_migrations", labels={"type": "kb", "target_version": ""}
|
32
|
-
)
|
32
|
+
migration_observer = metrics.Observer("nucliadb_migrations", labels={"type": "kb", "target_version": ""})
|
33
33
|
|
34
34
|
|
35
35
|
logger = logging.getLogger(__name__)
|
36
36
|
|
37
37
|
|
38
|
-
async def run_kb_migrations(
|
39
|
-
context: ExecutionContext, kbid: str, target_version: int
|
40
|
-
) -> None:
|
38
|
+
async def run_kb_migrations(context: ExecutionContext, kbid: str, target_version: int) -> None:
|
41
39
|
async with locking.distributed_lock(f"migration-{kbid}"):
|
42
40
|
kb_info = await context.data_manager.get_kb_info(kbid)
|
43
41
|
if kb_info is None:
|
@@ -45,9 +43,7 @@ async def run_kb_migrations(
|
|
45
43
|
await context.data_manager.delete_kb_migration(kbid=kbid)
|
46
44
|
return
|
47
45
|
|
48
|
-
migrations = get_migrations(
|
49
|
-
from_version=kb_info.current_version, to_version=target_version
|
50
|
-
)
|
46
|
+
migrations = get_migrations(from_version=kb_info.current_version, to_version=target_version)
|
51
47
|
|
52
48
|
for migration in migrations:
|
53
49
|
migration_info = {
|
@@ -57,15 +53,11 @@ async def run_kb_migrations(
|
|
57
53
|
}
|
58
54
|
|
59
55
|
try:
|
60
|
-
logger.
|
61
|
-
with migration_observer(
|
62
|
-
|
63
|
-
)
|
64
|
-
|
65
|
-
logger.warning("Finished KB Migration", extra=migration_info)
|
66
|
-
await context.data_manager.update_kb_info(
|
67
|
-
kbid=kbid, current_version=migration.version
|
68
|
-
)
|
56
|
+
logger.info("Migrating KB", extra=migration_info)
|
57
|
+
with migration_observer({"type": "kb", "target_version": str(migration.version)}):
|
58
|
+
await migration.module.migrate_kb(context, kbid)
|
59
|
+
logger.info("Finished KB Migration", extra=migration_info)
|
60
|
+
await context.data_manager.update_kb_info(kbid=kbid, current_version=migration.version)
|
69
61
|
except Exception as exc:
|
70
62
|
errors.capture_exception(exc)
|
71
63
|
logger.exception("Failed to migrate KB", extra=migration_info)
|
@@ -73,9 +65,7 @@ async def run_kb_migrations(
|
|
73
65
|
|
74
66
|
refreshed_kb_info = await context.data_manager.get_kb_info(kbid=kbid)
|
75
67
|
if refreshed_kb_info is None:
|
76
|
-
logger.warning(
|
77
|
-
"KB not found. This should not happen.", extra={"kbid": kbid}
|
78
|
-
)
|
68
|
+
logger.warning("KB not found. This should not happen.", extra={"kbid": kbid})
|
79
69
|
return
|
80
70
|
assert refreshed_kb_info.current_version == target_version
|
81
71
|
|
@@ -87,12 +77,14 @@ async def run_all_kb_migrations(context: ExecutionContext, target_version: int)
|
|
87
77
|
Schedule all KB migrations to run in parallel. Only a certain number of migrations will run at the same time.
|
88
78
|
If any of the migrations fail, the whole process will fail.
|
89
79
|
"""
|
90
|
-
to_migrate = await context.data_manager.get_kb_migrations(
|
80
|
+
to_migrate = await context.data_manager.get_kb_migrations()
|
91
81
|
|
92
82
|
if len(to_migrate) == 0:
|
93
83
|
return
|
94
|
-
|
95
|
-
|
84
|
+
if in_standalone_mode():
|
85
|
+
max_concurrent = 1
|
86
|
+
else:
|
87
|
+
max_concurrent = context.settings.max_concurrent_migrations
|
96
88
|
semaphore = asyncio.Semaphore(max_concurrent)
|
97
89
|
|
98
90
|
logger.info(
|
@@ -150,15 +142,11 @@ async def run_global_migrations(context: ExecutionContext, target_version: int)
|
|
150
142
|
"to_version": migration.version,
|
151
143
|
}
|
152
144
|
try:
|
153
|
-
logger.
|
154
|
-
with migration_observer(
|
155
|
-
|
156
|
-
)
|
157
|
-
|
158
|
-
await context.data_manager.update_global_info(
|
159
|
-
current_version=migration.version
|
160
|
-
)
|
161
|
-
logger.warning("Finished migration", extra=migration_info)
|
145
|
+
logger.info("Migrating", extra=migration_info)
|
146
|
+
with migration_observer({"type": "global", "target_version": str(migration.version)}):
|
147
|
+
await migration.module.migrate(context)
|
148
|
+
await context.data_manager.update_global_info(current_version=migration.version)
|
149
|
+
logger.info("Finished migration", extra=migration_info)
|
162
150
|
except Exception as exc:
|
163
151
|
errors.capture_exception(exc)
|
164
152
|
logger.exception("Failed to migrate", extra=migration_info)
|
@@ -174,7 +162,7 @@ async def run_rollover_in_parallel(
|
|
174
162
|
) -> None:
|
175
163
|
async with max_concurrent:
|
176
164
|
try:
|
177
|
-
await
|
165
|
+
await rollover_kb_index(context, kbid)
|
178
166
|
await context.data_manager.delete_kb_rollover(kbid=kbid)
|
179
167
|
except Exception as exc:
|
180
168
|
errors.capture_exception(exc)
|
@@ -218,8 +206,40 @@ async def run_rollovers(context: ExecutionContext) -> None:
|
|
218
206
|
raise Exception(f"Failed to migrate KBs. Failures: {failures}")
|
219
207
|
|
220
208
|
|
209
|
+
async def run_pg_schema_migrations(driver: PGDriver):
|
210
|
+
migrations = get_pg_migrations()
|
211
|
+
|
212
|
+
# The migration uses two transactions. The former is only used to get a lock (pg_advisory_lock)
|
213
|
+
# without having to worry about correctly unlocking it (postgres unlocks it when the transaction ends)
|
214
|
+
async with driver.transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock: # type: ignore[attr-defined]
|
215
|
+
await cur_lock.execute(
|
216
|
+
"CREATE TABLE IF NOT EXISTS migrations (version INT PRIMARY KEY, migrated_at TIMESTAMP NOT NULL DEFAULT NOW())"
|
217
|
+
)
|
218
|
+
await tx_lock.commit()
|
219
|
+
await cur_lock.execute("SELECT pg_advisory_xact_lock(3116614845278015934)")
|
220
|
+
|
221
|
+
await cur_lock.execute("SELECT version FROM migrations")
|
222
|
+
migrated = [r[0] for r in await cur_lock.fetchall()]
|
223
|
+
|
224
|
+
for version, migration in migrations:
|
225
|
+
if version in migrated:
|
226
|
+
continue
|
227
|
+
|
228
|
+
# Gets a new transaction for each migration, so if they get interrupted we at least
|
229
|
+
# save the state of the last finished transaction
|
230
|
+
async with driver.transaction() as tx, tx.connection.cursor() as cur: # type: ignore[attr-defined]
|
231
|
+
await migration.migrate(tx)
|
232
|
+
await cur.execute("INSERT INTO migrations (version) VALUES (%s)", (version,))
|
233
|
+
await tx.commit()
|
234
|
+
|
235
|
+
|
221
236
|
async def run(context: ExecutionContext, target_version: Optional[int] = None) -> None:
|
222
|
-
|
237
|
+
# Run schema migrations first, since they create the `resources` table needed for the lock below
|
238
|
+
# Schema migrations use their own locking system
|
239
|
+
if isinstance(context.kv_driver, PGDriver):
|
240
|
+
await run_pg_schema_migrations(context.kv_driver)
|
241
|
+
|
242
|
+
async with locking.distributed_lock(locking.MIGRATIONS_LOCK):
|
223
243
|
# before we move to managed migrations, see if there are any rollovers
|
224
244
|
# scheduled and run them
|
225
245
|
await run_rollovers(context)
|
nucliadb/migrator/settings.py
CHANGED
@@ -20,9 +20,10 @@
|
|
20
20
|
from typing import Optional
|
21
21
|
|
22
22
|
import pydantic
|
23
|
+
import pydantic_settings
|
23
24
|
|
24
25
|
|
25
|
-
class Settings(
|
26
|
+
class Settings(pydantic_settings.BaseSettings):
|
26
27
|
redis_url: Optional[str] = None
|
27
28
|
max_concurrent_migrations: int = pydantic.Field(
|
28
29
|
default=5,
|
nucliadb/migrator/utils.py
CHANGED
@@ -17,13 +17,12 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import importlib
|
20
21
|
import logging
|
21
22
|
import os
|
22
23
|
import types
|
23
24
|
from functools import lru_cache
|
24
25
|
|
25
|
-
import migrations
|
26
|
-
|
27
26
|
from .models import Migration
|
28
27
|
|
29
28
|
logger = logging.getLogger(__name__)
|
@@ -33,14 +32,27 @@ MIGRATION_DIR = os.path.sep.join(
|
|
33
32
|
)
|
34
33
|
|
35
34
|
|
35
|
+
def get_pg_migrations() -> list[tuple[int, types.ModuleType]]:
|
36
|
+
output = []
|
37
|
+
for filename in os.listdir(os.path.join(MIGRATION_DIR, "pg")):
|
38
|
+
if filename.endswith(".py") and filename != "__init__.py":
|
39
|
+
module_name = filename[:-3]
|
40
|
+
version = int(module_name.split("_")[0])
|
41
|
+
module = importlib.import_module(f"migrations.pg.{module_name}")
|
42
|
+
if not hasattr(module, "migrate"):
|
43
|
+
raise Exception(f"Missing `migrate` function in {module_name}")
|
44
|
+
output.append((version, module))
|
45
|
+
output.sort()
|
46
|
+
return output
|
47
|
+
|
48
|
+
|
36
49
|
def get_migration_modules() -> list[tuple[types.ModuleType, int]]:
|
37
50
|
output = []
|
38
51
|
for filename in os.listdir(MIGRATION_DIR):
|
39
52
|
if filename.endswith(".py") and filename != "__init__.py":
|
40
53
|
module_name = filename[:-3]
|
41
54
|
version = int(module_name.split("_")[0])
|
42
|
-
|
43
|
-
module = getattr(migrations, module_name)
|
55
|
+
module = importlib.import_module(f"migrations.{module_name}")
|
44
56
|
if not hasattr(module, "migrate"):
|
45
57
|
raise Exception(f"Missing `migrate` function in {module_name}")
|
46
58
|
if not hasattr(module, "migrate_kb"):
|
@@ -49,17 +61,13 @@ def get_migration_modules() -> list[tuple[types.ModuleType, int]]:
|
|
49
61
|
return output
|
50
62
|
|
51
63
|
|
52
|
-
def get_migrations(
|
53
|
-
from_version: int = 0, to_version: int = 99999999
|
54
|
-
) -> list[Migration]:
|
64
|
+
def get_migrations(from_version: int = 0, to_version: int = 99999999) -> list[Migration]:
|
55
65
|
migrations: list[Migration] = []
|
56
66
|
for module, version in get_migration_modules():
|
57
67
|
migrations.append(Migration(version=version, module=module))
|
58
68
|
|
59
69
|
migrations.sort(key=lambda m: m.version)
|
60
|
-
return [
|
61
|
-
m for m in migrations if m.version > from_version and m.version <= to_version
|
62
|
-
]
|
70
|
+
return [m for m in migrations if m.version > from_version and m.version <= to_version]
|
63
71
|
|
64
72
|
|
65
73
|
@lru_cache(maxsize=None)
|
nucliadb/purge/__init__.py
CHANGED
@@ -18,10 +18,9 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import asyncio
|
21
|
+
import importlib.metadata
|
21
22
|
from typing import AsyncGenerator
|
22
23
|
|
23
|
-
import pkg_resources
|
24
|
-
|
25
24
|
from nucliadb.common.cluster.exceptions import NodeError, ShardNotFound
|
26
25
|
from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
|
27
26
|
from nucliadb.common.maindb.driver import Driver
|
@@ -31,6 +30,9 @@ from nucliadb.ingest.orm.knowledgebox import (
|
|
31
30
|
KB_TO_DELETE,
|
32
31
|
KB_TO_DELETE_BASE,
|
33
32
|
KB_TO_DELETE_STORAGE_BASE,
|
33
|
+
KB_VECTORSET_TO_DELETE,
|
34
|
+
KB_VECTORSET_TO_DELETE_BASE,
|
35
|
+
RESOURCE_TO_DELETE_STORAGE_BASE,
|
34
36
|
KnowledgeBox,
|
35
37
|
)
|
36
38
|
from nucliadb_telemetry import errors
|
@@ -41,7 +43,7 @@ from nucliadb_utils.utilities import get_storage
|
|
41
43
|
|
42
44
|
async def _iter_keys(driver: Driver, match: str) -> AsyncGenerator[str, None]:
|
43
45
|
async with driver.transaction(read_only=True) as keys_txn:
|
44
|
-
async for key in keys_txn.keys(match=match
|
46
|
+
async for key in keys_txn.keys(match=match):
|
45
47
|
yield key
|
46
48
|
|
47
49
|
|
@@ -52,9 +54,7 @@ async def purge_kb(driver: Driver):
|
|
52
54
|
try:
|
53
55
|
kbid = key.split("/")[2]
|
54
56
|
except Exception:
|
55
|
-
logger.warning(
|
56
|
-
f" X Skipping purge {key}, wrong key format, expected {KB_TO_DELETE_BASE}"
|
57
|
-
)
|
57
|
+
logger.warning(f" X Skipping purge {key}, wrong key format, expected {KB_TO_DELETE_BASE}")
|
58
58
|
continue
|
59
59
|
|
60
60
|
try:
|
@@ -62,15 +62,11 @@ async def purge_kb(driver: Driver):
|
|
62
62
|
logger.info(f" √ Successfully Purged {kbid}")
|
63
63
|
except ShardNotFound as exc:
|
64
64
|
errors.capture_exception(exc)
|
65
|
-
logger.error(
|
66
|
-
f" X At least one shard was unavailable while purging {kbid}, skipping"
|
67
|
-
)
|
65
|
+
logger.error(f" X At least one shard was unavailable while purging {kbid}, skipping")
|
68
66
|
continue
|
69
67
|
except NodeError as exc:
|
70
68
|
errors.capture_exception(exc)
|
71
|
-
logger.error(
|
72
|
-
f" X At least one node was unavailable while purging {kbid}, skipping"
|
73
|
-
)
|
69
|
+
logger.error(f" X At least one node was unavailable while purging {kbid}, skipping")
|
74
70
|
continue
|
75
71
|
|
76
72
|
except Exception as exc:
|
@@ -82,10 +78,10 @@ async def purge_kb(driver: Driver):
|
|
82
78
|
|
83
79
|
# Now delete the tikv delete mark
|
84
80
|
try:
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
81
|
+
async with driver.transaction() as txn:
|
82
|
+
key_to_purge = KB_TO_DELETE.format(kbid=kbid)
|
83
|
+
await txn.delete(key_to_purge)
|
84
|
+
await txn.commit()
|
89
85
|
logger.info(f" √ Deleted {key_to_purge}")
|
90
86
|
except Exception as exc:
|
91
87
|
errors.capture_exception(exc)
|
@@ -112,16 +108,12 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
|
|
112
108
|
|
113
109
|
delete_marker = False
|
114
110
|
if conflict:
|
115
|
-
logger.info(
|
116
|
-
f" . Nothing was deleted for {key}, (Bucket not yet empty), will try next time"
|
117
|
-
)
|
111
|
+
logger.info(f" . Nothing was deleted for {key}, (Bucket not yet empty), will try next time")
|
118
112
|
# Just in case something failed while setting a lifecycle policy to
|
119
113
|
# remove all elements from the bucket, reschedule it
|
120
114
|
await storage.schedule_delete_kb(kbid)
|
121
115
|
elif not deleted:
|
122
|
-
logger.info(
|
123
|
-
f" ! Expected bucket for {key} was not found, will delete marker"
|
124
|
-
)
|
116
|
+
logger.info(f" ! Expected bucket for {key} was not found, will delete marker")
|
125
117
|
delete_marker = True
|
126
118
|
elif deleted:
|
127
119
|
logger.info(" √ Bucket successfully deleted")
|
@@ -129,19 +121,122 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
|
|
129
121
|
|
130
122
|
if delete_marker:
|
131
123
|
try:
|
132
|
-
|
133
|
-
|
124
|
+
async with driver.transaction() as txn:
|
125
|
+
await txn.delete(key)
|
126
|
+
await txn.commit()
|
134
127
|
logger.info(f" √ Deleted storage deletion marker {key}")
|
135
128
|
except Exception as exc:
|
136
129
|
errors.capture_exception(exc)
|
137
130
|
logger.info(f" X Error while deleting key {key}")
|
138
|
-
await txn.abort()
|
139
|
-
else:
|
140
|
-
await txn.commit()
|
141
131
|
|
142
132
|
logger.info("FINISH PURGING KB STORAGE")
|
143
133
|
|
144
134
|
|
135
|
+
async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
|
136
|
+
"""
|
137
|
+
Remove from storage all resources marked as deleted.
|
138
|
+
|
139
|
+
Returns the number of resources purged.
|
140
|
+
"""
|
141
|
+
logger.info("Starting purge of deleted resource storage")
|
142
|
+
to_purge = await _count_resources_storage_to_purge(driver)
|
143
|
+
logger.info(f"Found {to_purge} resources to purge")
|
144
|
+
while True:
|
145
|
+
try:
|
146
|
+
purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
|
147
|
+
if not purged:
|
148
|
+
logger.info("No more resources to purge found")
|
149
|
+
return
|
150
|
+
logger.info(f"Purged {purged} resources")
|
151
|
+
|
152
|
+
except asyncio.CancelledError:
|
153
|
+
logger.info("Purge of deleted resource storage was cancelled")
|
154
|
+
return
|
155
|
+
|
156
|
+
|
157
|
+
async def _count_resources_storage_to_purge(driver: Driver) -> int:
|
158
|
+
"""
|
159
|
+
Count the number of resources marked as deleted in storage.
|
160
|
+
"""
|
161
|
+
async with driver.transaction(read_only=True) as txn:
|
162
|
+
return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
|
163
|
+
|
164
|
+
|
165
|
+
async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
|
166
|
+
"""
|
167
|
+
Remove from storage a batch of resources marked as deleted. Returns the
|
168
|
+
number of resources purged.
|
169
|
+
"""
|
170
|
+
# Get the keys of the resources to delete in batches of 100
|
171
|
+
to_delete_batch = []
|
172
|
+
async with driver.transaction(read_only=True) as txn:
|
173
|
+
async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
|
174
|
+
to_delete_batch.append(key)
|
175
|
+
|
176
|
+
if not to_delete_batch:
|
177
|
+
return 0
|
178
|
+
|
179
|
+
# Delete the resources blobs from storage
|
180
|
+
logger.info(f"Purging {len(to_delete_batch)} deleted resources")
|
181
|
+
tasks = []
|
182
|
+
for key in to_delete_batch:
|
183
|
+
kbid, resource_id = key.split("/")[-2:]
|
184
|
+
tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
|
185
|
+
await asyncio.gather(*tasks)
|
186
|
+
|
187
|
+
# Delete the schedule-to-delete keys
|
188
|
+
async with driver.transaction() as txn:
|
189
|
+
for key in to_delete_batch:
|
190
|
+
await txn.delete(key)
|
191
|
+
await txn.commit()
|
192
|
+
|
193
|
+
return len(to_delete_batch)
|
194
|
+
|
195
|
+
|
196
|
+
async def purge_kb_vectorsets(driver: Driver, storage: Storage):
|
197
|
+
"""Vectors for a vectorset are stored in a key inside each resource. Iterate
|
198
|
+
through all resources of the KB and remove any storage object containing
|
199
|
+
vectors for the specific vectorset to purge.
|
200
|
+
|
201
|
+
"""
|
202
|
+
logger.info("START PURGING KB VECTORSETS")
|
203
|
+
|
204
|
+
purged = []
|
205
|
+
async for key in _iter_keys(driver, KB_VECTORSET_TO_DELETE_BASE):
|
206
|
+
logger.info(f"Purging vectorsets {key}")
|
207
|
+
try:
|
208
|
+
_base, kbid, vectorset = key.lstrip("/").split("/")
|
209
|
+
except ValueError:
|
210
|
+
logger.info(f" X Skipping purge {key}, wrong key format, expected {KB_VECTORSET_TO_DELETE}")
|
211
|
+
continue
|
212
|
+
|
213
|
+
try:
|
214
|
+
async with driver.transaction(read_only=True) as txn:
|
215
|
+
kb = KnowledgeBox(txn, storage, kbid)
|
216
|
+
async for resource in kb.iterate_resources():
|
217
|
+
fields = await resource.get_fields(force=True)
|
218
|
+
# we don't need the maindb transaction anymore to remove vectors from storage
|
219
|
+
for field in fields.values():
|
220
|
+
await field.delete_vectors(vectorset)
|
221
|
+
except Exception as exc:
|
222
|
+
errors.capture_exception(exc)
|
223
|
+
logger.error(
|
224
|
+
f" X ERROR while executing KB vectorset purge, skipping",
|
225
|
+
exc_info=exc,
|
226
|
+
extra={"kbid": kbid},
|
227
|
+
)
|
228
|
+
continue
|
229
|
+
|
230
|
+
purged.append(key)
|
231
|
+
|
232
|
+
async with driver.transaction() as txn:
|
233
|
+
for key in purged:
|
234
|
+
await txn.delete(key)
|
235
|
+
await txn.commit()
|
236
|
+
|
237
|
+
logger.info("FINISH PURGING KB VECTORSETS")
|
238
|
+
|
239
|
+
|
145
240
|
async def main():
|
146
241
|
"""
|
147
242
|
This script will purge all knowledge boxes marked to be deleted in maindb.
|
@@ -153,17 +248,28 @@ async def main():
|
|
153
248
|
service_name=SERVICE_NAME,
|
154
249
|
)
|
155
250
|
try:
|
251
|
+
purge_resources_storage_task = asyncio.create_task(
|
252
|
+
purge_deleted_resource_storage(driver, storage)
|
253
|
+
)
|
156
254
|
await purge_kb(driver)
|
157
255
|
await purge_kb_storage(driver, storage)
|
256
|
+
await purge_kb_vectorsets(driver, storage)
|
257
|
+
await purge_resources_storage_task
|
258
|
+
except Exception as ex: # pragma: no cover
|
259
|
+
logger.exception("Unhandled exception on purge command")
|
260
|
+
errors.capture_exception(ex)
|
158
261
|
finally:
|
159
|
-
|
160
|
-
|
161
|
-
|
262
|
+
try:
|
263
|
+
purge_resources_storage_task.cancel()
|
264
|
+
await storage.finalize()
|
265
|
+
await teardown_driver()
|
266
|
+
await teardown_cluster()
|
267
|
+
except Exception: # pragma: no cover
|
268
|
+
logger.exception("Error tearing down utilities on purge command")
|
269
|
+
pass
|
162
270
|
|
163
271
|
|
164
272
|
def run() -> int: # pragma: no cover
|
165
273
|
setup_logging()
|
166
|
-
|
167
|
-
errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
|
168
|
-
|
274
|
+
errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
169
275
|
return asyncio.run(main())
|
nucliadb/purge/orphan_shards.py
CHANGED
@@ -19,11 +19,11 @@
|
|
19
19
|
|
20
20
|
import argparse
|
21
21
|
import asyncio
|
22
|
+
import importlib.metadata
|
22
23
|
from dataclasses import dataclass
|
23
24
|
from typing import Optional
|
24
25
|
|
25
|
-
import
|
26
|
-
from grpc.aio import AioRpcError # type: ignore
|
26
|
+
from grpc.aio import AioRpcError
|
27
27
|
|
28
28
|
from nucliadb.common import datamanagers
|
29
29
|
from nucliadb.common.cluster import manager
|
@@ -86,7 +86,7 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardLocation]:
|
|
86
86
|
orphan_shard_ids = indexed_shards.keys() - stored_shards.keys()
|
87
87
|
orphan_shards: dict[str, ShardLocation] = {}
|
88
88
|
unavailable_nodes: set[str] = set()
|
89
|
-
async with datamanagers.
|
89
|
+
async with datamanagers.with_ro_transaction() as txn:
|
90
90
|
for shard_id in orphan_shard_ids:
|
91
91
|
node_id = indexed_shards[shard_id].node_id
|
92
92
|
node = manager.get_index_node(node_id) # type: ignore
|
@@ -99,9 +99,7 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardLocation]:
|
|
99
99
|
# Shards with knwon KB ids can be checked and ignore those comming from
|
100
100
|
# an ongoing migration/rollover
|
101
101
|
if kbid != UNKNOWN_KB:
|
102
|
-
skip = await datamanagers.rollover.is_rollover_shard(
|
103
|
-
txn, kbid=kbid, shard_id=shard_id
|
104
|
-
)
|
102
|
+
skip = await datamanagers.rollover.is_rollover_shard(txn, kbid=kbid, shard_id=shard_id)
|
105
103
|
if skip:
|
106
104
|
continue
|
107
105
|
|
@@ -133,18 +131,14 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardLocation]:
|
|
133
131
|
try:
|
134
132
|
kb_shards = await shards_manager.get_shards_by_kbid(kbid)
|
135
133
|
except ShardsNotFound:
|
136
|
-
logger.warning(
|
137
|
-
"KB not found while looking for orphan shards", extra={"kbid": kbid}
|
138
|
-
)
|
134
|
+
logger.warning("KB not found while looking for orphan shards", extra={"kbid": kbid})
|
139
135
|
continue
|
140
136
|
else:
|
141
137
|
for shard_object_pb in kb_shards:
|
142
138
|
for shard_replica_pb in shard_object_pb.replicas:
|
143
139
|
shard_replica_id = shard_replica_pb.shard.id
|
144
140
|
node_id = shard_replica_pb.node
|
145
|
-
stored_shards[shard_replica_id] = ShardLocation(
|
146
|
-
kbid=kbid, node_id=node_id
|
147
|
-
)
|
141
|
+
stored_shards[shard_replica_id] = ShardLocation(kbid=kbid, node_id=node_id)
|
148
142
|
return stored_shards
|
149
143
|
|
150
144
|
|
@@ -264,6 +258,6 @@ async def main():
|
|
264
258
|
def run() -> int: # pragma: no cover
|
265
259
|
setup_logging()
|
266
260
|
|
267
|
-
errors.setup_error_handling(
|
261
|
+
errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
268
262
|
|
269
263
|
return asyncio.run(main())
|
nucliadb/reader/__init__.py
CHANGED
@@ -29,9 +29,7 @@ API_PREFIX = "api"
|
|
29
29
|
class EndpointFilter(logging.Filter):
|
30
30
|
def filter(self, record: logging.LogRecord) -> bool:
|
31
31
|
return (
|
32
|
-
record.args is not None
|
33
|
-
and len(record.args) >= 3
|
34
|
-
and record.args[2] not in ("/", "/metrics") # type: ignore
|
32
|
+
record.args is not None and len(record.args) >= 3 and record.args[2] not in ("/", "/metrics") # type: ignore
|
35
33
|
)
|
36
34
|
|
37
35
|
|