nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -23,7 +23,6 @@ import uuid
|
|
23
23
|
from typing import Any, Awaitable, Callable, Optional
|
24
24
|
|
25
25
|
import backoff
|
26
|
-
from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, TypeMessage
|
27
26
|
|
28
27
|
from nucliadb.common import datamanagers
|
29
28
|
from nucliadb.common.cluster.base import AbstractIndexNode
|
@@ -37,12 +36,15 @@ from nucliadb.common.cluster.exceptions import (
|
|
37
36
|
ShardsNotFound,
|
38
37
|
)
|
39
38
|
from nucliadb.common.maindb.driver import Transaction
|
39
|
+
from nucliadb.common.nidx import NIDX_ENABLED, get_nidx, get_nidx_api_client, get_nidx_fake_node
|
40
40
|
from nucliadb_protos import (
|
41
|
+
knowledgebox_pb2,
|
41
42
|
nodereader_pb2,
|
42
43
|
noderesources_pb2,
|
43
44
|
nodewriter_pb2,
|
44
45
|
writer_pb2,
|
45
46
|
)
|
47
|
+
from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, NewShardRequest, TypeMessage
|
46
48
|
from nucliadb_telemetry import errors
|
47
49
|
from nucliadb_utils.utilities import get_indexing, get_storage
|
48
50
|
|
@@ -124,7 +126,7 @@ def remove_index_node(node_id: str, primary_id: Optional[str] = None) -> None:
|
|
124
126
|
class KBShardManager:
|
125
127
|
# TODO: move to data manager
|
126
128
|
async def get_shards_by_kbid_inner(self, kbid: str) -> writer_pb2.Shards:
|
127
|
-
async with datamanagers.
|
129
|
+
async with datamanagers.with_ro_transaction() as txn:
|
128
130
|
result = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
129
131
|
if result is None:
|
130
132
|
# could be None because /shards doesn't exist, or because the
|
@@ -142,6 +144,8 @@ class KBShardManager:
|
|
142
144
|
kbid: str,
|
143
145
|
aw: Callable[[AbstractIndexNode, str], Awaitable[Any]],
|
144
146
|
timeout: float,
|
147
|
+
*,
|
148
|
+
use_nidx: bool,
|
145
149
|
use_read_replica_nodes: bool = False,
|
146
150
|
) -> list[Any]:
|
147
151
|
shards = await self.get_shards_by_kbid(kbid)
|
@@ -149,7 +153,7 @@ class KBShardManager:
|
|
149
153
|
|
150
154
|
for shard_obj in shards:
|
151
155
|
node, shard_id = choose_node(
|
152
|
-
shard_obj, use_read_replica_nodes=use_read_replica_nodes
|
156
|
+
shard_obj, use_nidx=use_nidx, use_read_replica_nodes=use_read_replica_nodes
|
153
157
|
)
|
154
158
|
if shard_id is None:
|
155
159
|
raise ShardNotFound("Found a node but not a shard")
|
@@ -158,7 +162,7 @@ class KBShardManager:
|
|
158
162
|
|
159
163
|
try:
|
160
164
|
results = await asyncio.wait_for(
|
161
|
-
asyncio.gather(*ops, return_exceptions=True),
|
165
|
+
asyncio.gather(*ops, return_exceptions=True),
|
162
166
|
timeout=timeout,
|
163
167
|
)
|
164
168
|
except asyncio.TimeoutError as exc:
|
@@ -171,7 +175,7 @@ class KBShardManager:
|
|
171
175
|
async def get_current_active_shard(
|
172
176
|
self, txn: Transaction, kbid: str
|
173
177
|
) -> Optional[writer_pb2.ShardObject]:
|
174
|
-
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
178
|
+
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
|
175
179
|
if kb_shards is None:
|
176
180
|
return None
|
177
181
|
|
@@ -195,23 +199,25 @@ class KBShardManager:
|
|
195
199
|
)
|
196
200
|
raise
|
197
201
|
|
198
|
-
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
202
|
+
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
|
199
203
|
if kb_shards is None:
|
200
|
-
msg = (
|
201
|
-
"Attempting to create a shard for a KB when it has no stored shards in maindb",
|
202
|
-
)
|
204
|
+
msg = ("Attempting to create a shard for a KB when it has no stored shards in maindb",)
|
203
205
|
logger.error(msg, extra={"kbid": kbid})
|
204
206
|
raise ShardsNotFound(msg)
|
205
207
|
|
206
|
-
existing_kb_nodes = [
|
207
|
-
replica.node for shard in kb_shards.shards for replica in shard.replicas
|
208
|
-
]
|
208
|
+
existing_kb_nodes = [replica.node for shard in kb_shards.shards for replica in shard.replicas]
|
209
209
|
nodes = sorted_primary_nodes(
|
210
210
|
avoid_nodes=existing_kb_nodes,
|
211
211
|
ignore_nodes=settings.drain_nodes,
|
212
212
|
)
|
213
213
|
|
214
|
+
vectorsets = {
|
215
|
+
vectorset_id: vectorset_config.vectorset_index_config
|
216
|
+
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
|
217
|
+
}
|
218
|
+
|
214
219
|
shard_uuid = uuid.uuid4().hex
|
220
|
+
|
215
221
|
shard = writer_pb2.ShardObject(shard=shard_uuid, read_only=False)
|
216
222
|
try:
|
217
223
|
# Attempt to create configured number of replicas
|
@@ -228,28 +234,56 @@ class KBShardManager:
|
|
228
234
|
if node is None:
|
229
235
|
logger.error(f"Node {node_id} is not found or not available")
|
230
236
|
continue
|
231
|
-
|
237
|
+
|
232
238
|
try:
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
239
|
+
if not vectorsets:
|
240
|
+
# bw/c KBs without vectorsets
|
241
|
+
is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
|
242
|
+
vector_index_config = nodewriter_pb2.VectorIndexConfig(
|
243
|
+
similarity=kb_shards.similarity,
|
244
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32,
|
245
|
+
vector_dimension=kb_shards.model.vector_dimension,
|
246
|
+
normalize_vectors=is_matryoshka,
|
247
|
+
)
|
248
|
+
|
249
|
+
shard_created = await node.new_shard(
|
250
|
+
kbid,
|
251
|
+
vector_index_config=vector_index_config,
|
252
|
+
)
|
253
|
+
|
254
|
+
else:
|
255
|
+
shard_created = await node.new_shard_with_vectorsets(
|
256
|
+
kbid,
|
257
|
+
vectorsets_configs=vectorsets,
|
258
|
+
)
|
259
|
+
|
260
|
+
except Exception as exc:
|
261
|
+
errors.capture_exception(exc)
|
262
|
+
logger.exception(
|
263
|
+
f"Error creating new shard for KB", extra={"kbid": kbid, "node_id": node}
|
238
264
|
)
|
239
|
-
except Exception as e:
|
240
|
-
errors.capture_exception(e)
|
241
|
-
logger.exception(f"Error creating new shard at {node}: {e}")
|
242
265
|
continue
|
243
266
|
|
244
267
|
replica = writer_pb2.ShardReplica(node=str(node_id))
|
245
268
|
replica.shard.CopyFrom(shard_created)
|
246
269
|
shard.replicas.append(replica)
|
247
270
|
replicas_created += 1
|
248
|
-
|
249
|
-
|
250
|
-
|
271
|
+
|
272
|
+
nidx_api = get_nidx_api_client()
|
273
|
+
if nidx_api:
|
274
|
+
req = NewShardRequest(
|
275
|
+
kbid=kbid,
|
276
|
+
vectorsets_configs=vectorsets,
|
277
|
+
)
|
278
|
+
|
279
|
+
resp = await nidx_api.NewShard(req) # type: ignore
|
280
|
+
shard.nidx_shard_id = resp.id
|
281
|
+
|
282
|
+
except Exception as exc:
|
283
|
+
errors.capture_exception(exc)
|
284
|
+
logger.exception(f"Unexpected error creating new shard for KB", extra={"kbid": kbid})
|
251
285
|
await self.rollback_shard(shard)
|
252
|
-
raise
|
286
|
+
raise exc
|
253
287
|
|
254
288
|
# set previous shard as read only, we only have one writable shard at a
|
255
289
|
# time
|
@@ -284,6 +318,17 @@ class KBShardManager:
|
|
284
318
|
exc_info=True,
|
285
319
|
)
|
286
320
|
|
321
|
+
nidx_api = get_nidx_api_client()
|
322
|
+
if nidx_api and shard.nidx_shard_id:
|
323
|
+
try:
|
324
|
+
await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
|
325
|
+
except Exception as rollback_error:
|
326
|
+
errors.capture_exception(rollback_error)
|
327
|
+
logger.error(
|
328
|
+
f"New shard rollback error. Nidx Shard: {shard.nidx_shard_id}",
|
329
|
+
exc_info=True,
|
330
|
+
)
|
331
|
+
|
287
332
|
def indexing_replicas(self, shard: writer_pb2.ShardObject) -> list[tuple[str, str]]:
|
288
333
|
"""
|
289
334
|
Returns the replica ids and nodes for the shard replicas
|
@@ -303,10 +348,9 @@ class KBShardManager:
|
|
303
348
|
) -> None:
|
304
349
|
indexing = get_indexing()
|
305
350
|
storage = await get_storage()
|
351
|
+
nidx = get_nidx()
|
306
352
|
|
307
|
-
await storage.delete_indexing(
|
308
|
-
resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard
|
309
|
-
)
|
353
|
+
await storage.delete_indexing(resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard)
|
310
354
|
|
311
355
|
for replica_id, node_id in self.indexing_replicas(shard):
|
312
356
|
indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
|
@@ -319,6 +363,13 @@ class KBShardManager:
|
|
319
363
|
indexpb.kbid = kb
|
320
364
|
await indexing.index(indexpb, node_id)
|
321
365
|
|
366
|
+
if nidx is not None and shard.nidx_shard_id:
|
367
|
+
nidxpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
|
368
|
+
nidxpb.shard = shard.nidx_shard_id
|
369
|
+
nidxpb.resource = uuid
|
370
|
+
nidxpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
|
371
|
+
await nidx.index(nidxpb)
|
372
|
+
|
322
373
|
async def add_resource(
|
323
374
|
self,
|
324
375
|
shard: writer_pb2.ShardObject,
|
@@ -339,7 +390,7 @@ class KBShardManager:
|
|
339
390
|
|
340
391
|
storage = await get_storage()
|
341
392
|
indexing = get_indexing()
|
342
|
-
|
393
|
+
nidx = get_nidx()
|
343
394
|
indexpb = IndexMessage()
|
344
395
|
|
345
396
|
if reindex_id is not None:
|
@@ -366,6 +417,10 @@ class KBShardManager:
|
|
366
417
|
indexpb.shard = replica_id
|
367
418
|
await indexing.index(indexpb, node_id)
|
368
419
|
|
420
|
+
if nidx is not None and shard.nidx_shard_id:
|
421
|
+
indexpb.shard = shard.nidx_shard_id
|
422
|
+
await nidx.index(indexpb)
|
423
|
+
|
369
424
|
def should_create_new_shard(self, num_paragraphs: int) -> bool:
|
370
425
|
return num_paragraphs > settings.max_shard_paragraphs
|
371
426
|
|
@@ -383,6 +438,44 @@ class KBShardManager:
|
|
383
438
|
await self.create_shard_by_kbid(txn, kbid)
|
384
439
|
await txn.commit()
|
385
440
|
|
441
|
+
async def create_vectorset(self, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
|
442
|
+
"""Create a new vectorset in all KB shards."""
|
443
|
+
|
444
|
+
async def _create_vectorset(node: AbstractIndexNode, shard_id: str):
|
445
|
+
vectorset_id = config.vectorset_id
|
446
|
+
index_config = config.vectorset_index_config
|
447
|
+
result = await node.add_vectorset(shard_id, vectorset_id, index_config)
|
448
|
+
if result.status != result.Status.OK:
|
449
|
+
raise NodeError(
|
450
|
+
f"Unable to create vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
|
451
|
+
)
|
452
|
+
|
453
|
+
await self.apply_for_all_shards(
|
454
|
+
kbid, _create_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
|
455
|
+
)
|
456
|
+
if NIDX_ENABLED:
|
457
|
+
await self.apply_for_all_shards(
|
458
|
+
kbid, _create_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
|
459
|
+
)
|
460
|
+
|
461
|
+
async def delete_vectorset(self, kbid: str, vectorset_id: str):
|
462
|
+
"""Delete a vectorset from all KB shards"""
|
463
|
+
|
464
|
+
async def _delete_vectorset(node: AbstractIndexNode, shard_id: str):
|
465
|
+
result = await node.remove_vectorset(shard_id, vectorset_id)
|
466
|
+
if result.status != result.Status.OK:
|
467
|
+
raise NodeError(
|
468
|
+
f"Unable to delete vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
|
469
|
+
)
|
470
|
+
|
471
|
+
await self.apply_for_all_shards(
|
472
|
+
kbid, _delete_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
|
473
|
+
)
|
474
|
+
if NIDX_ENABLED:
|
475
|
+
await self.apply_for_all_shards(
|
476
|
+
kbid, _delete_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
|
477
|
+
)
|
478
|
+
|
386
479
|
|
387
480
|
class StandaloneKBShardManager(KBShardManager):
|
388
481
|
max_ops_before_checks = 200
|
@@ -390,11 +483,9 @@ class StandaloneKBShardManager(KBShardManager):
|
|
390
483
|
def __init__(self):
|
391
484
|
super().__init__()
|
392
485
|
self._lock = asyncio.Lock()
|
393
|
-
self._change_count: dict[tuple[str, str], int] = {}
|
486
|
+
self._change_count: dict[tuple[str, str], int] = {}
|
394
487
|
|
395
|
-
async def _resource_change_event(
|
396
|
-
self, kbid: str, node_id: str, shard_id: str
|
397
|
-
) -> None:
|
488
|
+
async def _resource_change_event(self, kbid: str, node_id: str, shard_id: str) -> None:
|
398
489
|
if (node_id, shard_id) not in self._change_count:
|
399
490
|
self._change_count[(node_id, shard_id)] = 0
|
400
491
|
self._change_count[(node_id, shard_id)] += 1
|
@@ -407,17 +498,15 @@ class StandaloneKBShardManager(KBShardManager):
|
|
407
498
|
if index_node is None:
|
408
499
|
return
|
409
500
|
shard_info: noderesources_pb2.Shard = await index_node.reader.GetShard(
|
410
|
-
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
|
501
|
+
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
|
411
502
|
)
|
412
503
|
await self.maybe_create_new_shard(
|
413
504
|
kbid,
|
414
505
|
shard_info.paragraphs,
|
415
506
|
)
|
416
|
-
await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
|
507
|
+
await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
|
417
508
|
|
418
|
-
@backoff.on_exception(
|
419
|
-
backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5
|
420
|
-
)
|
509
|
+
@backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
|
421
510
|
async def delete_resource(
|
422
511
|
self,
|
423
512
|
shard: writer_pb2.ShardObject,
|
@@ -433,19 +522,21 @@ class StandaloneKBShardManager(KBShardManager):
|
|
433
522
|
req.shard_id = shardreplica.shard.id
|
434
523
|
index_node = get_index_node(shardreplica.node)
|
435
524
|
if index_node is None: # pragma: no cover
|
436
|
-
raise NodesUnsync(
|
437
|
-
f"Node {shardreplica.node} is not found or not available"
|
438
|
-
)
|
525
|
+
raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
|
439
526
|
await index_node.writer.RemoveResource(req) # type: ignore
|
440
527
|
asyncio.create_task(
|
441
|
-
self._resource_change_event(
|
442
|
-
kb, shardreplica.node, shardreplica.shard.id
|
443
|
-
)
|
528
|
+
self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
|
444
529
|
)
|
445
530
|
|
446
|
-
|
447
|
-
|
448
|
-
|
531
|
+
nidx = get_nidx()
|
532
|
+
if nidx is not None and shard.nidx_shard_id:
|
533
|
+
indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
|
534
|
+
indexpb.shard = shard.nidx_shard_id
|
535
|
+
indexpb.resource = uuid
|
536
|
+
indexpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
|
537
|
+
await nidx.index(indexpb)
|
538
|
+
|
539
|
+
@backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
|
449
540
|
async def add_resource(
|
450
541
|
self,
|
451
542
|
shard: writer_pb2.ShardObject,
|
@@ -465,16 +556,36 @@ class StandaloneKBShardManager(KBShardManager):
|
|
465
556
|
resource.shard_id = resource.resource.shard_id = shardreplica.shard.id
|
466
557
|
index_node = get_index_node(shardreplica.node)
|
467
558
|
if index_node is None: # pragma: no cover
|
468
|
-
raise NodesUnsync(
|
469
|
-
f"Node {shardreplica.node} is not found or not available"
|
470
|
-
)
|
559
|
+
raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
|
471
560
|
await index_node.writer.SetResource(resource) # type: ignore
|
472
561
|
asyncio.create_task(
|
473
|
-
self._resource_change_event(
|
474
|
-
kb, shardreplica.node, shardreplica.shard.id
|
475
|
-
)
|
562
|
+
self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
|
476
563
|
)
|
477
564
|
|
565
|
+
nidx = get_nidx()
|
566
|
+
if nidx is not None and shard.nidx_shard_id:
|
567
|
+
storage = await get_storage()
|
568
|
+
indexpb = IndexMessage()
|
569
|
+
storage_key = await storage.indexing(
|
570
|
+
resource, txid, partition, kb=kb, logical_shard=shard.shard
|
571
|
+
)
|
572
|
+
|
573
|
+
indexpb.typemessage = TypeMessage.CREATION
|
574
|
+
indexpb.storage_key = storage_key
|
575
|
+
indexpb.kbid = kb
|
576
|
+
indexpb.source = source
|
577
|
+
indexpb.resource = resource.resource.uuid
|
578
|
+
indexpb.shard = shard.nidx_shard_id
|
579
|
+
|
580
|
+
await nidx.index(indexpb)
|
581
|
+
|
582
|
+
# Delete indexing message (no longer needed)
|
583
|
+
try:
|
584
|
+
if storage.indexing_bucket:
|
585
|
+
await storage.delete_upload(storage_key, storage.indexing_bucket)
|
586
|
+
except Exception:
|
587
|
+
pass
|
588
|
+
|
478
589
|
|
479
590
|
def get_all_shard_nodes(
|
480
591
|
shard: writer_pb2.ShardObject,
|
@@ -506,6 +617,7 @@ def get_all_shard_nodes(
|
|
506
617
|
def choose_node(
|
507
618
|
shard: writer_pb2.ShardObject,
|
508
619
|
*,
|
620
|
+
use_nidx: bool,
|
509
621
|
target_shard_replicas: Optional[list[str]] = None,
|
510
622
|
use_read_replica_nodes: bool = False,
|
511
623
|
) -> tuple[AbstractIndexNode, str]:
|
@@ -521,6 +633,13 @@ def choose_node(
|
|
521
633
|
`target_shard_replicas` is the least preferent.
|
522
634
|
|
523
635
|
"""
|
636
|
+
|
637
|
+
# Use nidx if requested and enabled, fallback to node
|
638
|
+
if shard.nidx_shard_id and use_nidx:
|
639
|
+
fake_node = get_nidx_fake_node()
|
640
|
+
if fake_node:
|
641
|
+
return fake_node, shard.nidx_shard_id
|
642
|
+
|
524
643
|
target_shard_replicas = target_shard_replicas or []
|
525
644
|
|
526
645
|
shard_nodes = get_all_shard_nodes(shard, use_read_replicas=use_read_replica_nodes)
|
@@ -564,9 +683,7 @@ def check_enough_nodes():
|
|
564
683
|
)
|
565
684
|
if settings.max_node_replicas >= 0:
|
566
685
|
available_nodes = list(
|
567
|
-
filter(
|
568
|
-
lambda n: n.shard_count < settings.max_node_replicas, available_nodes # type: ignore
|
569
|
-
)
|
686
|
+
filter(lambda n: n.shard_count < settings.max_node_replicas, available_nodes)
|
570
687
|
)
|
571
688
|
if len(available_nodes) < target_replicas:
|
572
689
|
raise NodeClusterSmall(
|
@@ -598,9 +715,7 @@ def sorted_primary_nodes(
|
|
598
715
|
preferred_nodes = [nid for nid in available_node_ids if nid not in avoid_nodes]
|
599
716
|
|
600
717
|
# Add avoid_nodes to the end of the last nodes
|
601
|
-
result_nodes = preferred_nodes + [
|
602
|
-
nid for nid in available_node_ids if nid not in preferred_nodes
|
603
|
-
]
|
718
|
+
result_nodes = preferred_nodes + [nid for nid in available_node_ids if nid not in preferred_nodes]
|
604
719
|
|
605
720
|
# Remove ignore_nodes from the list
|
606
721
|
result_nodes = [nid for nid in result_nodes if nid not in ignore_nodes]
|
@@ -25,6 +25,7 @@ from nucliadb.common.cluster.manager import choose_node
|
|
25
25
|
from nucliadb.common.cluster.utils import get_shard_manager
|
26
26
|
from nucliadb.common.context import ApplicationContext
|
27
27
|
from nucliadb_protos import nodereader_pb2, noderesources_pb2
|
28
|
+
from nucliadb_telemetry import errors
|
28
29
|
from nucliadb_telemetry.logs import setup_logging
|
29
30
|
from nucliadb_telemetry.utils import setup_telemetry
|
30
31
|
from nucliadb_utils import const
|
@@ -43,28 +44,26 @@ async def get_shards_paragraphs(kbid: str) -> list[tuple[str, int]]:
|
|
43
44
|
"""
|
44
45
|
Ordered shard -> num paragraph by number of paragraphs
|
45
46
|
"""
|
46
|
-
async with datamanagers.
|
47
|
+
async with datamanagers.with_ro_transaction() as txn:
|
47
48
|
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
48
49
|
if kb_shards is None:
|
49
50
|
return []
|
50
51
|
|
51
52
|
results = {}
|
52
53
|
for shard_meta in kb_shards.shards:
|
53
|
-
node
|
54
|
+
# Rebalance using node as source of truth. But it will rebalance nidx
|
55
|
+
node, shard_id = choose_node(shard_meta, use_nidx=False)
|
54
56
|
shard_data: nodereader_pb2.Shard = await node.reader.GetShard(
|
55
57
|
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
|
56
58
|
)
|
57
59
|
results[shard_meta.shard] = shard_data.paragraphs
|
58
60
|
|
59
|
-
return [
|
60
|
-
(shard, paragraphs)
|
61
|
-
for shard, paragraphs in sorted(results.items(), key=lambda x: x[1])
|
62
|
-
]
|
61
|
+
return [(shard, paragraphs) for shard, paragraphs in sorted(results.items(), key=lambda x: x[1])]
|
63
62
|
|
64
63
|
|
65
64
|
async def maybe_add_shard(kbid: str) -> None:
|
66
65
|
async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
|
67
|
-
async with datamanagers.
|
66
|
+
async with datamanagers.with_ro_transaction() as txn:
|
68
67
|
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
69
68
|
if kb_shards is None:
|
70
69
|
return
|
@@ -89,12 +88,10 @@ async def move_set_of_kb_resources(
|
|
89
88
|
to_shard_id: str,
|
90
89
|
count: int = 20,
|
91
90
|
) -> None:
|
92
|
-
async with datamanagers.
|
91
|
+
async with datamanagers.with_ro_transaction() as txn:
|
93
92
|
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
94
93
|
if kb_shards is None: # pragma: no cover
|
95
|
-
logger.warning(
|
96
|
-
"No shards found for kb. This should not happen.", extra={"kbid": kbid}
|
97
|
-
)
|
94
|
+
logger.warning("No shards found for kb. This should not happen.", extra={"kbid": kbid})
|
98
95
|
return
|
99
96
|
|
100
97
|
logger.info(
|
@@ -105,7 +102,7 @@ async def move_set_of_kb_resources(
|
|
105
102
|
from_shard = [s for s in kb_shards.shards if s.shard == from_shard_id][0]
|
106
103
|
to_shard = [s for s in kb_shards.shards if s.shard == to_shard_id][0]
|
107
104
|
|
108
|
-
from_node, from_shard_replica_id = choose_node(from_shard)
|
105
|
+
from_node, from_shard_replica_id = choose_node(from_shard, use_nidx=False)
|
109
106
|
search_response: nodereader_pb2.SearchResponse = await from_node.reader.Search( # type: ignore
|
110
107
|
nodereader_pb2.SearchRequest(
|
111
108
|
shard=from_shard_replica_id,
|
@@ -122,13 +119,11 @@ async def move_set_of_kb_resources(
|
|
122
119
|
async with (
|
123
120
|
datamanagers.with_transaction() as txn,
|
124
121
|
locking.distributed_lock(
|
125
|
-
locking.RESOURCE_INDEX_LOCK.format(
|
126
|
-
kbid=kbid, resource_id=resource_id
|
127
|
-
)
|
122
|
+
locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=resource_id)
|
128
123
|
),
|
129
124
|
):
|
130
125
|
found_shard_id = await datamanagers.resources.get_resource_shard_id(
|
131
|
-
txn, kbid=kbid, rid=resource_id
|
126
|
+
txn, kbid=kbid, rid=resource_id, for_update=True
|
132
127
|
)
|
133
128
|
if found_shard_id is None:
|
134
129
|
# resource deleted
|
@@ -175,9 +170,7 @@ async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
|
|
175
170
|
|
176
171
|
shard_paragraphs = await get_shards_paragraphs(kbid)
|
177
172
|
rebalanced_shards = set()
|
178
|
-
while any(
|
179
|
-
paragraphs > settings.max_shard_paragraphs for _, paragraphs in shard_paragraphs
|
180
|
-
):
|
173
|
+
while any(paragraphs > settings.max_shard_paragraphs for _, paragraphs in shard_paragraphs):
|
181
174
|
# find the shard with the least/most paragraphs
|
182
175
|
smallest_shard = shard_paragraphs[0][0]
|
183
176
|
largest_shard = shard_paragraphs[-1][0]
|
@@ -198,13 +191,13 @@ async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
|
|
198
191
|
async def run(context: ApplicationContext) -> None:
|
199
192
|
try:
|
200
193
|
async with locking.distributed_lock(REBALANCE_LOCK):
|
194
|
+
# get all kb ids
|
195
|
+
async with datamanagers.with_ro_transaction() as txn:
|
196
|
+
kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
|
201
197
|
# go through each kb and see if shards need to be reduced in size
|
202
|
-
|
203
|
-
async
|
204
|
-
|
205
|
-
locking.KB_SHARDS_LOCK.format(kbid=kbid)
|
206
|
-
):
|
207
|
-
await rebalance_kb(context, kbid)
|
198
|
+
for kbid in kbids:
|
199
|
+
async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
|
200
|
+
await rebalance_kb(context, kbid)
|
208
201
|
except locking.ResourceLocked as exc:
|
209
202
|
if exc.key == REBALANCE_LOCK:
|
210
203
|
logger.warning("Another rebalance process is already running.")
|
@@ -222,13 +215,18 @@ async def run_command(context: ApplicationContext) -> None:
|
|
222
215
|
|
223
216
|
try:
|
224
217
|
await run(context)
|
225
|
-
except (asyncio.CancelledError, RuntimeError):
|
218
|
+
except (asyncio.CancelledError, RuntimeError): # pragma: no cover
|
226
219
|
return
|
227
|
-
except Exception:
|
220
|
+
except Exception as ex: # pragma: no cover
|
228
221
|
logger.exception("Failed to run rebalancing.")
|
222
|
+
errors.capture_exception(ex)
|
229
223
|
finally:
|
230
|
-
|
231
|
-
|
224
|
+
try:
|
225
|
+
await metrics_server.shutdown()
|
226
|
+
await context.finalize()
|
227
|
+
except Exception: # pragma: no cover
|
228
|
+
logger.exception("Error tearing down utilities on rebalance command")
|
229
|
+
pass
|
232
230
|
|
233
231
|
|
234
232
|
def main():
|