nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,13 +19,10 @@
|
|
19
19
|
#
|
20
20
|
import asyncio
|
21
21
|
import logging
|
22
|
-
import random
|
23
22
|
import uuid
|
24
23
|
from typing import Any, Awaitable, Callable, Optional
|
25
24
|
|
26
25
|
import backoff
|
27
|
-
from nucliadb_protos.knowledgebox_pb2 import SemanticModelMetadata # type: ignore
|
28
|
-
from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, TypeMessage
|
29
26
|
|
30
27
|
from nucliadb.common import datamanagers
|
31
28
|
from nucliadb.common.cluster.base import AbstractIndexNode
|
@@ -39,13 +36,15 @@ from nucliadb.common.cluster.exceptions import (
|
|
39
36
|
ShardsNotFound,
|
40
37
|
)
|
41
38
|
from nucliadb.common.maindb.driver import Transaction
|
39
|
+
from nucliadb.common.nidx import NIDX_ENABLED, get_nidx, get_nidx_api_client, get_nidx_fake_node
|
42
40
|
from nucliadb_protos import (
|
41
|
+
knowledgebox_pb2,
|
43
42
|
nodereader_pb2,
|
44
43
|
noderesources_pb2,
|
45
44
|
nodewriter_pb2,
|
46
|
-
utils_pb2,
|
47
45
|
writer_pb2,
|
48
46
|
)
|
47
|
+
from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, NewShardRequest, TypeMessage
|
49
48
|
from nucliadb_telemetry import errors
|
50
49
|
from nucliadb_utils.utilities import get_indexing, get_storage
|
51
50
|
|
@@ -71,6 +70,11 @@ def get_index_node(node_id: str) -> Optional[AbstractIndexNode]:
|
|
71
70
|
return INDEX_NODES.get(node_id)
|
72
71
|
|
73
72
|
|
73
|
+
def clear_index_nodes():
|
74
|
+
INDEX_NODES.clear()
|
75
|
+
READ_REPLICA_INDEX_NODES.clear()
|
76
|
+
|
77
|
+
|
74
78
|
def get_read_replica_node_ids(node_id: str) -> list[str]:
|
75
79
|
return list(READ_REPLICA_INDEX_NODES.get(node_id, set()))
|
76
80
|
|
@@ -122,7 +126,7 @@ def remove_index_node(node_id: str, primary_id: Optional[str] = None) -> None:
|
|
122
126
|
class KBShardManager:
|
123
127
|
# TODO: move to data manager
|
124
128
|
async def get_shards_by_kbid_inner(self, kbid: str) -> writer_pb2.Shards:
|
125
|
-
async with datamanagers.
|
129
|
+
async with datamanagers.with_ro_transaction() as txn:
|
126
130
|
result = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
127
131
|
if result is None:
|
128
132
|
# could be None because /shards doesn't exist, or beacause the
|
@@ -140,6 +144,8 @@ class KBShardManager:
|
|
140
144
|
kbid: str,
|
141
145
|
aw: Callable[[AbstractIndexNode, str], Awaitable[Any]],
|
142
146
|
timeout: float,
|
147
|
+
*,
|
148
|
+
use_nidx: bool,
|
143
149
|
use_read_replica_nodes: bool = False,
|
144
150
|
) -> list[Any]:
|
145
151
|
shards = await self.get_shards_by_kbid(kbid)
|
@@ -147,7 +153,7 @@ class KBShardManager:
|
|
147
153
|
|
148
154
|
for shard_obj in shards:
|
149
155
|
node, shard_id = choose_node(
|
150
|
-
shard_obj, use_read_replica_nodes=use_read_replica_nodes
|
156
|
+
shard_obj, use_nidx=use_nidx, use_read_replica_nodes=use_read_replica_nodes
|
151
157
|
)
|
152
158
|
if shard_id is None:
|
153
159
|
raise ShardNotFound("Found a node but not a shard")
|
@@ -156,7 +162,7 @@ class KBShardManager:
|
|
156
162
|
|
157
163
|
try:
|
158
164
|
results = await asyncio.wait_for(
|
159
|
-
asyncio.gather(*ops, return_exceptions=True),
|
165
|
+
asyncio.gather(*ops, return_exceptions=True),
|
160
166
|
timeout=timeout,
|
161
167
|
)
|
162
168
|
except asyncio.TimeoutError as exc:
|
@@ -169,7 +175,7 @@ class KBShardManager:
|
|
169
175
|
async def get_current_active_shard(
|
170
176
|
self, txn: Transaction, kbid: str
|
171
177
|
) -> Optional[writer_pb2.ShardObject]:
|
172
|
-
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
178
|
+
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
|
173
179
|
if kb_shards is None:
|
174
180
|
return None
|
175
181
|
|
@@ -183,8 +189,6 @@ class KBShardManager:
|
|
183
189
|
self,
|
184
190
|
txn: Transaction,
|
185
191
|
kbid: str,
|
186
|
-
semantic_model: SemanticModelMetadata,
|
187
|
-
release_channel: utils_pb2.ReleaseChannel.ValueType,
|
188
192
|
) -> writer_pb2.ShardObject:
|
189
193
|
try:
|
190
194
|
check_enough_nodes()
|
@@ -195,26 +199,25 @@ class KBShardManager:
|
|
195
199
|
)
|
196
200
|
raise
|
197
201
|
|
198
|
-
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
202
|
+
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
|
199
203
|
if kb_shards is None:
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
pass
|
204
|
+
msg = ("Attempting to create a shard for a KB when it has no stored shards in maindb",)
|
205
|
+
logger.error(msg, extra={"kbid": kbid})
|
206
|
+
raise ShardsNotFound(msg)
|
207
|
+
|
208
|
+
existing_kb_nodes = [replica.node for shard in kb_shards.shards for replica in shard.replicas]
|
209
|
+
nodes = sorted_primary_nodes(
|
210
|
+
avoid_nodes=existing_kb_nodes,
|
211
|
+
ignore_nodes=settings.drain_nodes,
|
212
|
+
)
|
210
213
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
nodes = sorted_primary_nodes(avoid_nodes=existing_kb_nodes)
|
214
|
+
vectorsets = {
|
215
|
+
vectorset_id: vectorset_config.vectorset_index_config
|
216
|
+
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
|
217
|
+
}
|
216
218
|
|
217
219
|
shard_uuid = uuid.uuid4().hex
|
220
|
+
|
218
221
|
shard = writer_pb2.ShardObject(shard=shard_uuid, read_only=False)
|
219
222
|
try:
|
220
223
|
# Attempt to create configured number of replicas
|
@@ -231,26 +234,56 @@ class KBShardManager:
|
|
231
234
|
if node is None:
|
232
235
|
logger.error(f"Node {node_id} is not found or not available")
|
233
236
|
continue
|
237
|
+
|
234
238
|
try:
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
+
if not vectorsets:
|
240
|
+
# bw/c KBs without vectorsets
|
241
|
+
is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
|
242
|
+
vector_index_config = nodewriter_pb2.VectorIndexConfig(
|
243
|
+
similarity=kb_shards.similarity,
|
244
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32,
|
245
|
+
vector_dimension=kb_shards.model.vector_dimension,
|
246
|
+
normalize_vectors=is_matryoshka,
|
247
|
+
)
|
248
|
+
|
249
|
+
shard_created = await node.new_shard(
|
250
|
+
kbid,
|
251
|
+
vector_index_config=vector_index_config,
|
252
|
+
)
|
253
|
+
|
254
|
+
else:
|
255
|
+
shard_created = await node.new_shard_with_vectorsets(
|
256
|
+
kbid,
|
257
|
+
vectorsets_configs=vectorsets,
|
258
|
+
)
|
259
|
+
|
260
|
+
except Exception as exc:
|
261
|
+
errors.capture_exception(exc)
|
262
|
+
logger.exception(
|
263
|
+
f"Error creating new shard for KB", extra={"kbid": kbid, "node_id": node}
|
239
264
|
)
|
240
|
-
except Exception as e:
|
241
|
-
errors.capture_exception(e)
|
242
|
-
logger.exception(f"Error creating new shard at {node}: {e}")
|
243
265
|
continue
|
244
266
|
|
245
267
|
replica = writer_pb2.ShardReplica(node=str(node_id))
|
246
268
|
replica.shard.CopyFrom(shard_created)
|
247
269
|
shard.replicas.append(replica)
|
248
270
|
replicas_created += 1
|
249
|
-
|
250
|
-
|
251
|
-
|
271
|
+
|
272
|
+
nidx_api = get_nidx_api_client()
|
273
|
+
if nidx_api:
|
274
|
+
req = NewShardRequest(
|
275
|
+
kbid=kbid,
|
276
|
+
vectorsets_configs=vectorsets,
|
277
|
+
)
|
278
|
+
|
279
|
+
resp = await nidx_api.NewShard(req) # type: ignore
|
280
|
+
shard.nidx_shard_id = resp.id
|
281
|
+
|
282
|
+
except Exception as exc:
|
283
|
+
errors.capture_exception(exc)
|
284
|
+
logger.exception(f"Unexpected error creating new shard for KB", extra={"kbid": kbid})
|
252
285
|
await self.rollback_shard(shard)
|
253
|
-
raise
|
286
|
+
raise exc
|
254
287
|
|
255
288
|
# set previous shard as read only, we only have one writable shard at a
|
256
289
|
# time
|
@@ -259,8 +292,8 @@ class KBShardManager:
|
|
259
292
|
|
260
293
|
# Append the created shard and make `actual` point to it.
|
261
294
|
kb_shards.shards.append(shard)
|
262
|
-
# B/c with Shards.actual
|
263
|
-
kb_shards.actual
|
295
|
+
# B/c with Shards.actual - we only use last created shard
|
296
|
+
kb_shards.actual = len(kb_shards.shards) - 1
|
264
297
|
|
265
298
|
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
|
266
299
|
|
@@ -273,7 +306,7 @@ class KBShardManager:
|
|
273
306
|
node = get_index_node(node_id)
|
274
307
|
if node is not None:
|
275
308
|
try:
|
276
|
-
logger.
|
309
|
+
logger.info(
|
277
310
|
"Deleting shard replica",
|
278
311
|
extra={"shard": replica_id, "node": node_id},
|
279
312
|
)
|
@@ -285,6 +318,17 @@ class KBShardManager:
|
|
285
318
|
exc_info=True,
|
286
319
|
)
|
287
320
|
|
321
|
+
nidx_api = get_nidx_api_client()
|
322
|
+
if nidx_api and shard.nidx_shard_id:
|
323
|
+
try:
|
324
|
+
await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
|
325
|
+
except Exception as rollback_error:
|
326
|
+
errors.capture_exception(rollback_error)
|
327
|
+
logger.error(
|
328
|
+
f"New shard rollback error. Nidx Shard: {shard.nidx_shard_id}",
|
329
|
+
exc_info=True,
|
330
|
+
)
|
331
|
+
|
288
332
|
def indexing_replicas(self, shard: writer_pb2.ShardObject) -> list[tuple[str, str]]:
|
289
333
|
"""
|
290
334
|
Returns the replica ids and nodes for the shard replicas
|
@@ -304,10 +348,9 @@ class KBShardManager:
|
|
304
348
|
) -> None:
|
305
349
|
indexing = get_indexing()
|
306
350
|
storage = await get_storage()
|
351
|
+
nidx = get_nidx()
|
307
352
|
|
308
|
-
await storage.delete_indexing(
|
309
|
-
resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard
|
310
|
-
)
|
353
|
+
await storage.delete_indexing(resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard)
|
311
354
|
|
312
355
|
for replica_id, node_id in self.indexing_replicas(shard):
|
313
356
|
indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
|
@@ -320,6 +363,13 @@ class KBShardManager:
|
|
320
363
|
indexpb.kbid = kb
|
321
364
|
await indexing.index(indexpb, node_id)
|
322
365
|
|
366
|
+
if nidx is not None and shard.nidx_shard_id:
|
367
|
+
nidxpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
|
368
|
+
nidxpb.shard = shard.nidx_shard_id
|
369
|
+
nidxpb.resource = uuid
|
370
|
+
nidxpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
|
371
|
+
await nidx.index(nidxpb)
|
372
|
+
|
323
373
|
async def add_resource(
|
324
374
|
self,
|
325
375
|
shard: writer_pb2.ShardObject,
|
@@ -330,6 +380,9 @@ class KBShardManager:
|
|
330
380
|
reindex_id: Optional[str] = None,
|
331
381
|
source: IndexMessageSource.ValueType = IndexMessageSource.PROCESSOR,
|
332
382
|
) -> None:
|
383
|
+
"""
|
384
|
+
Stores the Resource object in the object storage and sends an IndexMessage to the indexing Nats stream.
|
385
|
+
"""
|
333
386
|
if txid == -1 and reindex_id is None:
|
334
387
|
# This means we are injecting a complete resource via ingest gRPC
|
335
388
|
# outside of a transaction. We need to treat this as a reindex operation.
|
@@ -337,7 +390,7 @@ class KBShardManager:
|
|
337
390
|
|
338
391
|
storage = await get_storage()
|
339
392
|
indexing = get_indexing()
|
340
|
-
|
393
|
+
nidx = get_nidx()
|
341
394
|
indexpb = IndexMessage()
|
342
395
|
|
343
396
|
if reindex_id is not None:
|
@@ -364,34 +417,65 @@ class KBShardManager:
|
|
364
417
|
indexpb.shard = replica_id
|
365
418
|
await indexing.index(indexpb, node_id)
|
366
419
|
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
420
|
+
if nidx is not None and shard.nidx_shard_id:
|
421
|
+
indexpb.shard = shard.nidx_shard_id
|
422
|
+
await nidx.index(indexpb)
|
423
|
+
|
424
|
+
def should_create_new_shard(self, num_paragraphs: int) -> bool:
|
425
|
+
return num_paragraphs > settings.max_shard_paragraphs
|
372
426
|
|
373
427
|
async def maybe_create_new_shard(
|
374
428
|
self,
|
375
429
|
kbid: str,
|
376
430
|
num_paragraphs: int,
|
377
|
-
num_fields: int,
|
378
|
-
release_channel: utils_pb2.ReleaseChannel.ValueType = utils_pb2.ReleaseChannel.STABLE,
|
379
431
|
):
|
380
|
-
if not self.should_create_new_shard(num_paragraphs
|
432
|
+
if not self.should_create_new_shard(num_paragraphs):
|
381
433
|
return
|
382
434
|
|
383
|
-
logger.
|
435
|
+
logger.info({"message": "Adding shard", "kbid": kbid})
|
384
436
|
|
385
437
|
async with datamanagers.with_transaction() as txn:
|
386
|
-
|
387
|
-
await self.create_shard_by_kbid(
|
388
|
-
txn,
|
389
|
-
kbid,
|
390
|
-
semantic_model=model,
|
391
|
-
release_channel=release_channel,
|
392
|
-
)
|
438
|
+
await self.create_shard_by_kbid(txn, kbid)
|
393
439
|
await txn.commit()
|
394
440
|
|
441
|
+
async def create_vectorset(self, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
|
442
|
+
"""Create a new vectorset in all KB shards."""
|
443
|
+
|
444
|
+
async def _create_vectorset(node: AbstractIndexNode, shard_id: str):
|
445
|
+
vectorset_id = config.vectorset_id
|
446
|
+
index_config = config.vectorset_index_config
|
447
|
+
result = await node.add_vectorset(shard_id, vectorset_id, index_config)
|
448
|
+
if result.status != result.Status.OK:
|
449
|
+
raise NodeError(
|
450
|
+
f"Unable to create vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
|
451
|
+
)
|
452
|
+
|
453
|
+
await self.apply_for_all_shards(
|
454
|
+
kbid, _create_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
|
455
|
+
)
|
456
|
+
if NIDX_ENABLED:
|
457
|
+
await self.apply_for_all_shards(
|
458
|
+
kbid, _create_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
|
459
|
+
)
|
460
|
+
|
461
|
+
async def delete_vectorset(self, kbid: str, vectorset_id: str):
|
462
|
+
"""Delete a vectorset from all KB shards"""
|
463
|
+
|
464
|
+
async def _delete_vectorset(node: AbstractIndexNode, shard_id: str):
|
465
|
+
result = await node.remove_vectorset(shard_id, vectorset_id)
|
466
|
+
if result.status != result.Status.OK:
|
467
|
+
raise NodeError(
|
468
|
+
f"Unable to delete vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
|
469
|
+
)
|
470
|
+
|
471
|
+
await self.apply_for_all_shards(
|
472
|
+
kbid, _delete_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
|
473
|
+
)
|
474
|
+
if NIDX_ENABLED:
|
475
|
+
await self.apply_for_all_shards(
|
476
|
+
kbid, _delete_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
|
477
|
+
)
|
478
|
+
|
395
479
|
|
396
480
|
class StandaloneKBShardManager(KBShardManager):
|
397
481
|
max_ops_before_checks = 200
|
@@ -399,11 +483,9 @@ class StandaloneKBShardManager(KBShardManager):
|
|
399
483
|
def __init__(self):
|
400
484
|
super().__init__()
|
401
485
|
self._lock = asyncio.Lock()
|
402
|
-
self._change_count: dict[tuple[str, str], int] = {}
|
486
|
+
self._change_count: dict[tuple[str, str], int] = {}
|
403
487
|
|
404
|
-
async def _resource_change_event(
|
405
|
-
self, kbid: str, node_id: str, shard_id: str
|
406
|
-
) -> None:
|
488
|
+
async def _resource_change_event(self, kbid: str, node_id: str, shard_id: str) -> None:
|
407
489
|
if (node_id, shard_id) not in self._change_count:
|
408
490
|
self._change_count[(node_id, shard_id)] = 0
|
409
491
|
self._change_count[(node_id, shard_id)] += 1
|
@@ -416,19 +498,15 @@ class StandaloneKBShardManager(KBShardManager):
|
|
416
498
|
if index_node is None:
|
417
499
|
return
|
418
500
|
shard_info: noderesources_pb2.Shard = await index_node.reader.GetShard(
|
419
|
-
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
|
501
|
+
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
|
420
502
|
)
|
421
503
|
await self.maybe_create_new_shard(
|
422
504
|
kbid,
|
423
505
|
shard_info.paragraphs,
|
424
|
-
shard_info.fields,
|
425
|
-
shard_info.metadata.release_channel,
|
426
506
|
)
|
427
|
-
await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
|
507
|
+
await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
|
428
508
|
|
429
|
-
@backoff.on_exception(
|
430
|
-
backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5
|
431
|
-
)
|
509
|
+
@backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
|
432
510
|
async def delete_resource(
|
433
511
|
self,
|
434
512
|
shard: writer_pb2.ShardObject,
|
@@ -444,19 +522,21 @@ class StandaloneKBShardManager(KBShardManager):
|
|
444
522
|
req.shard_id = shardreplica.shard.id
|
445
523
|
index_node = get_index_node(shardreplica.node)
|
446
524
|
if index_node is None: # pragma: no cover
|
447
|
-
raise NodesUnsync(
|
448
|
-
f"Node {shardreplica.node} is not found or not available"
|
449
|
-
)
|
525
|
+
raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
|
450
526
|
await index_node.writer.RemoveResource(req) # type: ignore
|
451
527
|
asyncio.create_task(
|
452
|
-
self._resource_change_event(
|
453
|
-
kb, shardreplica.node, shardreplica.shard.id
|
454
|
-
)
|
528
|
+
self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
|
455
529
|
)
|
456
530
|
|
457
|
-
|
458
|
-
|
459
|
-
|
531
|
+
nidx = get_nidx()
|
532
|
+
if nidx is not None and shard.nidx_shard_id:
|
533
|
+
indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
|
534
|
+
indexpb.shard = shard.nidx_shard_id
|
535
|
+
indexpb.resource = uuid
|
536
|
+
indexpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
|
537
|
+
await nidx.index(indexpb)
|
538
|
+
|
539
|
+
@backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
|
460
540
|
async def add_resource(
|
461
541
|
self,
|
462
542
|
shard: writer_pb2.ShardObject,
|
@@ -467,21 +547,45 @@ class StandaloneKBShardManager(KBShardManager):
|
|
467
547
|
reindex_id: Optional[str] = None,
|
468
548
|
source: IndexMessageSource.ValueType = IndexMessageSource.PROCESSOR,
|
469
549
|
) -> None:
|
550
|
+
"""
|
551
|
+
Calls the node writer's SetResource method directly to store the resource in the node.
|
552
|
+
There is no queuing for standalone nodes at the moment -- indexing is done synchronously.
|
553
|
+
"""
|
470
554
|
index_node = None
|
471
555
|
for shardreplica in shard.replicas:
|
472
556
|
resource.shard_id = resource.resource.shard_id = shardreplica.shard.id
|
473
557
|
index_node = get_index_node(shardreplica.node)
|
474
558
|
if index_node is None: # pragma: no cover
|
475
|
-
raise NodesUnsync(
|
476
|
-
f"Node {shardreplica.node} is not found or not available"
|
477
|
-
)
|
559
|
+
raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
|
478
560
|
await index_node.writer.SetResource(resource) # type: ignore
|
479
561
|
asyncio.create_task(
|
480
|
-
self._resource_change_event(
|
481
|
-
|
482
|
-
|
562
|
+
self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
|
563
|
+
)
|
564
|
+
|
565
|
+
nidx = get_nidx()
|
566
|
+
if nidx is not None and shard.nidx_shard_id:
|
567
|
+
storage = await get_storage()
|
568
|
+
indexpb = IndexMessage()
|
569
|
+
storage_key = await storage.indexing(
|
570
|
+
resource, txid, partition, kb=kb, logical_shard=shard.shard
|
483
571
|
)
|
484
572
|
|
573
|
+
indexpb.typemessage = TypeMessage.CREATION
|
574
|
+
indexpb.storage_key = storage_key
|
575
|
+
indexpb.kbid = kb
|
576
|
+
indexpb.source = source
|
577
|
+
indexpb.resource = resource.resource.uuid
|
578
|
+
indexpb.shard = shard.nidx_shard_id
|
579
|
+
|
580
|
+
await nidx.index(indexpb)
|
581
|
+
|
582
|
+
# Delete indexing message (no longer needed)
|
583
|
+
try:
|
584
|
+
if storage.indexing_bucket:
|
585
|
+
await storage.delete_upload(storage_key, storage.indexing_bucket)
|
586
|
+
except Exception:
|
587
|
+
pass
|
588
|
+
|
485
589
|
|
486
590
|
def get_all_shard_nodes(
|
487
591
|
shard: writer_pb2.ShardObject,
|
@@ -513,6 +617,7 @@ def get_all_shard_nodes(
|
|
513
617
|
def choose_node(
|
514
618
|
shard: writer_pb2.ShardObject,
|
515
619
|
*,
|
620
|
+
use_nidx: bool,
|
516
621
|
target_shard_replicas: Optional[list[str]] = None,
|
517
622
|
use_read_replica_nodes: bool = False,
|
518
623
|
) -> tuple[AbstractIndexNode, str]:
|
@@ -528,6 +633,13 @@ def choose_node(
|
|
528
633
|
`target_shard_replicas` is the least preferent.
|
529
634
|
|
530
635
|
"""
|
636
|
+
|
637
|
+
# Use nidx if requested and enabled, fallback to node
|
638
|
+
if shard.nidx_shard_id and use_nidx:
|
639
|
+
fake_node = get_nidx_fake_node()
|
640
|
+
if fake_node:
|
641
|
+
return fake_node, shard.nidx_shard_id
|
642
|
+
|
531
643
|
target_shard_replicas = target_shard_replicas or []
|
532
644
|
|
533
645
|
shard_nodes = get_all_shard_nodes(shard, use_read_replicas=use_read_replica_nodes)
|
@@ -550,7 +662,10 @@ def choose_node(
|
|
550
662
|
ranked_nodes.setdefault(score, []).append((node, shard_replica_id))
|
551
663
|
|
552
664
|
top = ranked_nodes[max(ranked_nodes)]
|
553
|
-
|
665
|
+
# As shard replica ids are random numbers, we sort by shard replica id and choose its
|
666
|
+
# node to make sure we choose in deterministically but we don't favour any node in particular
|
667
|
+
top.sort(key=lambda x: x[1])
|
668
|
+
selected_node, shard_replica_id = top[0]
|
554
669
|
return selected_node, shard_replica_id
|
555
670
|
|
556
671
|
|
@@ -558,17 +673,17 @@ def check_enough_nodes():
|
|
558
673
|
"""
|
559
674
|
It raises an exception if it can't find enough nodes for the configured replicas.
|
560
675
|
"""
|
676
|
+
drain_nodes = settings.drain_nodes
|
561
677
|
target_replicas = settings.node_replicas
|
562
678
|
available_nodes = get_index_nodes()
|
679
|
+
available_nodes = [node for node in available_nodes if node.id not in drain_nodes]
|
563
680
|
if len(available_nodes) < target_replicas:
|
564
681
|
raise NodeClusterSmall(
|
565
682
|
f"Not enough nodes. Total: {len(available_nodes)}, Required: {target_replicas}"
|
566
683
|
)
|
567
684
|
if settings.max_node_replicas >= 0:
|
568
685
|
available_nodes = list(
|
569
|
-
filter(
|
570
|
-
lambda n: n.shard_count < settings.max_node_replicas, available_nodes # type: ignore
|
571
|
-
)
|
686
|
+
filter(lambda n: n.shard_count < settings.max_node_replicas, available_nodes)
|
572
687
|
)
|
573
688
|
if len(available_nodes) < target_replicas:
|
574
689
|
raise NodeClusterSmall(
|
@@ -576,26 +691,32 @@ def check_enough_nodes():
|
|
576
691
|
)
|
577
692
|
|
578
693
|
|
579
|
-
def sorted_primary_nodes(
|
694
|
+
def sorted_primary_nodes(
|
695
|
+
avoid_nodes: Optional[list[str]] = None,
|
696
|
+
ignore_nodes: Optional[list[str]] = None,
|
697
|
+
) -> list[str]:
|
580
698
|
"""
|
581
699
|
Returns the list of all primary node ids sorted by decreasing available
|
582
700
|
disk space (from more to less available disk reported).
|
583
701
|
|
584
|
-
|
702
|
+
Nodes in `avoid_nodes` are placed at the tail of the list.
|
703
|
+
Nodes in `ignore_nodes` are ignored and never returned.
|
585
704
|
"""
|
586
705
|
primary_nodes = get_index_nodes(include_secondary=False)
|
587
706
|
|
588
707
|
# Sort by available disk
|
589
|
-
|
590
|
-
|
591
|
-
)
|
592
|
-
available_node_ids = [node.id for node in sorted_primary_nodes]
|
708
|
+
sorted_nodes = sorted(primary_nodes, key=lambda n: n.available_disk, reverse=True)
|
709
|
+
available_node_ids = [node.id for node in sorted_nodes]
|
593
710
|
|
594
711
|
avoid_nodes = avoid_nodes or []
|
595
|
-
|
712
|
+
ignore_nodes = ignore_nodes or []
|
713
|
+
|
714
|
+
# Get the non-avoided nodes first
|
596
715
|
preferred_nodes = [nid for nid in available_node_ids if nid not in avoid_nodes]
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
716
|
+
|
717
|
+
# Add avoid_nodes to the end of the last nodes
|
718
|
+
result_nodes = preferred_nodes + [nid for nid in available_node_ids if nid not in preferred_nodes]
|
719
|
+
|
720
|
+
# Remove ignore_nodes from the list
|
721
|
+
result_nodes = [nid for nid in result_nodes if nid not in ignore_nodes]
|
722
|
+
return result_nodes
|