nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@
|
|
19
19
|
#
|
20
20
|
import argparse
|
21
21
|
import asyncio
|
22
|
-
import enum
|
23
22
|
import logging
|
24
23
|
from datetime import datetime
|
25
24
|
from typing import Optional
|
@@ -27,145 +26,213 @@ from typing import Optional
|
|
27
26
|
from nucliadb.common import datamanagers, locking
|
28
27
|
from nucliadb.common.cluster import manager as cluster_manager
|
29
28
|
from nucliadb.common.context import ApplicationContext
|
30
|
-
from
|
29
|
+
from nucliadb.common.datamanagers.rollover import RolloverState, RolloverStateNotFoundError
|
30
|
+
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
31
|
+
from nucliadb.common.external_index_providers.manager import (
|
32
|
+
get_external_index_manager,
|
33
|
+
)
|
34
|
+
from nucliadb.common.nidx import get_nidx_fake_node
|
35
|
+
from nucliadb_protos import nodewriter_pb2, writer_pb2
|
31
36
|
from nucliadb_telemetry import errors
|
32
37
|
|
33
38
|
from .manager import get_index_node
|
34
39
|
from .settings import settings
|
35
|
-
from .utils import
|
40
|
+
from .utils import (
|
41
|
+
delete_resource_from_shard,
|
42
|
+
get_resource,
|
43
|
+
get_resource_index_message,
|
44
|
+
index_resource_to_shard,
|
45
|
+
wait_for_node,
|
46
|
+
)
|
36
47
|
|
37
48
|
logger = logging.getLogger(__name__)
|
38
49
|
|
39
50
|
|
40
|
-
class
|
41
|
-
|
42
|
-
RESOURCES_INDEXED = "resources_indexed"
|
43
|
-
RESOURCES_VALIDATED = "resources_validated"
|
44
|
-
|
51
|
+
class UnexpectedRolloverError(Exception):
|
52
|
+
pass
|
45
53
|
|
46
|
-
def _get_rollover_status(
|
47
|
-
rollover_shards: writer_pb2.Shards, status: RolloverStatus
|
48
|
-
) -> bool:
|
49
|
-
return rollover_shards.extra.get(status.value) == "true"
|
50
54
|
|
55
|
+
async def create_rollover_index(
|
56
|
+
app_context: ApplicationContext,
|
57
|
+
kbid: str,
|
58
|
+
drain_nodes: Optional[list[str]] = None,
|
59
|
+
external: Optional[ExternalIndexManager] = None,
|
60
|
+
) -> None:
|
61
|
+
"""
|
62
|
+
Creates a new index for a knowledgebox in the index node cluster (and to the external index provider if configured).
|
63
|
+
For the external index case, we still need the shard on the index node cluster to be created because
|
64
|
+
it is used to store the rollover state during the rollover. However, the actual indexing will be done
|
65
|
+
by the external index provider.
|
66
|
+
"""
|
67
|
+
await create_rollover_shards(app_context, kbid, drain_nodes=drain_nodes)
|
68
|
+
if external is not None:
|
69
|
+
if external.supports_rollover:
|
70
|
+
await create_rollover_external_index(kbid, external)
|
71
|
+
else:
|
72
|
+
logger.info(
|
73
|
+
"External index provider does not support rollover",
|
74
|
+
extra={"kbid": kbid, "external_index_provider": external.type.value},
|
75
|
+
)
|
51
76
|
|
52
|
-
def _set_rollover_status(rollover_shards: writer_pb2.Shards, status: RolloverStatus):
|
53
|
-
rollover_shards.extra[status.value] = "true"
|
54
77
|
|
78
|
+
async def create_rollover_external_index(kbid: str, external: ExternalIndexManager) -> None:
|
79
|
+
extra = {"kbid": kbid, "external_index_provider": external.type.value}
|
80
|
+
async with datamanagers.with_ro_transaction() as txn:
|
81
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
82
|
+
if state.external_index_created:
|
83
|
+
logger.info("Rollover external index already created, skipping", extra=extra)
|
84
|
+
return
|
55
85
|
|
56
|
-
|
57
|
-
|
58
|
-
|
86
|
+
logger.info("Creating rollover external index", extra=extra)
|
87
|
+
async with datamanagers.with_ro_transaction() as txn:
|
88
|
+
stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
|
89
|
+
if stored_metadata is None:
|
90
|
+
raise UnexpectedRolloverError("External index metadata not found")
|
59
91
|
|
92
|
+
rollover_metadata = await external.rollover_create_indexes(stored_metadata)
|
60
93
|
|
61
|
-
|
62
|
-
|
94
|
+
async with datamanagers.with_rw_transaction() as txn:
|
95
|
+
await datamanagers.rollover.update_kb_rollover_external_index_metadata(
|
96
|
+
txn, kbid=kbid, metadata=rollover_metadata
|
97
|
+
)
|
98
|
+
state.external_index_created = True
|
99
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
100
|
+
await txn.commit()
|
63
101
|
|
64
102
|
|
65
103
|
async def create_rollover_shards(
|
66
104
|
app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
|
67
105
|
) -> writer_pb2.Shards:
|
68
106
|
"""
|
69
|
-
Creates
|
107
|
+
Creates new index node shards for a rollover operation.
|
70
108
|
If drain_nodes is provided, no replicas will be created on those nodes.
|
71
109
|
"""
|
110
|
+
|
72
111
|
logger.info("Creating rollover shards", extra={"kbid": kbid})
|
73
112
|
sm = app_context.shard_manager
|
113
|
+
nidx_node = get_nidx_fake_node()
|
74
114
|
|
75
|
-
async with datamanagers.
|
76
|
-
|
77
|
-
txn, kbid=kbid
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
115
|
+
async with datamanagers.with_ro_transaction() as txn:
|
116
|
+
try:
|
117
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
118
|
+
except RolloverStateNotFoundError:
|
119
|
+
# State is not set yet, create it
|
120
|
+
state = RolloverState(
|
121
|
+
rollover_shards_created=False,
|
122
|
+
external_index_created=False,
|
123
|
+
resources_scheduled=False,
|
124
|
+
resources_indexed=False,
|
125
|
+
cutover_shards=False,
|
126
|
+
cutover_external_index=False,
|
127
|
+
resources_validated=False,
|
128
|
+
)
|
82
129
|
|
83
130
|
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
84
131
|
if kb_shards is None:
|
85
132
|
raise UnexpectedRolloverError(f"No shards found for KB {kbid}")
|
86
133
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
134
|
+
if state.rollover_shards_created:
|
135
|
+
logger.info("Rollover shards already created, skipping", extra={"kbid": kbid})
|
136
|
+
return kb_shards
|
137
|
+
|
138
|
+
# create new shards
|
139
|
+
created_shards = []
|
140
|
+
try:
|
141
|
+
nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
|
142
|
+
for shard in kb_shards.shards:
|
143
|
+
shard.ClearField("replicas")
|
144
|
+
# Attempt to create configured number of replicas
|
145
|
+
replicas_created = 0
|
146
|
+
while replicas_created < settings.node_replicas:
|
147
|
+
if len(nodes) == 0:
|
148
|
+
# could have multiple shards on single node
|
149
|
+
nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
|
150
|
+
node_id = nodes.pop(0)
|
151
|
+
|
152
|
+
node = get_index_node(node_id)
|
153
|
+
if node is None:
|
154
|
+
logger.error(f"Node {node_id} is not found or not available")
|
155
|
+
continue
|
156
|
+
|
157
|
+
vectorsets = {
|
158
|
+
vectorset_id: vectorset_config.vectorset_index_config
|
159
|
+
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
|
160
|
+
txn, kbid=kbid
|
161
|
+
)
|
162
|
+
}
|
163
|
+
try:
|
164
|
+
if not vectorsets:
|
165
|
+
is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
|
166
|
+
vector_index_config = nodewriter_pb2.VectorIndexConfig(
|
167
|
+
similarity=kb_shards.similarity,
|
168
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32,
|
169
|
+
vector_dimension=kb_shards.model.vector_dimension,
|
170
|
+
normalize_vectors=is_matryoshka,
|
100
171
|
)
|
101
|
-
node_id = nodes.pop(0)
|
102
|
-
|
103
|
-
node = get_index_node(node_id)
|
104
|
-
if node is None:
|
105
|
-
logger.error(f"Node {node_id} is not found or not available")
|
106
|
-
continue
|
107
|
-
is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
|
108
|
-
try:
|
109
172
|
shard_created = await node.new_shard(
|
110
173
|
kbid,
|
111
|
-
|
112
|
-
release_channel=kb_shards.release_channel,
|
113
|
-
normalize_vectors=is_matryoshka,
|
174
|
+
vector_index_config=vector_index_config,
|
114
175
|
)
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
176
|
+
else:
|
177
|
+
shard_created = await node.new_shard_with_vectorsets(
|
178
|
+
kbid,
|
179
|
+
vectorsets_configs=vectorsets,
|
180
|
+
)
|
181
|
+
except Exception as e:
|
182
|
+
errors.capture_exception(e)
|
183
|
+
logger.exception(f"Error creating new shard at {node}")
|
184
|
+
continue
|
185
|
+
|
186
|
+
replica = writer_pb2.ShardReplica(node=str(node_id))
|
187
|
+
replica.shard.CopyFrom(shard_created)
|
188
|
+
shard.replicas.append(replica)
|
189
|
+
created_shards.append(shard)
|
190
|
+
replicas_created += 1
|
191
|
+
|
192
|
+
if nidx_node:
|
193
|
+
nidx_shard = await nidx_node.new_shard_with_vectorsets(
|
194
|
+
kbid,
|
195
|
+
vectorsets_configs=vectorsets,
|
196
|
+
)
|
197
|
+
shard.nidx_shard_id = nidx_shard.id
|
198
|
+
|
199
|
+
except Exception as e:
|
200
|
+
errors.capture_exception(e)
|
201
|
+
logger.exception("Unexpected error creating new shard")
|
202
|
+
for created_shard in created_shards:
|
203
|
+
await sm.rollback_shard(created_shard)
|
204
|
+
raise e
|
205
|
+
|
206
|
+
async with datamanagers.with_transaction() as txn:
|
207
|
+
await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=kb_shards)
|
208
|
+
state.rollover_shards_created = True
|
209
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
135
210
|
await txn.commit()
|
136
211
|
return kb_shards
|
137
212
|
|
138
213
|
|
139
|
-
def _get_shard(
|
140
|
-
shards: writer_pb2.Shards, shard_id: str
|
141
|
-
) -> Optional[writer_pb2.ShardObject]:
|
214
|
+
def _get_shard(shards: writer_pb2.Shards, shard_id: str) -> Optional[writer_pb2.ShardObject]:
|
142
215
|
for shard in shards.shards:
|
143
216
|
if shard_id == shard.shard:
|
144
217
|
return shard
|
145
218
|
return None
|
146
219
|
|
147
220
|
|
148
|
-
async def schedule_resource_indexing(
|
149
|
-
app_context: ApplicationContext, kbid: str
|
150
|
-
) -> None:
|
221
|
+
async def schedule_resource_indexing(app_context: ApplicationContext, kbid: str) -> None:
|
151
222
|
"""
|
152
223
|
Schedule indexing all data in a kb in rollover shards
|
153
224
|
"""
|
154
|
-
logger.info("
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
txn, kbid=kbid
|
159
|
-
)
|
160
|
-
if rollover_shards is None:
|
225
|
+
logger.info("Scheduling resources to be indexed to rollover shards", extra={"kbid": kbid})
|
226
|
+
async with datamanagers.with_ro_transaction() as txn:
|
227
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
228
|
+
if not state.rollover_shards_created:
|
161
229
|
raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
return
|
230
|
+
if state.resources_scheduled:
|
231
|
+
logger.info(
|
232
|
+
"Resources already scheduled for indexing, skipping",
|
233
|
+
extra={"kbid": kbid},
|
234
|
+
)
|
235
|
+
return
|
169
236
|
|
170
237
|
batch = []
|
171
238
|
async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
|
@@ -173,9 +240,7 @@ async def schedule_resource_indexing(
|
|
173
240
|
|
174
241
|
if len(batch) > 100:
|
175
242
|
async with datamanagers.with_transaction() as txn:
|
176
|
-
await datamanagers.rollover.add_batch_to_index(
|
177
|
-
txn, kbid=kbid, batch=batch
|
178
|
-
)
|
243
|
+
await datamanagers.rollover.add_batch_to_index(txn, kbid=kbid, batch=batch)
|
179
244
|
await txn.commit()
|
180
245
|
batch = []
|
181
246
|
if len(batch) > 0:
|
@@ -184,10 +249,8 @@ async def schedule_resource_indexing(
|
|
184
249
|
await txn.commit()
|
185
250
|
|
186
251
|
async with datamanagers.with_transaction() as txn:
|
187
|
-
|
188
|
-
await datamanagers.rollover.
|
189
|
-
txn, kbid=kbid, kb_shards=rollover_shards
|
190
|
-
)
|
252
|
+
state.resources_scheduled = True
|
253
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
191
254
|
await txn.commit()
|
192
255
|
|
193
256
|
|
@@ -195,24 +258,27 @@ def _to_ts(dt: datetime) -> int:
|
|
195
258
|
return int(dt.timestamp() * 1000 * 1000)
|
196
259
|
|
197
260
|
|
198
|
-
async def
|
261
|
+
async def index_to_rollover_index(
|
262
|
+
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
263
|
+
) -> None:
|
199
264
|
"""
|
200
|
-
Indexes all data in a kb in rollover
|
265
|
+
Indexes all data in a kb in rollover indexes. This happens before the cutover.
|
201
266
|
"""
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
)
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
267
|
+
extra = {"kbid": kbid, "external_index_provider": None}
|
268
|
+
if external is not None:
|
269
|
+
extra["external_index_provider"] = external.type.value
|
270
|
+
async with datamanagers.with_ro_transaction() as txn:
|
271
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
272
|
+
if not all([state.rollover_shards_created, state.resources_scheduled]):
|
273
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
274
|
+
rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
|
275
|
+
if rollover_shards is None:
|
276
|
+
raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
|
277
|
+
if state.resources_indexed:
|
278
|
+
logger.info("Resources already indexed, skipping", extra=extra)
|
212
279
|
return
|
213
280
|
|
214
|
-
logger.info("Indexing rollover
|
215
|
-
|
281
|
+
logger.info("Indexing to rollover index", extra=extra)
|
216
282
|
wait_index_batch: list[writer_pb2.ShardObject] = []
|
217
283
|
# now index on all new shards only
|
218
284
|
while True:
|
@@ -231,9 +297,7 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
|
|
231
297
|
extra={"kbid": kbid, "resource_id": resource_id},
|
232
298
|
)
|
233
299
|
async with datamanagers.with_transaction() as txn:
|
234
|
-
await datamanagers.rollover.remove_to_index(
|
235
|
-
txn, kbid=kbid, resource=resource_id
|
236
|
-
)
|
300
|
+
await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
|
237
301
|
await txn.commit()
|
238
302
|
continue
|
239
303
|
|
@@ -246,28 +310,29 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
|
|
246
310
|
raise UnexpectedRolloverError(
|
247
311
|
f"Shard {shard_id} not found. Was a new one created during migration?"
|
248
312
|
)
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
)
|
253
|
-
if resource_index_message is None:
|
313
|
+
resource = await get_resource(kbid, resource_id)
|
314
|
+
index_message = await get_resource_index_message(kbid, resource_id)
|
315
|
+
if resource is None or index_message is None:
|
254
316
|
# resource no longer existing, remove indexing and carry on
|
255
317
|
async with datamanagers.with_transaction() as txn:
|
256
|
-
await datamanagers.rollover.remove_to_index(
|
257
|
-
txn, kbid=kbid, resource=resource_id
|
258
|
-
)
|
318
|
+
await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
|
259
319
|
await txn.commit()
|
260
320
|
continue
|
261
321
|
|
322
|
+
if external is not None:
|
323
|
+
await external.index_resource(resource_id, index_message, to_rollover_indexes=True)
|
324
|
+
else:
|
325
|
+
await index_resource_to_shard(
|
326
|
+
app_context, kbid, resource_id, shard, resource_index_message=index_message
|
327
|
+
)
|
328
|
+
|
262
329
|
async with datamanagers.with_transaction() as txn:
|
263
330
|
await datamanagers.rollover.add_indexed(
|
264
331
|
txn,
|
265
332
|
kbid=kbid,
|
266
333
|
resource_id=resource_id,
|
267
334
|
shard_id=shard_id,
|
268
|
-
modification_time=_to_ts(
|
269
|
-
resource_index_message.metadata.modified.ToDatetime()
|
270
|
-
),
|
335
|
+
modification_time=_to_ts(resource.basic.modified.ToDatetime()), # type: ignore
|
271
336
|
)
|
272
337
|
await txn.commit()
|
273
338
|
wait_index_batch.append(shard)
|
@@ -281,11 +346,66 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
|
|
281
346
|
await wait_for_node(app_context, node_id)
|
282
347
|
wait_index_batch = []
|
283
348
|
|
284
|
-
_set_rollover_status(rollover_shards, RolloverStatus.RESOURCES_INDEXED)
|
285
349
|
async with datamanagers.with_transaction() as txn:
|
286
|
-
|
287
|
-
|
350
|
+
state.resources_indexed = True
|
351
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
352
|
+
await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=rollover_shards)
|
353
|
+
await txn.commit()
|
354
|
+
|
355
|
+
|
356
|
+
async def cutover_index(
|
357
|
+
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
358
|
+
) -> None:
|
359
|
+
"""
|
360
|
+
Swaps out the current active index for a knowledgebox.
|
361
|
+
"""
|
362
|
+
await cutover_shards(app_context, kbid)
|
363
|
+
if external is not None:
|
364
|
+
if external.supports_rollover:
|
365
|
+
await cutover_external_index(kbid, external)
|
366
|
+
else:
|
367
|
+
logger.info(
|
368
|
+
"External index provider does not support rollover",
|
369
|
+
extra={"kbid": kbid, "external_index_provider": external.type.value},
|
370
|
+
)
|
371
|
+
|
372
|
+
|
373
|
+
async def cutover_external_index(kbid: str, external: ExternalIndexManager) -> None:
|
374
|
+
"""
|
375
|
+
Cuts over to the newly created external index for a knowledgebox.
|
376
|
+
The old indexes are deleted.
|
377
|
+
"""
|
378
|
+
extra = {"kbid": kbid, "external_index_provider": external.type.value}
|
379
|
+
logger.info("Cutting over external index", extra=extra)
|
380
|
+
async with datamanagers.with_rw_transaction() as txn:
|
381
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
382
|
+
if not all(
|
383
|
+
[
|
384
|
+
state.rollover_shards_created,
|
385
|
+
state.resources_scheduled,
|
386
|
+
state.resources_indexed,
|
387
|
+
]
|
388
|
+
):
|
389
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
390
|
+
if state.cutover_external_index:
|
391
|
+
logger.info("External index already cut over, skipping", extra=extra)
|
392
|
+
return
|
393
|
+
|
394
|
+
stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
|
395
|
+
rollover_metadata = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
|
396
|
+
txn, kbid=kbid
|
288
397
|
)
|
398
|
+
if stored_metadata is None or rollover_metadata is None:
|
399
|
+
raise UnexpectedRolloverError("stored or rollover external index metadata not found")
|
400
|
+
|
401
|
+
await external.rollover_cutover_indexes()
|
402
|
+
|
403
|
+
await datamanagers.kb.set_external_index_provider_metadata(
|
404
|
+
txn, kbid=kbid, metadata=rollover_metadata
|
405
|
+
)
|
406
|
+
await datamanagers.rollover.delete_kb_rollover_external_index_metadata(txn, kbid=kbid)
|
407
|
+
state.cutover_external_index = True
|
408
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
289
409
|
await txn.commit()
|
290
410
|
|
291
411
|
|
@@ -297,29 +417,40 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:
|
|
297
417
|
async with datamanagers.with_transaction() as txn:
|
298
418
|
sm = app_context.shard_manager
|
299
419
|
|
420
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
421
|
+
if not all(
|
422
|
+
[
|
423
|
+
state.rollover_shards_created,
|
424
|
+
state.resources_scheduled,
|
425
|
+
state.resources_indexed,
|
426
|
+
]
|
427
|
+
):
|
428
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
429
|
+
if state.cutover_shards:
|
430
|
+
logger.info("Shards already cut over, skipping", extra={"kbid": kbid})
|
431
|
+
return
|
432
|
+
|
300
433
|
previously_active_shards = await datamanagers.cluster.get_kb_shards(
|
301
|
-
txn, kbid=kbid
|
302
|
-
)
|
303
|
-
rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
|
304
|
-
txn, kbid=kbid
|
434
|
+
txn, kbid=kbid, for_update=True
|
305
435
|
)
|
436
|
+
rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
|
306
437
|
if previously_active_shards is None or rollover_shards is None:
|
307
438
|
raise UnexpectedRolloverError("Shards for kb not found")
|
308
439
|
|
309
|
-
|
310
|
-
await datamanagers.cluster.update_kb_shards(
|
311
|
-
txn, kbid=kbid, shards=rollover_shards
|
312
|
-
)
|
440
|
+
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rollover_shards)
|
313
441
|
await datamanagers.rollover.delete_kb_rollover_shards(txn, kbid=kbid)
|
314
442
|
|
315
443
|
for shard in previously_active_shards.shards:
|
316
444
|
await sm.rollback_shard(shard)
|
317
445
|
|
446
|
+
state.cutover_shards = True
|
447
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
448
|
+
|
318
449
|
await txn.commit()
|
319
450
|
|
320
451
|
|
321
452
|
async def validate_indexed_data(
|
322
|
-
app_context: ApplicationContext, kbid: str
|
453
|
+
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
323
454
|
) -> list[str]:
|
324
455
|
"""
|
325
456
|
Goes through all the resources in a knowledgebox and validates it
|
@@ -329,21 +460,34 @@ async def validate_indexed_data(
|
|
329
460
|
|
330
461
|
If a resource was removed during the rollover, it will be removed as well.
|
331
462
|
"""
|
463
|
+
extra = {"kbid": kbid, "external_index_provider": None}
|
464
|
+
if external is not None:
|
465
|
+
extra["external_index_provider"] = external.type.value
|
466
|
+
async with datamanagers.with_ro_transaction() as txn:
|
467
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
468
|
+
if not all(
|
469
|
+
[
|
470
|
+
state.rollover_shards_created,
|
471
|
+
state.resources_scheduled,
|
472
|
+
state.resources_indexed,
|
473
|
+
state.cutover_shards,
|
474
|
+
]
|
475
|
+
):
|
476
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
332
477
|
|
333
|
-
async with datamanagers.with_transaction() as txn:
|
334
478
|
rolled_over_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
335
479
|
if rolled_over_shards is None:
|
336
480
|
raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
|
337
481
|
|
338
|
-
if
|
339
|
-
logger.info("Resources already validated, skipping", extra=
|
482
|
+
if state.resources_validated:
|
483
|
+
logger.info("Resources already validated, skipping", extra=extra)
|
340
484
|
return []
|
341
485
|
|
342
|
-
logger.info("Validating indexed data", extra=
|
486
|
+
logger.info("Validating indexed data", extra=extra)
|
343
487
|
|
344
|
-
repaired_resources = []
|
488
|
+
repaired_resources: list[str] = []
|
345
489
|
async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
|
346
|
-
async with datamanagers.
|
490
|
+
async with datamanagers.with_ro_transaction() as txn:
|
347
491
|
indexed_data = await datamanagers.rollover.get_indexed_data(
|
348
492
|
txn, kbid=kbid, resource_id=resource_id
|
349
493
|
)
|
@@ -360,7 +504,7 @@ async def validate_indexed_data(
|
|
360
504
|
if shard_id is None:
|
361
505
|
logger.error(
|
362
506
|
"Shard id not found for resource",
|
363
|
-
extra={"
|
507
|
+
extra={"resource_id": resource_id, **extra},
|
364
508
|
)
|
365
509
|
raise UnexpectedRolloverError("Shard id not found for resource")
|
366
510
|
last_indexed = 0
|
@@ -370,23 +514,18 @@ async def validate_indexed_data(
|
|
370
514
|
logger.error(
|
371
515
|
"Shard not found for resource",
|
372
516
|
extra={
|
373
|
-
"kbid": kbid,
|
374
517
|
"resource_id": resource_id,
|
375
518
|
"shard_id": shard_id,
|
519
|
+
**extra,
|
376
520
|
},
|
377
521
|
)
|
378
|
-
raise UnexpectedRolloverError(
|
379
|
-
f"Shard {shard_id} not found. This should not happen"
|
380
|
-
)
|
522
|
+
raise UnexpectedRolloverError(f"Shard {shard_id} not found. This should not happen")
|
381
523
|
|
382
|
-
|
383
|
-
res = await datamanagers.resources.get_resource(
|
384
|
-
txn, kbid=kbid, rid=resource_id
|
385
|
-
)
|
524
|
+
res = await get_resource(kbid, resource_id)
|
386
525
|
if res is None:
|
387
526
|
logger.error(
|
388
527
|
"Resource not found while validating, skipping",
|
389
|
-
extra={"
|
528
|
+
extra={"resource_id": resource_id, **extra},
|
390
529
|
)
|
391
530
|
continue
|
392
531
|
|
@@ -403,12 +542,26 @@ async def validate_indexed_data(
|
|
403
542
|
await txn.commit()
|
404
543
|
continue
|
405
544
|
|
545
|
+
index_message = await get_resource_index_message(kbid, resource_id)
|
546
|
+
if index_message is None:
|
547
|
+
logger.error(
|
548
|
+
"Resource index message not found while validating, skipping",
|
549
|
+
extra={"resource_id": resource_id, **extra},
|
550
|
+
)
|
551
|
+
continue
|
552
|
+
|
406
553
|
# resource was modified or added during rollover, reindex
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
554
|
+
if external is not None:
|
555
|
+
await external.index_resource(
|
556
|
+
resource_id,
|
557
|
+
index_message,
|
558
|
+
to_rollover_indexes=True,
|
559
|
+
)
|
560
|
+
else:
|
561
|
+
await index_resource_to_shard(
|
562
|
+
app_context, kbid, resource_id, shard, resource_index_message=index_message
|
563
|
+
)
|
564
|
+
repaired_resources.append(resource_id)
|
412
565
|
async with datamanagers.with_transaction() as txn:
|
413
566
|
await datamanagers.rollover.add_indexed(
|
414
567
|
txn,
|
@@ -432,11 +585,10 @@ async def validate_indexed_data(
|
|
432
585
|
raise UnexpectedRolloverError("Shard not found. This should not happen")
|
433
586
|
await delete_resource_from_shard(app_context, kbid, resource_id, shard)
|
434
587
|
|
435
|
-
_set_rollover_status(rolled_over_shards, RolloverStatus.RESOURCES_VALIDATED)
|
436
588
|
async with datamanagers.with_transaction() as txn:
|
437
|
-
|
438
|
-
|
439
|
-
)
|
589
|
+
state.resources_validated = True
|
590
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
591
|
+
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rolled_over_shards)
|
440
592
|
|
441
593
|
return repaired_resources
|
442
594
|
|
@@ -458,69 +610,76 @@ async def clean_indexed_data(app_context: ApplicationContext, kbid: str) -> None
|
|
458
610
|
|
459
611
|
async def clean_rollover_status(app_context: ApplicationContext, kbid: str) -> None:
|
460
612
|
async with datamanagers.with_transaction() as txn:
|
461
|
-
|
462
|
-
|
613
|
+
try:
|
614
|
+
await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
615
|
+
except RolloverStateNotFoundError:
|
463
616
|
logger.warning(
|
464
|
-
"No
|
465
|
-
extra={"kbid": kbid},
|
617
|
+
"No rollover state found, skipping clean rollover status", extra={"kbid": kbid}
|
466
618
|
)
|
467
619
|
return
|
620
|
+
await datamanagers.rollover.clear_rollover_state(txn, kbid=kbid)
|
621
|
+
await txn.commit()
|
468
622
|
|
469
|
-
|
470
|
-
|
623
|
+
|
624
|
+
async def wait_for_cluster_ready() -> None:
|
625
|
+
node_ready_checks = 0
|
626
|
+
while len(cluster_manager.INDEX_NODES) == 0:
|
627
|
+
if node_ready_checks > 10:
|
628
|
+
raise Exception("No index nodes available")
|
629
|
+
logger.info("Waiting for index nodes to be available")
|
630
|
+
await asyncio.sleep(1)
|
631
|
+
node_ready_checks += 1
|
471
632
|
|
472
633
|
|
473
|
-
async def
|
634
|
+
async def rollover_kb_index(
|
474
635
|
app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
|
475
636
|
) -> None:
|
476
637
|
"""
|
477
|
-
Rollover a
|
478
|
-
shard and indexing all existing resources into the replicas.
|
638
|
+
Rolling over a KB index is the process of creating new shard replicas for every
|
639
|
+
shard and indexing all existing resources into the replicas. Also includes creating new external indexes if
|
640
|
+
the KB is configured to use them.
|
479
641
|
|
480
|
-
Once all the data is in the new
|
481
|
-
to the new shards and delete the old shards.
|
642
|
+
Once all the data is in the new indexes, cut over to the replicated index and delete the old one.
|
482
643
|
|
483
|
-
If drain_nodes is provided, no replicas will be created on those nodes. This is useful
|
484
|
-
for when we want to remove a set of nodes from the cluster.
|
644
|
+
If drain_nodes is provided, no index node replicas will be created on those nodes. This is useful
|
645
|
+
for when we want to remove a set of nodes from the index node cluster.
|
485
646
|
|
486
647
|
This is a very expensive operation and should be done with care.
|
487
648
|
|
488
649
|
Process:
|
489
|
-
- Create new shards
|
650
|
+
- Create new index for kb index (index node shards or external indexes if configured)
|
490
651
|
- Schedule all resources to be indexed
|
491
|
-
- Index all resources into new shards
|
492
|
-
- Cut over replicas to new shards
|
493
|
-
- Validate that all resources are in the new
|
652
|
+
- Index all resources into new kb index (index node shards or external indexes if configured)
|
653
|
+
- Cut over replicas to new shards (and external indexes if configured)
|
654
|
+
- Validate that all resources are in the new kb index
|
494
655
|
- Clean up indexed data
|
495
656
|
"""
|
496
|
-
|
497
|
-
while len(cluster_manager.INDEX_NODES) == 0:
|
498
|
-
if node_ready_checks > 10:
|
499
|
-
raise Exception("No index nodes available")
|
500
|
-
logger.info("Waiting for index nodes to be available")
|
501
|
-
await asyncio.sleep(1)
|
502
|
-
node_ready_checks += 1
|
657
|
+
await wait_for_cluster_ready()
|
503
658
|
|
504
|
-
|
659
|
+
extra = {"kbid": kbid, "external_index_provider": None}
|
660
|
+
external = await get_external_index_manager(kbid, for_rollover=True)
|
661
|
+
if external is not None:
|
662
|
+
extra["external_index_provider"] = external.type.value
|
663
|
+
logger.info("Rolling over KB index", extra=extra)
|
505
664
|
|
506
665
|
async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
|
507
|
-
await
|
666
|
+
await create_rollover_index(app_context, kbid, drain_nodes=drain_nodes, external=external)
|
508
667
|
await schedule_resource_indexing(app_context, kbid)
|
509
|
-
await
|
510
|
-
await
|
668
|
+
await index_to_rollover_index(app_context, kbid, external=external)
|
669
|
+
await cutover_index(app_context, kbid, external=external)
|
511
670
|
# we need to cut over BEFORE we validate the data
|
512
|
-
await validate_indexed_data(app_context, kbid)
|
671
|
+
await validate_indexed_data(app_context, kbid, external=external)
|
513
672
|
await clean_indexed_data(app_context, kbid)
|
514
673
|
await clean_rollover_status(app_context, kbid)
|
515
674
|
|
516
|
-
logger.info("Finished rolling over
|
675
|
+
logger.info("Finished rolling over KB indes", extra=extra)
|
517
676
|
|
518
677
|
|
519
678
|
async def _rollover_kbid_command(kbid: str) -> None: # pragma: no cover
|
520
679
|
app_context = ApplicationContext()
|
521
680
|
await app_context.initialize()
|
522
681
|
try:
|
523
|
-
await
|
682
|
+
await rollover_kb_index(app_context, kbid)
|
524
683
|
finally:
|
525
684
|
await app_context.finalize()
|
526
685
|
|