nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@
|
|
19
19
|
#
|
20
20
|
import argparse
|
21
21
|
import asyncio
|
22
|
-
import enum
|
23
22
|
import logging
|
24
23
|
from datetime import datetime
|
25
24
|
from typing import Optional
|
@@ -27,140 +26,213 @@ from typing import Optional
|
|
27
26
|
from nucliadb.common import datamanagers, locking
|
28
27
|
from nucliadb.common.cluster import manager as cluster_manager
|
29
28
|
from nucliadb.common.context import ApplicationContext
|
30
|
-
from
|
29
|
+
from nucliadb.common.datamanagers.rollover import RolloverState, RolloverStateNotFoundError
|
30
|
+
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
31
|
+
from nucliadb.common.external_index_providers.manager import (
|
32
|
+
get_external_index_manager,
|
33
|
+
)
|
34
|
+
from nucliadb.common.nidx import get_nidx_fake_node
|
35
|
+
from nucliadb_protos import nodewriter_pb2, writer_pb2
|
31
36
|
from nucliadb_telemetry import errors
|
32
37
|
|
33
38
|
from .manager import get_index_node
|
34
39
|
from .settings import settings
|
35
|
-
from .utils import
|
40
|
+
from .utils import (
|
41
|
+
delete_resource_from_shard,
|
42
|
+
get_resource,
|
43
|
+
get_resource_index_message,
|
44
|
+
index_resource_to_shard,
|
45
|
+
wait_for_node,
|
46
|
+
)
|
36
47
|
|
37
48
|
logger = logging.getLogger(__name__)
|
38
49
|
|
39
50
|
|
40
|
-
class
|
41
|
-
|
42
|
-
RESOURCES_INDEXED = "resources_indexed"
|
43
|
-
RESOURCES_VALIDATED = "resources_validated"
|
44
|
-
|
51
|
+
class UnexpectedRolloverError(Exception):
|
52
|
+
pass
|
45
53
|
|
46
|
-
def _get_rollover_status(
|
47
|
-
rollover_shards: writer_pb2.Shards, status: RolloverStatus
|
48
|
-
) -> bool:
|
49
|
-
return rollover_shards.extra.get(status.value) == "true"
|
50
54
|
|
55
|
+
async def create_rollover_index(
|
56
|
+
app_context: ApplicationContext,
|
57
|
+
kbid: str,
|
58
|
+
drain_nodes: Optional[list[str]] = None,
|
59
|
+
external: Optional[ExternalIndexManager] = None,
|
60
|
+
) -> None:
|
61
|
+
"""
|
62
|
+
Creates a new index for a knowledgebox in the index node cluster (and to the external index provider if configured).
|
63
|
+
For the external index case, we still need the shard on the index node cluster to be created because
|
64
|
+
it is used to store the rollover state during the rollover. However, the actual indexing will be done
|
65
|
+
by the external index provider.
|
66
|
+
"""
|
67
|
+
await create_rollover_shards(app_context, kbid, drain_nodes=drain_nodes)
|
68
|
+
if external is not None:
|
69
|
+
if external.supports_rollover:
|
70
|
+
await create_rollover_external_index(kbid, external)
|
71
|
+
else:
|
72
|
+
logger.info(
|
73
|
+
"External index provider does not support rollover",
|
74
|
+
extra={"kbid": kbid, "external_index_provider": external.type.value},
|
75
|
+
)
|
51
76
|
|
52
|
-
def _set_rollover_status(rollover_shards: writer_pb2.Shards, status: RolloverStatus):
|
53
|
-
rollover_shards.extra[status.value] = "true"
|
54
77
|
|
78
|
+
async def create_rollover_external_index(kbid: str, external: ExternalIndexManager) -> None:
|
79
|
+
extra = {"kbid": kbid, "external_index_provider": external.type.value}
|
80
|
+
async with datamanagers.with_ro_transaction() as txn:
|
81
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
82
|
+
if state.external_index_created:
|
83
|
+
logger.info("Rollover external index already created, skipping", extra=extra)
|
84
|
+
return
|
55
85
|
|
56
|
-
|
57
|
-
|
58
|
-
|
86
|
+
logger.info("Creating rollover external index", extra=extra)
|
87
|
+
async with datamanagers.with_ro_transaction() as txn:
|
88
|
+
stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
|
89
|
+
if stored_metadata is None:
|
90
|
+
raise UnexpectedRolloverError("External index metadata not found")
|
59
91
|
|
92
|
+
rollover_metadata = await external.rollover_create_indexes(stored_metadata)
|
60
93
|
|
61
|
-
|
62
|
-
|
94
|
+
async with datamanagers.with_rw_transaction() as txn:
|
95
|
+
await datamanagers.rollover.update_kb_rollover_external_index_metadata(
|
96
|
+
txn, kbid=kbid, metadata=rollover_metadata
|
97
|
+
)
|
98
|
+
state.external_index_created = True
|
99
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
100
|
+
await txn.commit()
|
63
101
|
|
64
102
|
|
65
103
|
async def create_rollover_shards(
|
66
|
-
app_context: ApplicationContext, kbid: str
|
104
|
+
app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
|
67
105
|
) -> writer_pb2.Shards:
|
68
106
|
"""
|
69
|
-
Creates
|
107
|
+
Creates new index node shards for a rollover operation.
|
108
|
+
If drain_nodes is provided, no replicas will be created on those nodes.
|
70
109
|
"""
|
71
|
-
|
110
|
+
|
111
|
+
logger.info("Creating rollover shards", extra={"kbid": kbid})
|
72
112
|
sm = app_context.shard_manager
|
113
|
+
nidx_node = get_nidx_fake_node()
|
73
114
|
|
74
|
-
async with datamanagers.
|
75
|
-
|
76
|
-
txn, kbid=kbid
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
115
|
+
async with datamanagers.with_ro_transaction() as txn:
|
116
|
+
try:
|
117
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
118
|
+
except RolloverStateNotFoundError:
|
119
|
+
# State is not set yet, create it
|
120
|
+
state = RolloverState(
|
121
|
+
rollover_shards_created=False,
|
122
|
+
external_index_created=False,
|
123
|
+
resources_scheduled=False,
|
124
|
+
resources_indexed=False,
|
125
|
+
cutover_shards=False,
|
126
|
+
cutover_external_index=False,
|
127
|
+
resources_validated=False,
|
128
|
+
)
|
81
129
|
|
82
130
|
kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
83
131
|
if kb_shards is None:
|
84
132
|
raise UnexpectedRolloverError(f"No shards found for KB {kbid}")
|
85
133
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
134
|
+
if state.rollover_shards_created:
|
135
|
+
logger.info("Rollover shards already created, skipping", extra={"kbid": kbid})
|
136
|
+
return kb_shards
|
137
|
+
|
138
|
+
# create new shards
|
139
|
+
created_shards = []
|
140
|
+
try:
|
141
|
+
nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
|
142
|
+
for shard in kb_shards.shards:
|
143
|
+
shard.ClearField("replicas")
|
144
|
+
# Attempt to create configured number of replicas
|
145
|
+
replicas_created = 0
|
146
|
+
while replicas_created < settings.node_replicas:
|
147
|
+
if len(nodes) == 0:
|
148
|
+
# could have multiple shards on single node
|
149
|
+
nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
|
150
|
+
node_id = nodes.pop(0)
|
151
|
+
|
152
|
+
node = get_index_node(node_id)
|
153
|
+
if node is None:
|
154
|
+
logger.error(f"Node {node_id} is not found or not available")
|
155
|
+
continue
|
156
|
+
|
157
|
+
vectorsets = {
|
158
|
+
vectorset_id: vectorset_config.vectorset_index_config
|
159
|
+
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
|
160
|
+
txn, kbid=kbid
|
161
|
+
)
|
162
|
+
}
|
163
|
+
try:
|
164
|
+
if not vectorsets:
|
165
|
+
is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
|
166
|
+
vector_index_config = nodewriter_pb2.VectorIndexConfig(
|
167
|
+
similarity=kb_shards.similarity,
|
168
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32,
|
169
|
+
vector_dimension=kb_shards.model.vector_dimension,
|
170
|
+
normalize_vectors=is_matryoshka,
|
171
|
+
)
|
105
172
|
shard_created = await node.new_shard(
|
106
173
|
kbid,
|
107
|
-
|
108
|
-
release_channel=kb_shards.release_channel,
|
174
|
+
vector_index_config=vector_index_config,
|
109
175
|
)
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
176
|
+
else:
|
177
|
+
shard_created = await node.new_shard_with_vectorsets(
|
178
|
+
kbid,
|
179
|
+
vectorsets_configs=vectorsets,
|
180
|
+
)
|
181
|
+
except Exception as e:
|
182
|
+
errors.capture_exception(e)
|
183
|
+
logger.exception(f"Error creating new shard at {node}")
|
184
|
+
continue
|
185
|
+
|
186
|
+
replica = writer_pb2.ShardReplica(node=str(node_id))
|
187
|
+
replica.shard.CopyFrom(shard_created)
|
188
|
+
shard.replicas.append(replica)
|
189
|
+
created_shards.append(shard)
|
190
|
+
replicas_created += 1
|
191
|
+
|
192
|
+
if nidx_node:
|
193
|
+
nidx_shard = await nidx_node.new_shard_with_vectorsets(
|
194
|
+
kbid,
|
195
|
+
vectorsets_configs=vectorsets,
|
196
|
+
)
|
197
|
+
shard.nidx_shard_id = nidx_shard.id
|
198
|
+
|
199
|
+
except Exception as e:
|
200
|
+
errors.capture_exception(e)
|
201
|
+
logger.exception("Unexpected error creating new shard")
|
202
|
+
for created_shard in created_shards:
|
203
|
+
await sm.rollback_shard(created_shard)
|
204
|
+
raise e
|
205
|
+
|
206
|
+
async with datamanagers.with_transaction() as txn:
|
207
|
+
await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=kb_shards)
|
208
|
+
state.rollover_shards_created = True
|
209
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
130
210
|
await txn.commit()
|
131
211
|
return kb_shards
|
132
212
|
|
133
213
|
|
134
|
-
def _get_shard(
|
135
|
-
shards: writer_pb2.Shards, shard_id: str
|
136
|
-
) -> Optional[writer_pb2.ShardObject]:
|
214
|
+
def _get_shard(shards: writer_pb2.Shards, shard_id: str) -> Optional[writer_pb2.ShardObject]:
|
137
215
|
for shard in shards.shards:
|
138
216
|
if shard_id == shard.shard:
|
139
217
|
return shard
|
140
218
|
return None
|
141
219
|
|
142
220
|
|
143
|
-
async def schedule_resource_indexing(
|
144
|
-
app_context: ApplicationContext, kbid: str
|
145
|
-
) -> None:
|
221
|
+
async def schedule_resource_indexing(app_context: ApplicationContext, kbid: str) -> None:
|
146
222
|
"""
|
147
223
|
Schedule indexing all data in a kb in rollover shards
|
148
224
|
"""
|
149
|
-
logger.
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
txn, kbid=kbid
|
154
|
-
)
|
155
|
-
if rollover_shards is None:
|
225
|
+
logger.info("Scheduling resources to be indexed to rollover shards", extra={"kbid": kbid})
|
226
|
+
async with datamanagers.with_ro_transaction() as txn:
|
227
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
228
|
+
if not state.rollover_shards_created:
|
156
229
|
raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
return
|
230
|
+
if state.resources_scheduled:
|
231
|
+
logger.info(
|
232
|
+
"Resources already scheduled for indexing, skipping",
|
233
|
+
extra={"kbid": kbid},
|
234
|
+
)
|
235
|
+
return
|
164
236
|
|
165
237
|
batch = []
|
166
238
|
async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
|
@@ -168,9 +240,7 @@ async def schedule_resource_indexing(
|
|
168
240
|
|
169
241
|
if len(batch) > 100:
|
170
242
|
async with datamanagers.with_transaction() as txn:
|
171
|
-
await datamanagers.rollover.add_batch_to_index(
|
172
|
-
txn, kbid=kbid, batch=batch
|
173
|
-
)
|
243
|
+
await datamanagers.rollover.add_batch_to_index(txn, kbid=kbid, batch=batch)
|
174
244
|
await txn.commit()
|
175
245
|
batch = []
|
176
246
|
if len(batch) > 0:
|
@@ -179,10 +249,8 @@ async def schedule_resource_indexing(
|
|
179
249
|
await txn.commit()
|
180
250
|
|
181
251
|
async with datamanagers.with_transaction() as txn:
|
182
|
-
|
183
|
-
await datamanagers.rollover.
|
184
|
-
txn, kbid=kbid, kb_shards=rollover_shards
|
185
|
-
)
|
252
|
+
state.resources_scheduled = True
|
253
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
186
254
|
await txn.commit()
|
187
255
|
|
188
256
|
|
@@ -190,24 +258,27 @@ def _to_ts(dt: datetime) -> int:
|
|
190
258
|
return int(dt.timestamp() * 1000 * 1000)
|
191
259
|
|
192
260
|
|
193
|
-
async def
|
261
|
+
async def index_to_rollover_index(
|
262
|
+
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
263
|
+
) -> None:
|
194
264
|
"""
|
195
|
-
Indexes all data in a kb in rollover
|
265
|
+
Indexes all data in a kb in rollover indexes. This happens before the cutover.
|
196
266
|
"""
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
)
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
267
|
+
extra = {"kbid": kbid, "external_index_provider": None}
|
268
|
+
if external is not None:
|
269
|
+
extra["external_index_provider"] = external.type.value
|
270
|
+
async with datamanagers.with_ro_transaction() as txn:
|
271
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
272
|
+
if not all([state.rollover_shards_created, state.resources_scheduled]):
|
273
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
274
|
+
rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
|
275
|
+
if rollover_shards is None:
|
276
|
+
raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
|
277
|
+
if state.resources_indexed:
|
278
|
+
logger.info("Resources already indexed, skipping", extra=extra)
|
207
279
|
return
|
208
280
|
|
209
|
-
logger.
|
210
|
-
|
281
|
+
logger.info("Indexing to rollover index", extra=extra)
|
211
282
|
wait_index_batch: list[writer_pb2.ShardObject] = []
|
212
283
|
# now index on all new shards only
|
213
284
|
while True:
|
@@ -221,11 +292,14 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
|
|
221
292
|
txn, kbid=kbid, rid=resource_id
|
222
293
|
)
|
223
294
|
if shard_id is None:
|
224
|
-
logger.
|
225
|
-
"Shard id not found for resource",
|
295
|
+
logger.warning(
|
296
|
+
"Shard id not found for resource. Skipping indexing as it may have been deleted",
|
226
297
|
extra={"kbid": kbid, "resource_id": resource_id},
|
227
298
|
)
|
228
|
-
|
299
|
+
async with datamanagers.with_transaction() as txn:
|
300
|
+
await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
|
301
|
+
await txn.commit()
|
302
|
+
continue
|
229
303
|
|
230
304
|
shard = _get_shard(rollover_shards, shard_id)
|
231
305
|
if shard is None: # pragma: no cover
|
@@ -236,28 +310,29 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
|
|
236
310
|
raise UnexpectedRolloverError(
|
237
311
|
f"Shard {shard_id} not found. Was a new one created during migration?"
|
238
312
|
)
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
)
|
243
|
-
if resource_index_message is None:
|
313
|
+
resource = await get_resource(kbid, resource_id)
|
314
|
+
index_message = await get_resource_index_message(kbid, resource_id)
|
315
|
+
if resource is None or index_message is None:
|
244
316
|
# resource no longer existing, remove indexing and carry on
|
245
317
|
async with datamanagers.with_transaction() as txn:
|
246
|
-
await datamanagers.rollover.remove_to_index(
|
247
|
-
txn, kbid=kbid, resource=resource_id
|
248
|
-
)
|
318
|
+
await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
|
249
319
|
await txn.commit()
|
250
320
|
continue
|
251
321
|
|
322
|
+
if external is not None:
|
323
|
+
await external.index_resource(resource_id, index_message, to_rollover_indexes=True)
|
324
|
+
else:
|
325
|
+
await index_resource_to_shard(
|
326
|
+
app_context, kbid, resource_id, shard, resource_index_message=index_message
|
327
|
+
)
|
328
|
+
|
252
329
|
async with datamanagers.with_transaction() as txn:
|
253
330
|
await datamanagers.rollover.add_indexed(
|
254
331
|
txn,
|
255
332
|
kbid=kbid,
|
256
333
|
resource_id=resource_id,
|
257
334
|
shard_id=shard_id,
|
258
|
-
modification_time=_to_ts(
|
259
|
-
resource_index_message.metadata.modified.ToDatetime()
|
260
|
-
),
|
335
|
+
modification_time=_to_ts(resource.basic.modified.ToDatetime()), # type: ignore
|
261
336
|
)
|
262
337
|
await txn.commit()
|
263
338
|
wait_index_batch.append(shard)
|
@@ -271,11 +346,66 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
|
|
271
346
|
await wait_for_node(app_context, node_id)
|
272
347
|
wait_index_batch = []
|
273
348
|
|
274
|
-
_set_rollover_status(rollover_shards, RolloverStatus.RESOURCES_INDEXED)
|
275
349
|
async with datamanagers.with_transaction() as txn:
|
276
|
-
|
277
|
-
|
350
|
+
state.resources_indexed = True
|
351
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
352
|
+
await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=rollover_shards)
|
353
|
+
await txn.commit()
|
354
|
+
|
355
|
+
|
356
|
+
async def cutover_index(
|
357
|
+
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
358
|
+
) -> None:
|
359
|
+
"""
|
360
|
+
Swaps our the current active index for a knowledgebox.
|
361
|
+
"""
|
362
|
+
await cutover_shards(app_context, kbid)
|
363
|
+
if external is not None:
|
364
|
+
if external.supports_rollover:
|
365
|
+
await cutover_external_index(kbid, external)
|
366
|
+
else:
|
367
|
+
logger.info(
|
368
|
+
"External index provider does not support rollover",
|
369
|
+
extra={"kbid": kbid, "external_index_provider": external.type.value},
|
370
|
+
)
|
371
|
+
|
372
|
+
|
373
|
+
async def cutover_external_index(kbid: str, external: ExternalIndexManager) -> None:
|
374
|
+
"""
|
375
|
+
Cuts over to the newly creted external index for a knowledgebox.
|
376
|
+
The old indexes are deleted.
|
377
|
+
"""
|
378
|
+
extra = {"kbid": kbid, "external_index_provider": external.type.value}
|
379
|
+
logger.info("Cutting over external index", extra=extra)
|
380
|
+
async with datamanagers.with_rw_transaction() as txn:
|
381
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
382
|
+
if not all(
|
383
|
+
[
|
384
|
+
state.rollover_shards_created,
|
385
|
+
state.resources_scheduled,
|
386
|
+
state.resources_indexed,
|
387
|
+
]
|
388
|
+
):
|
389
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
390
|
+
if state.cutover_external_index:
|
391
|
+
logger.info("External index already cut over, skipping", extra=extra)
|
392
|
+
return
|
393
|
+
|
394
|
+
stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
|
395
|
+
rollover_metadata = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
|
396
|
+
txn, kbid=kbid
|
397
|
+
)
|
398
|
+
if stored_metadata is None or rollover_metadata is None:
|
399
|
+
raise UnexpectedRolloverError("stored or rollover external index metadata not found")
|
400
|
+
|
401
|
+
await external.rollover_cutover_indexes()
|
402
|
+
|
403
|
+
await datamanagers.kb.set_external_index_provider_metadata(
|
404
|
+
txn, kbid=kbid, metadata=rollover_metadata
|
278
405
|
)
|
406
|
+
await datamanagers.rollover.delete_kb_rollover_external_index_metadata(txn, kbid=kbid)
|
407
|
+
state.cutover_external_index = True
|
408
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
279
409
|
await txn.commit()
|
280
410
|
|
281
411
|
|
@@ -283,33 +413,44 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:
|
|
283
413
|
"""
|
284
414
|
Swaps our the current active shards for a knowledgebox.
|
285
415
|
"""
|
286
|
-
logger.
|
416
|
+
logger.info("Cutting over shards", extra={"kbid": kbid})
|
287
417
|
async with datamanagers.with_transaction() as txn:
|
288
418
|
sm = app_context.shard_manager
|
289
419
|
|
420
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
421
|
+
if not all(
|
422
|
+
[
|
423
|
+
state.rollover_shards_created,
|
424
|
+
state.resources_scheduled,
|
425
|
+
state.resources_indexed,
|
426
|
+
]
|
427
|
+
):
|
428
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
429
|
+
if state.cutover_shards:
|
430
|
+
logger.info("Shards already cut over, skipping", extra={"kbid": kbid})
|
431
|
+
return
|
432
|
+
|
290
433
|
previously_active_shards = await datamanagers.cluster.get_kb_shards(
|
291
|
-
txn, kbid=kbid
|
292
|
-
)
|
293
|
-
rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
|
294
|
-
txn, kbid=kbid
|
434
|
+
txn, kbid=kbid, for_update=True
|
295
435
|
)
|
436
|
+
rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
|
296
437
|
if previously_active_shards is None or rollover_shards is None:
|
297
438
|
raise UnexpectedRolloverError("Shards for kb not found")
|
298
439
|
|
299
|
-
|
300
|
-
await datamanagers.cluster.update_kb_shards(
|
301
|
-
txn, kbid=kbid, shards=rollover_shards
|
302
|
-
)
|
440
|
+
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rollover_shards)
|
303
441
|
await datamanagers.rollover.delete_kb_rollover_shards(txn, kbid=kbid)
|
304
442
|
|
305
443
|
for shard in previously_active_shards.shards:
|
306
444
|
await sm.rollback_shard(shard)
|
307
445
|
|
446
|
+
state.cutover_shards = True
|
447
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
448
|
+
|
308
449
|
await txn.commit()
|
309
450
|
|
310
451
|
|
311
452
|
async def validate_indexed_data(
|
312
|
-
app_context: ApplicationContext, kbid: str
|
453
|
+
app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
|
313
454
|
) -> list[str]:
|
314
455
|
"""
|
315
456
|
Goes through all the resources in a knowledgebox and validates it
|
@@ -319,21 +460,34 @@ async def validate_indexed_data(
|
|
319
460
|
|
320
461
|
If a resource was removed during the rollover, it will be removed as well.
|
321
462
|
"""
|
463
|
+
extra = {"kbid": kbid, "external_index_provider": None}
|
464
|
+
if external is not None:
|
465
|
+
extra["external_index_provider"] = external.type.value
|
466
|
+
async with datamanagers.with_ro_transaction() as txn:
|
467
|
+
state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
468
|
+
if not all(
|
469
|
+
[
|
470
|
+
state.rollover_shards_created,
|
471
|
+
state.resources_scheduled,
|
472
|
+
state.resources_indexed,
|
473
|
+
state.cutover_shards,
|
474
|
+
]
|
475
|
+
):
|
476
|
+
raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
|
322
477
|
|
323
|
-
async with datamanagers.with_transaction() as txn:
|
324
478
|
rolled_over_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
|
325
479
|
if rolled_over_shards is None:
|
326
480
|
raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
|
327
481
|
|
328
|
-
if
|
329
|
-
logger.
|
482
|
+
if state.resources_validated:
|
483
|
+
logger.info("Resources already validated, skipping", extra=extra)
|
330
484
|
return []
|
331
485
|
|
332
|
-
logger.
|
486
|
+
logger.info("Validating indexed data", extra=extra)
|
333
487
|
|
334
|
-
repaired_resources = []
|
488
|
+
repaired_resources: list[str] = []
|
335
489
|
async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
|
336
|
-
async with datamanagers.
|
490
|
+
async with datamanagers.with_ro_transaction() as txn:
|
337
491
|
indexed_data = await datamanagers.rollover.get_indexed_data(
|
338
492
|
txn, kbid=kbid, resource_id=resource_id
|
339
493
|
)
|
@@ -350,7 +504,7 @@ async def validate_indexed_data(
|
|
350
504
|
if shard_id is None:
|
351
505
|
logger.error(
|
352
506
|
"Shard id not found for resource",
|
353
|
-
extra={"
|
507
|
+
extra={"resource_id": resource_id, **extra},
|
354
508
|
)
|
355
509
|
raise UnexpectedRolloverError("Shard id not found for resource")
|
356
510
|
last_indexed = 0
|
@@ -360,23 +514,18 @@ async def validate_indexed_data(
|
|
360
514
|
logger.error(
|
361
515
|
"Shard not found for resource",
|
362
516
|
extra={
|
363
|
-
"kbid": kbid,
|
364
517
|
"resource_id": resource_id,
|
365
518
|
"shard_id": shard_id,
|
519
|
+
**extra,
|
366
520
|
},
|
367
521
|
)
|
368
|
-
raise UnexpectedRolloverError(
|
369
|
-
f"Shard {shard_id} not found. This should not happen"
|
370
|
-
)
|
522
|
+
raise UnexpectedRolloverError(f"Shard {shard_id} not found. This should not happen")
|
371
523
|
|
372
|
-
|
373
|
-
res = await datamanagers.resources.get_resource(
|
374
|
-
txn, kbid=kbid, rid=resource_id
|
375
|
-
)
|
524
|
+
res = await get_resource(kbid, resource_id)
|
376
525
|
if res is None:
|
377
526
|
logger.error(
|
378
527
|
"Resource not found while validating, skipping",
|
379
|
-
extra={"
|
528
|
+
extra={"resource_id": resource_id, **extra},
|
380
529
|
)
|
381
530
|
continue
|
382
531
|
|
@@ -393,12 +542,26 @@ async def validate_indexed_data(
|
|
393
542
|
await txn.commit()
|
394
543
|
continue
|
395
544
|
|
545
|
+
index_message = await get_resource_index_message(kbid, resource_id)
|
546
|
+
if index_message is None:
|
547
|
+
logger.error(
|
548
|
+
"Resource index message not found while validating, skipping",
|
549
|
+
extra={"resource_id": resource_id, **extra},
|
550
|
+
)
|
551
|
+
continue
|
552
|
+
|
396
553
|
# resource was modified or added during rollover, reindex
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
554
|
+
if external is not None:
|
555
|
+
await external.index_resource(
|
556
|
+
resource_id,
|
557
|
+
index_message,
|
558
|
+
to_rollover_indexes=True,
|
559
|
+
)
|
560
|
+
else:
|
561
|
+
await index_resource_to_shard(
|
562
|
+
app_context, kbid, resource_id, shard, resource_index_message=index_message
|
563
|
+
)
|
564
|
+
repaired_resources.append(resource_id)
|
402
565
|
async with datamanagers.with_transaction() as txn:
|
403
566
|
await datamanagers.rollover.add_indexed(
|
404
567
|
txn,
|
@@ -409,25 +572,23 @@ async def validate_indexed_data(
|
|
409
572
|
)
|
410
573
|
await txn.commit()
|
411
574
|
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
shard = _get_shard(rolled_over_shards, shard_id)
|
421
|
-
if shard is None:
|
422
|
-
raise UnexpectedRolloverError("Shard not found. This should not happen")
|
575
|
+
# any left overs should be deleted
|
576
|
+
async for resource_id, (
|
577
|
+
shard_id,
|
578
|
+
last_indexed,
|
579
|
+
) in datamanagers.rollover.iterate_indexed_data(kbid=kbid):
|
580
|
+
if last_indexed == -1:
|
581
|
+
continue
|
423
582
|
|
424
|
-
|
583
|
+
shard = _get_shard(rolled_over_shards, shard_id)
|
584
|
+
if shard is None:
|
585
|
+
raise UnexpectedRolloverError("Shard not found. This should not happen")
|
586
|
+
await delete_resource_from_shard(app_context, kbid, resource_id, shard)
|
425
587
|
|
426
|
-
_set_rollover_status(rolled_over_shards, RolloverStatus.RESOURCES_VALIDATED)
|
427
588
|
async with datamanagers.with_transaction() as txn:
|
428
|
-
|
429
|
-
|
430
|
-
)
|
589
|
+
state.resources_validated = True
|
590
|
+
await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
|
591
|
+
await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rolled_over_shards)
|
431
592
|
|
432
593
|
return repaired_resources
|
433
594
|
|
@@ -449,64 +610,76 @@ async def clean_indexed_data(app_context: ApplicationContext, kbid: str) -> None
|
|
449
610
|
|
450
611
|
async def clean_rollover_status(app_context: ApplicationContext, kbid: str) -> None:
|
451
612
|
async with datamanagers.with_transaction() as txn:
|
452
|
-
|
453
|
-
|
613
|
+
try:
|
614
|
+
await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
|
615
|
+
except RolloverStateNotFoundError:
|
454
616
|
logger.warning(
|
455
|
-
"No
|
456
|
-
extra={"kbid": kbid},
|
617
|
+
"No rollover state found, skipping clean rollover status", extra={"kbid": kbid}
|
457
618
|
)
|
458
619
|
return
|
620
|
+
await datamanagers.rollover.clear_rollover_state(txn, kbid=kbid)
|
621
|
+
await txn.commit()
|
622
|
+
|
459
623
|
|
460
|
-
|
461
|
-
|
624
|
+
async def wait_for_cluster_ready() -> None:
|
625
|
+
node_ready_checks = 0
|
626
|
+
while len(cluster_manager.INDEX_NODES) == 0:
|
627
|
+
if node_ready_checks > 10:
|
628
|
+
raise Exception("No index nodes available")
|
629
|
+
logger.info("Waiting for index nodes to be available")
|
630
|
+
await asyncio.sleep(1)
|
631
|
+
node_ready_checks += 1
|
462
632
|
|
463
633
|
|
464
|
-
async def
|
634
|
+
async def rollover_kb_index(
|
635
|
+
app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
|
636
|
+
) -> None:
|
465
637
|
"""
|
466
|
-
Rollover a
|
467
|
-
shard and indexing all existing resources into the replicas.
|
638
|
+
Rollover a KB index is the process of creating new shard replicas for every
|
639
|
+
shard and indexing all existing resources into the replicas. Also includes creating new external indexes if
|
640
|
+
the KB is configured to use them.
|
641
|
+
|
642
|
+
Once all the data is in the new indexes, cut over to the replicated index delete the old one.
|
468
643
|
|
469
|
-
|
470
|
-
to
|
644
|
+
If drain_nodes is provided, no index node replicas will be created on those nodes. This is useful
|
645
|
+
for when we want to remove a set of nodes from the index node cluster.
|
471
646
|
|
472
647
|
This is a very expensive operation and should be done with care.
|
473
648
|
|
474
649
|
Process:
|
475
|
-
- Create new shards
|
650
|
+
- Create new index for kb index (index node shards or external indexes if configured)
|
476
651
|
- Schedule all resources to be indexed
|
477
|
-
- Index all resources into new shards
|
478
|
-
- Cut over replicas to new shards
|
479
|
-
- Validate that all resources are in the new
|
652
|
+
- Index all resources into new kb index (index node shards or external indexes if configured)
|
653
|
+
- Cut over replicas to new shards (and external indexes if configured)
|
654
|
+
- Validate that all resources are in the new kb index
|
480
655
|
- Clean up indexed data
|
481
656
|
"""
|
482
|
-
|
483
|
-
while len(cluster_manager.INDEX_NODES) == 0:
|
484
|
-
if node_ready_checks > 10:
|
485
|
-
raise Exception("No index nodes available")
|
486
|
-
logger.warning("Waiting for index nodes to be available")
|
487
|
-
await asyncio.sleep(1)
|
488
|
-
node_ready_checks += 1
|
657
|
+
await wait_for_cluster_ready()
|
489
658
|
|
490
|
-
|
659
|
+
extra = {"kbid": kbid, "external_index_provider": None}
|
660
|
+
external = await get_external_index_manager(kbid, for_rollover=True)
|
661
|
+
if external is not None:
|
662
|
+
extra["external_index_provider"] = external.type.value
|
663
|
+
logger.info("Rolling over KB index", extra=extra)
|
491
664
|
|
492
665
|
async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
|
493
|
-
await
|
666
|
+
await create_rollover_index(app_context, kbid, drain_nodes=drain_nodes, external=external)
|
494
667
|
await schedule_resource_indexing(app_context, kbid)
|
495
|
-
await
|
496
|
-
await
|
668
|
+
await index_to_rollover_index(app_context, kbid, external=external)
|
669
|
+
await cutover_index(app_context, kbid, external=external)
|
497
670
|
# we need to cut over BEFORE we validate the data
|
498
|
-
await validate_indexed_data(app_context, kbid)
|
671
|
+
await validate_indexed_data(app_context, kbid, external=external)
|
499
672
|
await clean_indexed_data(app_context, kbid)
|
500
673
|
await clean_rollover_status(app_context, kbid)
|
501
674
|
|
502
|
-
logger.
|
675
|
+
logger.info("Finished rolling over KB indes", extra=extra)
|
503
676
|
|
504
677
|
|
505
678
|
async def _rollover_kbid_command(kbid: str) -> None: # pragma: no cover
|
506
679
|
app_context = ApplicationContext()
|
507
680
|
await app_context.initialize()
|
508
681
|
try:
|
509
|
-
await
|
682
|
+
await rollover_kb_index(app_context, kbid)
|
510
683
|
finally:
|
511
684
|
await app_context.finalize()
|
512
685
|
|