nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/migrator/migrator.py
CHANGED
```diff
@@ -22,23 +22,20 @@ import logging
 from typing import Optional
 
 from nucliadb.common import locking
-from nucliadb.common.cluster.rollover import
+from nucliadb.common.cluster.rollover import rollover_kb_index
 from nucliadb.common.cluster.settings import in_standalone_mode
+from nucliadb.common.maindb.pg import PGDriver
 from nucliadb.migrator.context import ExecutionContext
-from nucliadb.migrator.utils import get_migrations
+from nucliadb.migrator.utils import get_migrations, get_pg_migrations
 from nucliadb_telemetry import errors, metrics
 
-migration_observer = metrics.Observer(
-    "nucliadb_migrations", labels={"type": "kb", "target_version": ""}
-)
+migration_observer = metrics.Observer("nucliadb_migrations", labels={"type": "kb", "target_version": ""})
 
 
 logger = logging.getLogger(__name__)
 
 
-async def run_kb_migrations(
-    context: ExecutionContext, kbid: str, target_version: int
-) -> None:
+async def run_kb_migrations(context: ExecutionContext, kbid: str, target_version: int) -> None:
     async with locking.distributed_lock(f"migration-{kbid}"):
         kb_info = await context.data_manager.get_kb_info(kbid)
         if kb_info is None:
@@ -46,9 +43,7 @@ async def run_kb_migrations(
             await context.data_manager.delete_kb_migration(kbid=kbid)
             return
 
-        migrations = get_migrations(
-            from_version=kb_info.current_version, to_version=target_version
-        )
+        migrations = get_migrations(from_version=kb_info.current_version, to_version=target_version)
 
         for migration in migrations:
             migration_info = {
@@ -59,14 +54,10 @@ async def run_kb_migrations(
 
             try:
                 logger.info("Migrating KB", extra=migration_info)
-                with migration_observer(
-                    {"type": "kb", "target_version": str(migration.version)}
-                ):
-                    await migration.module.migrate_kb(context, kbid)  # type: ignore
+                with migration_observer({"type": "kb", "target_version": str(migration.version)}):
+                    await migration.module.migrate_kb(context, kbid)
                 logger.info("Finished KB Migration", extra=migration_info)
-                await context.data_manager.update_kb_info(
-                    kbid=kbid, current_version=migration.version
-                )
+                await context.data_manager.update_kb_info(kbid=kbid, current_version=migration.version)
             except Exception as exc:
                 errors.capture_exception(exc)
                 logger.exception("Failed to migrate KB", extra=migration_info)
@@ -74,9 +65,7 @@ async def run_kb_migrations(
 
         refreshed_kb_info = await context.data_manager.get_kb_info(kbid=kbid)
         if refreshed_kb_info is None:
-            logger.warning(
-                "KB not found. This should not happen.", extra={"kbid": kbid}
-            )
+            logger.warning("KB not found. This should not happen.", extra={"kbid": kbid})
             return
         assert refreshed_kb_info.current_version == target_version
 
@@ -88,7 +77,7 @@ async def run_all_kb_migrations(context: ExecutionContext, target_version: int)
     Schedule all KB migrations to run in parallel. Only a certain number of migrations will run at the same time.
     If any of the migrations fail, the whole process will fail.
     """
-    to_migrate = await context.data_manager.get_kb_migrations(
+    to_migrate = await context.data_manager.get_kb_migrations()
 
     if len(to_migrate) == 0:
         return
@@ -154,13 +143,9 @@ async def run_global_migrations(context: ExecutionContext, target_version: int)
         }
         try:
             logger.info("Migrating", extra=migration_info)
-            with migration_observer(
-                {"type": "global", "target_version": str(migration.version)}
-            ):
-                await migration.module.migrate(context)  # type: ignore
-            await context.data_manager.update_global_info(
-                current_version=migration.version
-            )
+            with migration_observer({"type": "global", "target_version": str(migration.version)}):
+                await migration.module.migrate(context)
+            await context.data_manager.update_global_info(current_version=migration.version)
             logger.info("Finished migration", extra=migration_info)
         except Exception as exc:
             errors.capture_exception(exc)
@@ -177,7 +162,7 @@ async def run_rollover_in_parallel(
 ) -> None:
     async with max_concurrent:
         try:
-            await
+            await rollover_kb_index(context, kbid)
             await context.data_manager.delete_kb_rollover(kbid=kbid)
         except Exception as exc:
             errors.capture_exception(exc)
@@ -221,7 +206,39 @@ async def run_rollovers(context: ExecutionContext) -> None:
         raise Exception(f"Failed to migrate KBs. Failures: {failures}")
 
 
+async def run_pg_schema_migrations(driver: PGDriver):
+    migrations = get_pg_migrations()
+
+    # The migration uses two transactions. The former is only used to get a lock (pg_advisory_lock)
+    # without having to worry about correctly unlocking it (postgres unlocks it when the transaction ends)
+    async with driver.transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock:  # type: ignore[attr-defined]
+        await cur_lock.execute(
+            "CREATE TABLE IF NOT EXISTS migrations (version INT PRIMARY KEY, migrated_at TIMESTAMP NOT NULL DEFAULT NOW())"
+        )
+        await tx_lock.commit()
+        await cur_lock.execute("SELECT pg_advisory_xact_lock(3116614845278015934)")
+
+        await cur_lock.execute("SELECT version FROM migrations")
+        migrated = [r[0] for r in await cur_lock.fetchall()]
+
+        for version, migration in migrations:
+            if version in migrated:
+                continue
+
+            # Gets a new transaction for each migration, so if they get interrupted we at least
+            # save the state of the last finished transaction
+            async with driver.transaction() as tx, tx.connection.cursor() as cur:  # type: ignore[attr-defined]
+                await migration.migrate(tx)
+                await cur.execute("INSERT INTO migrations (version) VALUES (%s)", (version,))
+                await tx.commit()
+
+
 async def run(context: ExecutionContext, target_version: Optional[int] = None) -> None:
+    # Run schema migrations first, since they create the `resources` table needed for the lock below
+    # Schema migrations use their own locking system
+    if isinstance(context.kv_driver, PGDriver):
+        await run_pg_schema_migrations(context.kv_driver)
+
     async with locking.distributed_lock(locking.MIGRATIONS_LOCK):
         # before we move to managed migrations, see if there are any rollovers
         # scheduled and run them
```
nucliadb/migrator/utils.py
CHANGED
```diff
@@ -17,13 +17,12 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+import importlib
 import logging
 import os
 import types
 from functools import lru_cache
 
-import migrations
-
 from .models import Migration
 
 logger = logging.getLogger(__name__)
@@ -33,14 +32,27 @@ MIGRATION_DIR = os.path.sep.join(
 )
 
 
+def get_pg_migrations() -> list[tuple[int, types.ModuleType]]:
+    output = []
+    for filename in os.listdir(os.path.join(MIGRATION_DIR, "pg")):
+        if filename.endswith(".py") and filename != "__init__.py":
+            module_name = filename[:-3]
+            version = int(module_name.split("_")[0])
+            module = importlib.import_module(f"migrations.pg.{module_name}")
+            if not hasattr(module, "migrate"):
+                raise Exception(f"Missing `migrate` function in {module_name}")
+            output.append((version, module))
+    output.sort()
+    return output
+
+
 def get_migration_modules() -> list[tuple[types.ModuleType, int]]:
     output = []
     for filename in os.listdir(MIGRATION_DIR):
         if filename.endswith(".py") and filename != "__init__.py":
             module_name = filename[:-3]
             version = int(module_name.split("_")[0])
-
-            module = getattr(migrations, module_name)
+            module = importlib.import_module(f"migrations.{module_name}")
             if not hasattr(module, "migrate"):
                 raise Exception(f"Missing `migrate` function in {module_name}")
             if not hasattr(module, "migrate_kb"):
@@ -49,17 +61,13 @@ def get_migration_modules() -> list[tuple[types.ModuleType, int]]:
     return output
 
 
-def get_migrations(
-    from_version: int = 0, to_version: int = 99999999
-) -> list[Migration]:
+def get_migrations(from_version: int = 0, to_version: int = 99999999) -> list[Migration]:
     migrations: list[Migration] = []
    for module, version in get_migration_modules():
        migrations.append(Migration(version=version, module=module))

    migrations.sort(key=lambda m: m.version)
-    return [
-        m for m in migrations if m.version > from_version and m.version <= to_version
-    ]
+    return [m for m in migrations if m.version > from_version and m.version <= to_version]
 
 
 @lru_cache(maxsize=None)
```
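The new `get_pg_migrations` helper above discovers modules under `migrations/pg/`, derives each version from the numeric filename prefix, imports the module with `importlib.import_module`, and returns the list sorted by version. A small self-contained illustration of that naming and ordering convention, using the `migrations/pg/` filenames listed at the top of this diff:

```python
# Filenames as they appear under migrations/pg/ in this package
filenames = [
    "0002_catalog.py",
    "0001_bootstrap.py",
    "0003_catalog_kbid_index.py",
    "__init__.py",
]

# Version = integer prefix of the module name ("0002_catalog" -> 2); __init__.py is skipped.
versions = sorted(
    int(name[:-3].split("_")[0])
    for name in filenames
    if name.endswith(".py") and name != "__init__.py"
)
assert versions == [1, 2, 3]
```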
nucliadb/purge/__init__.py
CHANGED
```diff
@@ -18,10 +18,9 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
+import importlib.metadata
 from typing import AsyncGenerator
 
-import pkg_resources
-
 from nucliadb.common.cluster.exceptions import NodeError, ShardNotFound
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.maindb.driver import Driver
@@ -31,6 +30,9 @@ from nucliadb.ingest.orm.knowledgebox import (
     KB_TO_DELETE,
     KB_TO_DELETE_BASE,
     KB_TO_DELETE_STORAGE_BASE,
+    KB_VECTORSET_TO_DELETE,
+    KB_VECTORSET_TO_DELETE_BASE,
+    RESOURCE_TO_DELETE_STORAGE_BASE,
     KnowledgeBox,
 )
 from nucliadb_telemetry import errors
@@ -41,7 +43,7 @@ from nucliadb_utils.utilities import get_storage
 
 async def _iter_keys(driver: Driver, match: str) -> AsyncGenerator[str, None]:
     async with driver.transaction(read_only=True) as keys_txn:
-        async for key in keys_txn.keys(match=match
+        async for key in keys_txn.keys(match=match):
             yield key
 
 
@@ -52,9 +54,7 @@ async def purge_kb(driver: Driver):
         try:
             kbid = key.split("/")[2]
         except Exception:
-            logger.warning(
-                f" X Skipping purge {key}, wrong key format, expected {KB_TO_DELETE_BASE}"
-            )
+            logger.warning(f" X Skipping purge {key}, wrong key format, expected {KB_TO_DELETE_BASE}")
             continue
 
         try:
@@ -62,15 +62,11 @@ async def purge_kb(driver: Driver):
             logger.info(f" √ Successfully Purged {kbid}")
         except ShardNotFound as exc:
             errors.capture_exception(exc)
-            logger.error(
-                f" X At least one shard was unavailable while purging {kbid}, skipping"
-            )
+            logger.error(f" X At least one shard was unavailable while purging {kbid}, skipping")
             continue
         except NodeError as exc:
             errors.capture_exception(exc)
-            logger.error(
-                f" X At least one node was unavailable while purging {kbid}, skipping"
-            )
+            logger.error(f" X At least one node was unavailable while purging {kbid}, skipping")
             continue
 
         except Exception as exc:
@@ -82,10 +78,10 @@ async def purge_kb(driver: Driver):
 
         # Now delete the tikv delete mark
         try:
-            txn = await driver.begin()
-            key_to_purge = KB_TO_DELETE.format(kbid=kbid)
-            await txn.delete(key_to_purge)
-            await txn.commit()
+            async with driver.transaction() as txn:
+                key_to_purge = KB_TO_DELETE.format(kbid=kbid)
+                await txn.delete(key_to_purge)
+                await txn.commit()
             logger.info(f" √ Deleted {key_to_purge}")
         except Exception as exc:
             errors.capture_exception(exc)
@@ -112,16 +108,12 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
 
         delete_marker = False
         if conflict:
-            logger.info(
-                f" . Nothing was deleted for {key}, (Bucket not yet empty), will try next time"
-            )
+            logger.info(f" . Nothing was deleted for {key}, (Bucket not yet empty), will try next time")
             # Just in case something failed while setting a lifecycle policy to
             # remove all elements from the bucket, reschedule it
             await storage.schedule_delete_kb(kbid)
         elif not deleted:
-            logger.info(
-                f" ! Expected bucket for {key} was not found, will delete marker"
-            )
+            logger.info(f" ! Expected bucket for {key} was not found, will delete marker")
             delete_marker = True
         elif deleted:
             logger.info(" √ Bucket successfully deleted")
@@ -129,19 +121,122 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
 
         if delete_marker:
             try:
-                txn = await driver.begin()
-                await txn.delete(key)
+                async with driver.transaction() as txn:
+                    await txn.delete(key)
+                    await txn.commit()
                 logger.info(f" √ Deleted storage deletion marker {key}")
             except Exception as exc:
                 errors.capture_exception(exc)
                 logger.info(f" X Error while deleting key {key}")
-                await txn.abort()
-            else:
-                await txn.commit()
 
     logger.info("FINISH PURGING KB STORAGE")
 
 
+async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
+    """
+    Remove from storage all resources marked as deleted.
+
+    Returns the number of resources purged.
+    """
+    logger.info("Starting purge of deleted resource storage")
+    to_purge = await _count_resources_storage_to_purge(driver)
+    logger.info(f"Found {to_purge} resources to purge")
+    while True:
+        try:
+            purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
+            if not purged:
+                logger.info("No more resources to purge found")
+                return
+            logger.info(f"Purged {purged} resources")
+
+        except asyncio.CancelledError:
+            logger.info("Purge of deleted resource storage was cancelled")
+            return
+
+
+async def _count_resources_storage_to_purge(driver: Driver) -> int:
+    """
+    Count the number of resources marked as deleted in storage.
+    """
+    async with driver.transaction(read_only=True) as txn:
+        return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
+
+
+async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
+    """
+    Remove from storage a batch of resources marked as deleted. Returns the
+    number of resources purged.
+    """
+    # Get the keys of the resources to delete in batches of 100
+    to_delete_batch = []
+    async with driver.transaction(read_only=True) as txn:
+        async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
+            to_delete_batch.append(key)
+
+    if not to_delete_batch:
+        return 0
+
+    # Delete the resources blobs from storage
+    logger.info(f"Purging {len(to_delete_batch)} deleted resources")
+    tasks = []
+    for key in to_delete_batch:
+        kbid, resource_id = key.split("/")[-2:]
+        tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
+    await asyncio.gather(*tasks)
+
+    # Delete the schedule-to-delete keys
+    async with driver.transaction() as txn:
+        for key in to_delete_batch:
+            await txn.delete(key)
+        await txn.commit()
+
+    return len(to_delete_batch)
+
+
+async def purge_kb_vectorsets(driver: Driver, storage: Storage):
+    """Vectors for a vectorset are stored in a key inside each resource. Iterate
+    through all resources of the KB and remove any storage object containing
+    vectors for the specific vectorset to purge.
+
+    """
+    logger.info("START PURGING KB VECTORSETS")
+
+    purged = []
+    async for key in _iter_keys(driver, KB_VECTORSET_TO_DELETE_BASE):
+        logger.info(f"Purging vectorsets {key}")
+        try:
+            _base, kbid, vectorset = key.lstrip("/").split("/")
+        except ValueError:
+            logger.info(f" X Skipping purge {key}, wrong key format, expected {KB_VECTORSET_TO_DELETE}")
+            continue
+
+        try:
+            async with driver.transaction(read_only=True) as txn:
+                kb = KnowledgeBox(txn, storage, kbid)
+                async for resource in kb.iterate_resources():
+                    fields = await resource.get_fields(force=True)
+                    # we don't need the maindb transaction anymore to remove vectors from storage
+                    for field in fields.values():
+                        await field.delete_vectors(vectorset)
+        except Exception as exc:
+            errors.capture_exception(exc)
+            logger.error(
+                f" X ERROR while executing KB vectorset purge, skipping",
+                exc_info=exc,
+                extra={"kbid": kbid},
+            )
+            continue
+
+        purged.append(key)
+
+    async with driver.transaction() as txn:
+        for key in purged:
+            await txn.delete(key)
+        await txn.commit()
+
+    logger.info("FINISH PURGING KB VECTORSETS")
+
+
 async def main():
     """
     This script will purge all knowledge boxes marked to be deleted in maindb.
@@ -153,17 +248,28 @@ async def main():
         service_name=SERVICE_NAME,
     )
     try:
+        purge_resources_storage_task = asyncio.create_task(
+            purge_deleted_resource_storage(driver, storage)
+        )
         await purge_kb(driver)
         await purge_kb_storage(driver, storage)
+        await purge_kb_vectorsets(driver, storage)
+        await purge_resources_storage_task
+    except Exception as ex:  # pragma: no cover
+        logger.exception("Unhandled exception on purge command")
+        errors.capture_exception(ex)
     finally:
-        await storage.finalize()
-        await teardown_driver()
-        await teardown_cluster()
+        try:
+            purge_resources_storage_task.cancel()
+            await storage.finalize()
+            await teardown_driver()
+            await teardown_cluster()
+        except Exception:  # pragma: no cover
+            logger.exception("Error tearing down utilities on purge command")
+            pass
 
 
 def run() -> int:  # pragma: no cover
     setup_logging()
-
-    errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
-
+    errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
     return asyncio.run(main())
```
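One recurring change in this release: version lookups move from the deprecated `pkg_resources` API to the standard library's `importlib.metadata`, both in `purge/__init__.py` above and in `purge/orphan_shards.py` below. Shown in isolation (it assumes the `nucliadb` package is installed in the running environment):

```python
import importlib.metadata

# Equivalent to the removed pkg_resources.get_distribution("nucliadb").version call
version = importlib.metadata.distribution("nucliadb").version

# importlib.metadata.version("nucliadb") is a shorter way to obtain the same string
assert version == importlib.metadata.version("nucliadb")
```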
nucliadb/purge/orphan_shards.py
CHANGED
```diff
@@ -19,11 +19,11 @@
 
 import argparse
 import asyncio
+import importlib.metadata
 from dataclasses import dataclass
 from typing import Optional
 
-import
-from grpc.aio import AioRpcError  # type: ignore
+from grpc.aio import AioRpcError
 
 from nucliadb.common import datamanagers
 from nucliadb.common.cluster import manager
@@ -86,7 +86,7 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardLocation]:
     orphan_shard_ids = indexed_shards.keys() - stored_shards.keys()
     orphan_shards: dict[str, ShardLocation] = {}
     unavailable_nodes: set[str] = set()
-    async with datamanagers.
+    async with datamanagers.with_ro_transaction() as txn:
         for shard_id in orphan_shard_ids:
             node_id = indexed_shards[shard_id].node_id
             node = manager.get_index_node(node_id)  # type: ignore
@@ -99,9 +99,7 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardLocation]:
             # Shards with knwon KB ids can be checked and ignore those comming from
             # an ongoing migration/rollover
             if kbid != UNKNOWN_KB:
-                skip = await datamanagers.rollover.is_rollover_shard(
-                    txn, kbid=kbid, shard_id=shard_id
-                )
+                skip = await datamanagers.rollover.is_rollover_shard(txn, kbid=kbid, shard_id=shard_id)
                 if skip:
                     continue
 
@@ -133,18 +131,14 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardLocation]:
         try:
             kb_shards = await shards_manager.get_shards_by_kbid(kbid)
         except ShardsNotFound:
-            logger.warning(
-                "KB not found while looking for orphan shards", extra={"kbid": kbid}
-            )
+            logger.warning("KB not found while looking for orphan shards", extra={"kbid": kbid})
             continue
         else:
             for shard_object_pb in kb_shards:
                 for shard_replica_pb in shard_object_pb.replicas:
                     shard_replica_id = shard_replica_pb.shard.id
                     node_id = shard_replica_pb.node
-                    stored_shards[shard_replica_id] = ShardLocation(
-                        kbid=kbid, node_id=node_id
-                    )
+                    stored_shards[shard_replica_id] = ShardLocation(kbid=kbid, node_id=node_id)
     return stored_shards
 
 
@@ -264,6 +258,6 @@ async def main():
 def run() -> int:  # pragma: no cover
     setup_logging()
 
-    errors.setup_error_handling(
+    errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
 
     return asyncio.run(main())
```
nucliadb/reader/__init__.py
CHANGED
```diff
@@ -29,9 +29,7 @@ API_PREFIX = "api"
 class EndpointFilter(logging.Filter):
     def filter(self, record: logging.LogRecord) -> bool:
         return (
-            record.args is not None
-            and len(record.args) >= 3
-            and record.args[2] not in ("/", "/metrics")  # type: ignore
+            record.args is not None and len(record.args) >= 3 and record.args[2] not in ("/", "/metrics")  # type: ignore
         )
 
 
```
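The `EndpointFilter` reformatted above drops access-log records for the `/` and `/metrics` paths; in uvicorn-style access records the request path is the third positional log argument, which is what `record.args[2]` inspects. A sketch of how such a filter is typically wired up, where attaching it to the `uvicorn.access` logger is an assumption rather than something this diff shows:

```python
import logging


class EndpointFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        # Keep the record only if the logged path is not "/" or "/metrics"
        return (
            record.args is not None and len(record.args) >= 3 and record.args[2] not in ("/", "/metrics")  # type: ignore
        )


# Assumed wiring: drop health-check and metrics-scrape lines from the access log
logging.getLogger("uvicorn.access").addFilter(EndpointFilter())
```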
nucliadb/reader/api/models.py
CHANGED
```diff
@@ -22,15 +22,12 @@ from typing import TYPE_CHECKING, Any, Optional, Union
 from pydantic import BaseModel
 
 import nucliadb_models as models
-from nucliadb_models.common import
+from nucliadb_models.common import FieldTypeName
 from nucliadb_models.resource import (
     ConversationFieldExtractedData,
-    DatetimeFieldExtractedData,
     Error,
     ExtractedDataType,
     FileFieldExtractedData,
-    KeywordsetFieldExtractedData,
-    LayoutFieldExtractedData,
     LinkFieldExtractedData,
     TextFieldExtractedData,
 )
@@ -41,10 +38,7 @@ if TYPE_CHECKING:  # pragma: no cover
             models.FieldText,
             models.FieldFile,
             models.FieldLink,
-            models.FieldLayout,
             models.Conversation,
-            models.FieldKeywordset,
-            models.FieldDatetime,
         ]
     ]
 else:
@@ -60,14 +54,9 @@ class ResourceField(BaseModel):
     error: Optional[Error] = None
 
 
-FIELD_NAMES_TO_PB_TYPE_MAP = {v: k for k, v in FIELD_TYPES_MAP.items()}
-
 FIELD_NAME_TO_EXTRACTED_DATA_FIELD_MAP: dict[FieldTypeName, Any] = {
     FieldTypeName.TEXT: TextFieldExtractedData,
     FieldTypeName.FILE: FileFieldExtractedData,
     FieldTypeName.LINK: LinkFieldExtractedData,
-    FieldTypeName.DATETIME: DatetimeFieldExtractedData,
-    FieldTypeName.KEYWORDSET: KeywordsetFieldExtractedData,
-    FieldTypeName.LAYOUT: LayoutFieldExtractedData,
     FieldTypeName.CONVERSATION: ConversationFieldExtractedData,
 }
```