nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/common/nidx.py
ADDED
@@ -0,0 +1,260 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
import os
|
22
|
+
from typing import Optional
|
23
|
+
|
24
|
+
from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxSearcherStub
|
25
|
+
|
26
|
+
from nucliadb.common.cluster.base import AbstractIndexNode
|
27
|
+
from nucliadb.common.cluster.settings import settings
|
28
|
+
from nucliadb.ingest.settings import DriverConfig
|
29
|
+
from nucliadb.ingest.settings import settings as ingest_settings
|
30
|
+
from nucliadb_protos.nodewriter_pb2 import (
|
31
|
+
IndexMessage,
|
32
|
+
)
|
33
|
+
from nucliadb_utils import logger
|
34
|
+
from nucliadb_utils.grpc import get_traced_grpc_channel
|
35
|
+
from nucliadb_utils.nats import NatsConnectionManager
|
36
|
+
from nucliadb_utils.settings import FileBackendConfig, indexing_settings, storage_settings
|
37
|
+
from nucliadb_utils.storages.settings import settings as extended_storage_settings
|
38
|
+
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
39
|
+
|
40
|
+
# Opt-in switch for nidx, read once at import time.
# An unset/empty variable and the explicit "off" spellings ("0", "false",
# "no", "off", case-insensitive, surrounding whitespace ignored) disable it;
# any other value enables it. Previously every non-empty string -- including
# "0" and "false" -- enabled nidx, which is surprising for an on/off flag.
NIDX_ENABLED = os.environ.get("NIDX_ENABLED", "").strip().lower() not in {"", "0", "false", "no", "off"}
|
41
|
+
|
42
|
+
|
43
|
+
class NidxUtility:
    """Common interface of the nidx utilities defined in this module.

    Subclasses have to provide ``initialize``, ``finalize`` and ``index``.
    ``wait_for_sync`` is optional: the default implementation does nothing.
    """

    # Client stubs. They stay ``None`` here; the subclasses in this module
    # assign them in ``initialize``.
    api_client = None
    searcher_client = None

    async def initialize(self):
        """Prepare the utility for use. Must be overridden."""
        raise NotImplementedError()

    async def finalize(self):
        """Release whatever ``initialize`` acquired. Must be overridden."""
        raise NotImplementedError()

    async def index(self, msg: IndexMessage) -> int:
        """Submit ``msg`` for indexing and return an integer. Must be overridden."""
        raise NotImplementedError()

    def wait_for_sync(self):
        """Hook for implementations that can block until synced; no-op by default."""
        pass
|
58
|
+
|
59
|
+
|
60
|
+
def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
    """Build the object-store settings for the configured file backend.

    Every key of the result has the form ``{prefix}__<NAME>``.

    Args:
        prefix: Namespace put in front of every key.
        bucket: Bucket to use. When falsy, the indexing bucket configured for
            the active backend is used instead.

    Returns:
        The settings mapping. It is empty when the file backend is none of
        LOCAL, GCS or S3.

    Side effects:
        For the LOCAL backend the target directory is created if missing.
    """
    backend = storage_settings.file_backend
    config = {}

    def key(name):
        # All settings share the same "<prefix>__" namespace.
        return f"{prefix}__{name}"

    if backend == FileBackendConfig.LOCAL:
        local_bucket = bucket or storage_settings.local_indexing_bucket
        file_path = f"{storage_settings.local_files}/{local_bucket}"
        os.makedirs(file_path, exist_ok=True)

        config[key("OBJECT_STORE")] = "file"
        config[key("FILE_PATH")] = file_path
    elif backend == FileBackendConfig.GCS:
        gcs_bucket = bucket or extended_storage_settings.gcs_indexing_bucket
        config[key("OBJECT_STORE")] = "gcs"
        # These entries are only emitted when they carry a value.
        optional_entries = (
            ("BUCKET", gcs_bucket),
            ("BASE64_CREDS", storage_settings.gcs_base64_creds),
            ("ENDPOINT", storage_settings.gcs_endpoint_url),
        )
        for name, value in optional_entries:
            if value:
                config[key(name)] = value
    elif backend == FileBackendConfig.S3:
        s3_bucket = bucket or extended_storage_settings.s3_indexing_bucket
        config[key("OBJECT_STORE")] = "s3"
        if s3_bucket:
            config[key("BUCKET")] = s3_bucket
        # Credentials and region are always present, defaulting to "".
        config[key("CLIENT_ID")] = storage_settings.s3_client_id or ""
        config[key("CLIENT_SECRET")] = storage_settings.s3_client_secret or ""
        config[key("REGION_NAME")] = storage_settings.s3_region_name or ""
        if storage_settings.s3_endpoint:
            config[key("ENDPOINT")] = storage_settings.s3_endpoint

    return config
|
90
|
+
|
91
|
+
|
92
|
+
class NidxBindingUtility(NidxUtility):
    """Implements Nidx utility using the binding"""

    def __init__(self):
        """Collect the binding configuration.

        Raises:
            ValueError: if the configured ingest driver is not PG.
        """
        if ingest_settings.driver != DriverConfig.PG:
            raise ValueError("nidx_binding requires DRIVER=pg")

        self.config = {
            "METADATA__DATABASE_URL": ingest_settings.driver_pg_url,
            **_storage_config("INDEXER", None),
            **_storage_config("STORAGE", "nidx"),
        }

    async def initialize(self):
        """Create the binding and the client stubs pointing at its local ports."""
        # Imported lazily so the module can be loaded without the binding installed.
        import nidx_binding  # type: ignore

        self.binding = nidx_binding.NidxBinding(self.config)
        self.api_client = NidxApiStub(
            get_traced_grpc_channel(f"localhost:{self.binding.api_port}", "nidx_api")
        )
        self.searcher_client = NidxSearcherStub(
            get_traced_grpc_channel(f"localhost:{self.binding.searcher_port}", "nidx_searcher")
        )

    async def finalize(self):
        """Drop the reference to the binding.

        Safe to call when ``initialize`` never ran or when ``finalize`` was
        already called: a missing ``binding`` attribute is ignored instead of
        raising ``AttributeError``.
        """
        try:
            del self.binding
        except AttributeError:
            # Nothing to release.
            pass

    async def index(self, msg: IndexMessage) -> int:
        """Hand the serialized message to the binding and return its result."""
        return self.binding.index(msg.SerializeToString())

    def wait_for_sync(self):
        """Delegate to the binding's ``wait_for_sync``."""
        self.binding.wait_for_sync()
|
124
|
+
|
125
|
+
|
126
|
+
class NidxServiceUtility(NidxUtility):
    """Implements Nidx utility connecting to the network service"""

    def __init__(self):
        """Validate the required settings and prepare the NATS connection manager.

        Raises:
            ValueError: if the nidx subject, the API address or the searcher
                address is not configured.
        """
        subject = indexing_settings.index_nidx_subject
        if subject is None:
            raise ValueError("INDEX_NIDX_SUBJECT needed for nidx utility")

        if not (settings.nidx_api_address and settings.nidx_searcher_address):
            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")

        self.nats_connection_manager = NatsConnectionManager(
            service_name="NidxIndexer",
            nats_servers=indexing_settings.index_jetstream_servers,
            nats_creds=indexing_settings.index_jetstream_auth,
        )
        self.subject = subject

    async def initialize(self):
        """Connect to NATS and create the API and searcher client stubs."""
        await self.nats_connection_manager.initialize()

        api_channel = get_traced_grpc_channel(settings.nidx_api_address, "nidx_api")
        self.api_client = NidxApiStub(api_channel)

        searcher_channel = get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
        self.searcher_client = NidxSearcherStub(searcher_channel)

    async def finalize(self):
        """Shut down the NATS connection manager."""
        await self.nats_connection_manager.finalize()

    async def index(self, writer: IndexMessage) -> int:
        """Publish the serialized message on the nidx subject.

        Returns:
            The ``seq`` of the publish acknowledgement.
        """
        publish = self.nats_connection_manager.js.publish
        res = await publish(self.subject, writer.SerializeToString())
        logger.info(
            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"  # noqa
        )
        return res.seq
|
159
|
+
|
160
|
+
|
161
|
+
async def start_nidx_utility() -> Optional[NidxUtility]:
    """Create, initialize and register the nidx utility.

    Returns:
        ``None`` when nidx is not enabled; the already registered utility when
        there is one; otherwise the newly initialized utility. Standalone mode
        uses the binding implementation, any other mode the service one.
    """
    if not NIDX_ENABLED:
        return None

    existing = get_nidx()
    if existing:
        return existing

    utility_cls = NidxBindingUtility if settings.standalone_mode else NidxServiceUtility
    nidx_utility: NidxUtility = utility_cls()

    # Register only after initialization succeeded.
    await nidx_utility.initialize()
    set_utility(Utility.NIDX, nidx_utility)
    return nidx_utility
|
178
|
+
|
179
|
+
|
180
|
+
async def stop_nidx_utility():
    """Unregister the nidx utility, if one is registered, and finalize it."""
    registered = get_nidx()
    if not registered:
        return

    # The utility is removed from the registry before it is finalized.
    clean_utility(Utility.NIDX)
    await registered.finalize()
|
185
|
+
|
186
|
+
|
187
|
+
def get_nidx() -> Optional[NidxUtility]:
    """Look up the utility registered under ``Utility.NIDX``."""
    registered = get_utility(Utility.NIDX)
    return registered
|
189
|
+
|
190
|
+
|
191
|
+
def get_nidx_api_client() -> Optional["NidxApiStub"]:
    """Return the API client of the registered nidx utility, or ``None`` without one."""
    utility = get_nidx()
    return utility.api_client if utility else None
|
197
|
+
|
198
|
+
|
199
|
+
def get_nidx_searcher_client() -> Optional["NidxSearcherStub"]:
    """Return the searcher client of the registered nidx utility, or ``None`` without one."""
    utility = get_nidx()
    return utility.searcher_client if utility else None
|
205
|
+
|
206
|
+
|
207
|
+
# TODO: Remove the index node abstraction
class NodeNidxAdapter:
    """Expose selected API and searcher client methods on a single object.

    Each listed name is read from the corresponding client at construction
    time and stored as an attribute of the same name on the adapter.
    """

    # Names taken from the API client.
    _API_METHODS = (
        "GetShard",
        "NewShard",
        "DeleteShard",
        "ListShards",
        "AddVectorSet",
        "RemoveVectorSet",
        "ListVectorSets",
        "GetMetadata",
    )
    # Names taken from the searcher client.
    _SEARCHER_METHODS = (
        "Search",
        "Suggest",
        "Paragraphs",
        "Documents",
    )

    def __init__(self, api_client, searcher_client):
        sources = (
            (api_client, self._API_METHODS),
            (searcher_client, self._SEARCHER_METHODS),
        )
        for client, method_names in sources:
            for method_name in method_names:
                setattr(self, method_name, getattr(client, method_name))
|
225
|
+
|
226
|
+
|
227
|
+
class FakeNode(AbstractIndexNode):
    """Index node stand-in whose reader and writer are the same nidx adapter."""

    def __init__(self, api_client, searcher_client):
        # The base-class initializer is not called here; identity values are
        # provided by the read-only properties below.
        self.client = NodeNidxAdapter(api_client, searcher_client)

    @property
    def reader(self):
        """The adapter, used for reads."""
        return self.client

    @property
    def writer(self):
        """The adapter, used for writes."""
        return self.client

    def is_read_replica(self):
        """Always ``False``."""
        return False

    @property
    def id(self):
        """Fixed identifier ``"nidx"``."""
        return "nidx"

    @property
    def address(self):
        """Fixed address ``"nidx"``."""
        return "nidx"

    @property
    def primary_id(self):
        """Fixed primary identifier ``"nidx"``."""
        return "nidx"
|
253
|
+
|
254
|
+
|
255
|
+
def get_nidx_fake_node() -> Optional[FakeNode]:
    """Wrap the registered nidx utility's clients in a ``FakeNode``.

    Returns ``None`` when no nidx utility is registered.
    """
    utility = get_nidx()
    if not utility:
        return None
    return FakeNode(utility.api_client, utility.searcher_client)
|
@@ -17,6 +17,7 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import json
|
20
21
|
from datetime import datetime
|
21
22
|
from typing import AsyncGenerator, Union
|
22
23
|
|
@@ -53,25 +54,38 @@ class ExportImportDataManager:
|
|
53
54
|
|
54
55
|
async def get_metadata(self, type: str, kbid: str, id: str) -> Metadata:
|
55
56
|
key = self._get_maindb_metadata_key(type, kbid, id)
|
56
|
-
async with self.driver.transaction() as txn:
|
57
|
+
async with self.driver.transaction(read_only=True) as txn:
|
57
58
|
data = await txn.get(key)
|
58
59
|
if data is None or data == b"":
|
59
60
|
raise MetadataNotFound()
|
60
61
|
decoded = data.decode("utf-8")
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
if type == "export":
|
63
|
+
model_type = ExportMetadata
|
64
|
+
elif type == "import":
|
65
|
+
model_type = ImportMetadata # type: ignore
|
66
|
+
else:
|
67
|
+
raise ValueError(f"Invalid type: {type}")
|
68
|
+
json_decoded = json.loads(decoded)
|
69
|
+
|
70
|
+
# For some reason, the total and processed fields are not always present in the metadata.
|
71
|
+
# This is to unblock already created exports that hit this bug.
|
72
|
+
if json_decoded.get("total") is None:
|
73
|
+
json_decoded["total"] = 0
|
74
|
+
if json_decoded.get("processed") is None:
|
75
|
+
json_decoded["processed"] = 0
|
76
|
+
|
77
|
+
return model_type.model_validate(json_decoded)
|
66
78
|
|
67
79
|
async def set_metadata(
|
68
80
|
self,
|
69
81
|
type: str,
|
70
82
|
metadata: Metadata,
|
71
83
|
):
|
84
|
+
metadata.processed = metadata.processed or 0
|
85
|
+
metadata.total = metadata.total or 0
|
72
86
|
metadata.modified = datetime.utcnow()
|
73
87
|
key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
|
74
|
-
data = metadata.
|
88
|
+
data = metadata.model_dump_json().encode("utf-8")
|
75
89
|
async with self.driver.transaction() as txn:
|
76
90
|
await txn.set(key, data)
|
77
91
|
await txn.commit()
|
@@ -97,9 +111,7 @@ class ExportImportDataManager:
|
|
97
111
|
await self.storage.uploaditerator(export_bytes, field, cf)
|
98
112
|
return cf.size
|
99
113
|
|
100
|
-
async def download_export(
|
101
|
-
self, kbid: str, export_id: str
|
102
|
-
) -> AsyncGenerator[bytes, None]:
|
114
|
+
async def download_export(self, kbid: str, export_id: str) -> AsyncGenerator[bytes, None]:
|
103
115
|
key = STORAGE_EXPORT_KEY.format(export_id=export_id)
|
104
116
|
bucket = self.storage.get_bucket_name(kbid)
|
105
117
|
async for chunk in self.storage.download(bucket, key):
|
@@ -125,13 +137,9 @@ class ExportImportDataManager:
|
|
125
137
|
async for chunk in self.storage.download(bucket, key):
|
126
138
|
yield chunk
|
127
139
|
|
128
|
-
def _get_storage_field(
|
129
|
-
self, kbid: str, key: str, cf: resources_pb2.CloudFile
|
130
|
-
) -> StorageField:
|
140
|
+
def _get_storage_field(self, kbid: str, key: str, cf: resources_pb2.CloudFile) -> StorageField:
|
131
141
|
bucket = self.storage.get_bucket_name(kbid)
|
132
|
-
return self.storage.field_klass(
|
133
|
-
storage=self.storage, bucket=bucket, fullkey=key, field=cf
|
134
|
-
)
|
142
|
+
return self.storage.field_klass(storage=self.storage, bucket=bucket, fullkey=key, field=cf)
|
135
143
|
|
136
144
|
async def delete_import(self, kbid: str, import_id: str):
|
137
145
|
key = STORAGE_IMPORT_KEY.format(import_id=import_id)
|
@@ -151,6 +159,4 @@ class ExportImportDataManager:
|
|
151
159
|
await func(kbid, id)
|
152
160
|
except Exception as ex:
|
153
161
|
errors.capture_exception(ex)
|
154
|
-
logger.exception(
|
155
|
-
f"Could not delete {type} {id} from storage", extra={"kbid": kbid}
|
156
|
-
)
|
162
|
+
logger.exception(f"Could not delete {type} {id} from storage", extra={"kbid": kbid})
|
@@ -35,6 +35,7 @@ from nucliadb.export_import.utils import (
|
|
35
35
|
get_cloud_files,
|
36
36
|
get_entities,
|
37
37
|
get_labels,
|
38
|
+
get_learning_config,
|
38
39
|
iter_kb_resource_uuids,
|
39
40
|
)
|
40
41
|
from nucliadb_protos import writer_pb2
|
@@ -51,6 +52,9 @@ async def export_kb(
|
|
51
52
|
|
52
53
|
If a metadata object is provided, uses it to resume the export if it was interrupted.
|
53
54
|
"""
|
55
|
+
async for chunk in export_learning_config(kbid):
|
56
|
+
yield chunk
|
57
|
+
|
54
58
|
resources_iterator = export_resources(context, kbid)
|
55
59
|
if metadata is not None:
|
56
60
|
assert metadata.kbid == kbid
|
@@ -66,9 +70,7 @@ async def export_kb(
|
|
66
70
|
yield chunk
|
67
71
|
|
68
72
|
|
69
|
-
async def export_kb_to_blob_storage(
|
70
|
-
context: ApplicationContext, msg: NatsTaskMessage
|
71
|
-
) -> None:
|
73
|
+
async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMessage) -> None:
|
72
74
|
"""
|
73
75
|
Exports the data of a knowledgebox to the blob storage service.
|
74
76
|
"""
|
@@ -86,7 +88,7 @@ async def export_kb_to_blob_storage(
|
|
86
88
|
export_size = await upload_export_retried(iterator, kbid, export_id)
|
87
89
|
|
88
90
|
# Store export size
|
89
|
-
metadata.total = metadata.processed = export_size
|
91
|
+
metadata.total = metadata.processed = export_size or 0
|
90
92
|
await dm.set_metadata("export", metadata)
|
91
93
|
|
92
94
|
|
@@ -103,9 +105,7 @@ async def export_resources(
|
|
103
105
|
yield chunk
|
104
106
|
|
105
107
|
|
106
|
-
async def export_resources_resumable(
|
107
|
-
context, metadata: ExportMetadata
|
108
|
-
) -> AsyncGenerator[bytes, None]:
|
108
|
+
async def export_resources_resumable(context, metadata: ExportMetadata) -> AsyncGenerator[bytes, None]:
|
109
109
|
dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
|
110
110
|
|
111
111
|
kbid = metadata.kbid
|
@@ -189,3 +189,16 @@ async def export_labels(
|
|
189
189
|
yield ExportedItemType.LABELS.encode("utf-8")
|
190
190
|
yield len(data).to_bytes(4, byteorder="big")
|
191
191
|
yield data
|
192
|
+
|
193
|
+
|
194
|
+
async def export_learning_config(
|
195
|
+
kbid: str,
|
196
|
+
) -> AsyncGenerator[bytes, None]:
|
197
|
+
lconfig = await get_learning_config(kbid)
|
198
|
+
if lconfig is None:
|
199
|
+
logger.warning(f"No learning configuration found for kbid", extra={"kbid": kbid})
|
200
|
+
return
|
201
|
+
data = lconfig.model_dump_json().encode("utf-8")
|
202
|
+
yield ExportedItemType.LEARNING_CONFIG.encode("utf-8")
|
203
|
+
yield len(data).to_bytes(4, byteorder="big")
|
204
|
+
yield data
|
@@ -28,9 +28,7 @@ from nucliadb.export_import.models import (
|
|
28
28
|
NatsTaskMessage,
|
29
29
|
)
|
30
30
|
from nucliadb.export_import.utils import (
|
31
|
-
ExportStream,
|
32
31
|
ExportStreamReader,
|
33
|
-
IteratorExportStream,
|
34
32
|
TaskRetryHandler,
|
35
33
|
import_binary,
|
36
34
|
import_broker_message,
|
@@ -47,7 +45,7 @@ BinaryStreamGenerator = Callable[[int], BinaryStream]
|
|
47
45
|
async def import_kb(
|
48
46
|
context: ApplicationContext,
|
49
47
|
kbid: str,
|
50
|
-
stream:
|
48
|
+
stream: AsyncGenerator[bytes, None],
|
51
49
|
metadata: Optional[ImportMetadata] = None,
|
52
50
|
) -> None:
|
53
51
|
"""
|
@@ -91,28 +89,25 @@ async def import_kb(
|
|
91
89
|
await dm.set_metadata("import", metadata)
|
92
90
|
|
93
91
|
if metadata is not None:
|
94
|
-
metadata.processed = stream_reader.read_bytes
|
92
|
+
metadata.processed = stream_reader.read_bytes or 0
|
95
93
|
await dm.set_metadata("import", metadata)
|
96
94
|
|
97
95
|
|
98
|
-
async def import_kb_from_blob_storage(
|
99
|
-
context: ApplicationContext, msg: NatsTaskMessage
|
100
|
-
):
|
96
|
+
async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTaskMessage):
|
101
97
|
"""
|
102
98
|
Imports to a knowledgebox from an export stored in the blob storage service.
|
103
99
|
"""
|
104
100
|
kbid, import_id = msg.kbid, msg.id
|
105
101
|
dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
|
106
102
|
metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
|
107
|
-
iterator = dm.download_import(kbid, import_id)
|
108
|
-
stream = IteratorExportStream(iterator)
|
109
103
|
|
110
104
|
retry_handler = TaskRetryHandler("import", dm, metadata)
|
111
105
|
|
112
106
|
@retry_handler.wrap
|
113
|
-
async def import_kb_retried(context, kbid,
|
107
|
+
async def import_kb_retried(context, kbid, metadata):
|
108
|
+
stream = dm.download_import(kbid, import_id)
|
114
109
|
await import_kb(context, kbid, stream, metadata)
|
115
110
|
|
116
|
-
await import_kb_retried(context, kbid,
|
111
|
+
await import_kb_retried(context, kbid, metadata)
|
117
112
|
|
118
113
|
await dm.try_delete_from_storage("import", kbid, import_id)
|
nucliadb/export_import/models.py
CHANGED
@@ -17,7 +17,7 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
|
20
|
+
import datetime
|
21
21
|
from enum import Enum
|
22
22
|
from typing import Any
|
23
23
|
|
@@ -36,6 +36,7 @@ class ExportedItemType(str, Enum):
|
|
36
36
|
LABELS = "LAB"
|
37
37
|
ENTITIES = "ENT"
|
38
38
|
BINARY = "BIN"
|
39
|
+
LEARNING_CONFIG = "LEA"
|
39
40
|
|
40
41
|
|
41
42
|
ExportItem = tuple[ExportedItemType, Any]
|
@@ -56,8 +57,8 @@ class Metadata(BaseModel):
|
|
56
57
|
task: TaskMetadata = TaskMetadata(status=Status.SCHEDULED)
|
57
58
|
total: int = 0
|
58
59
|
processed: int = 0
|
59
|
-
created: datetime = datetime.
|
60
|
-
modified: datetime = datetime.
|
60
|
+
created: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
|
61
|
+
modified: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
|
61
62
|
|
62
63
|
|
63
64
|
class ExportMetadata(Metadata):
|
@@ -65,8 +66,7 @@ class ExportMetadata(Metadata):
|
|
65
66
|
exported_resources: list[str] = list()
|
66
67
|
|
67
68
|
|
68
|
-
class ImportMetadata(Metadata):
|
69
|
-
...
|
69
|
+
class ImportMetadata(Metadata): ...
|
70
70
|
|
71
71
|
|
72
72
|
class NatsTaskMessage(BaseModel):
|
nucliadb/export_import/tasks.py
CHANGED
@@ -32,7 +32,7 @@ def get_exports_consumer() -> NatsTaskConsumer:
|
|
32
32
|
name="exports_consumer",
|
33
33
|
stream=const.Streams.KB_EXPORTS, # type: ignore
|
34
34
|
callback=export_kb_to_blob_storage, # type: ignore
|
35
|
-
msg_type=NatsTaskMessage,
|
35
|
+
msg_type=NatsTaskMessage,
|
36
36
|
max_concurrent_messages=10,
|
37
37
|
)
|
38
38
|
|
@@ -41,7 +41,7 @@ async def get_exports_producer(context: ApplicationContext) -> NatsTaskProducer:
|
|
41
41
|
producer = create_producer(
|
42
42
|
name="exports_producer",
|
43
43
|
stream=const.Streams.KB_EXPORTS, # type: ignore
|
44
|
-
msg_type=NatsTaskMessage,
|
44
|
+
msg_type=NatsTaskMessage,
|
45
45
|
)
|
46
46
|
await producer.initialize(context)
|
47
47
|
return producer
|
@@ -52,7 +52,7 @@ def get_imports_consumer() -> NatsTaskConsumer:
|
|
52
52
|
name="imports_consumer",
|
53
53
|
stream=const.Streams.KB_IMPORTS, # type: ignore
|
54
54
|
callback=import_kb_from_blob_storage, # type: ignore
|
55
|
-
msg_type=NatsTaskMessage,
|
55
|
+
msg_type=NatsTaskMessage,
|
56
56
|
max_concurrent_messages=10,
|
57
57
|
)
|
58
58
|
|
@@ -61,7 +61,7 @@ async def get_imports_producer(context: ApplicationContext) -> NatsTaskProducer:
|
|
61
61
|
producer = create_producer(
|
62
62
|
name="imports_producer",
|
63
63
|
stream=const.Streams.KB_IMPORTS, # type: ignore
|
64
|
-
msg_type=NatsTaskMessage,
|
64
|
+
msg_type=NatsTaskMessage,
|
65
65
|
)
|
66
66
|
await producer.initialize(context)
|
67
67
|
return producer
|