nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/common/nidx.py
ADDED
@@ -0,0 +1,260 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
import os
|
22
|
+
from typing import Optional
|
23
|
+
|
24
|
+
from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxSearcherStub
|
25
|
+
|
26
|
+
from nucliadb.common.cluster.base import AbstractIndexNode
|
27
|
+
from nucliadb.common.cluster.settings import settings
|
28
|
+
from nucliadb.ingest.settings import DriverConfig
|
29
|
+
from nucliadb.ingest.settings import settings as ingest_settings
|
30
|
+
from nucliadb_protos.nodewriter_pb2 import (
|
31
|
+
IndexMessage,
|
32
|
+
)
|
33
|
+
from nucliadb_utils import logger
|
34
|
+
from nucliadb_utils.grpc import get_traced_grpc_channel
|
35
|
+
from nucliadb_utils.nats import NatsConnectionManager
|
36
|
+
from nucliadb_utils.settings import FileBackendConfig, indexing_settings, storage_settings
|
37
|
+
from nucliadb_utils.storages.settings import settings as extended_storage_settings
|
38
|
+
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
39
|
+
|
40
|
+
# Feature flag, read once at import time. An unset or empty variable and the
# conventional "off" spellings disable nidx; any other value enables it.
# A plain bool() on the raw string would treat "0" or "false" as enabled.
NIDX_ENABLED = os.environ.get("NIDX_ENABLED", "").strip().lower() not in (
    "",
    "0",
    "false",
    "no",
    "off",
)
|
41
|
+
|
42
|
+
|
43
|
+
class NidxUtility:
    """Common interface of the nidx utilities defined in this module.

    The two subclasses in this module assign ``api_client`` and
    ``searcher_client`` in ``initialize``. The coroutine methods below all
    raise ``NotImplementedError`` and must be overridden.
    """

    # gRPC stubs; they stay ``None`` until a subclass assigns them.
    api_client = None
    searcher_client = None

    async def initialize(self):
        """Set up the utility. Not implemented in the base class."""
        raise NotImplementedError()

    async def finalize(self):
        """Tear down the utility. Not implemented in the base class."""
        raise NotImplementedError()

    async def index(self, msg: IndexMessage) -> int:
        """Submit ``msg`` for indexing and return an integer identifier.

        Not implemented in the base class.
        """
        raise NotImplementedError()

    def wait_for_sync(self):
        """Do nothing by default; subclasses may override."""
        return None
|
58
|
+
|
59
|
+
|
60
|
+
def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
    """Build the ``<prefix>__*`` object-store settings for the active file backend.

    Args:
        prefix: Namespace prepended to every key, followed by a double underscore.
        bucket: Bucket (or local directory) name to use. When falsy, the
            backend's configured indexing bucket is used instead.

    Returns:
        A mapping describing the object store. It is empty when the configured
        file backend is none of LOCAL, GCS or S3.

    Side effects:
        For the LOCAL backend the target directory is created if missing.
    """
    backend = storage_settings.file_backend
    config: dict[str, str] = {}

    if backend == FileBackendConfig.LOCAL:
        directory_name = bucket or storage_settings.local_indexing_bucket
        directory = f"{storage_settings.local_files}/{directory_name}"
        os.makedirs(directory, exist_ok=True)

        config[f"{prefix}__OBJECT_STORE"] = "file"
        config[f"{prefix}__FILE_PATH"] = directory
        return config

    if backend == FileBackendConfig.GCS:
        target_bucket = bucket or extended_storage_settings.gcs_indexing_bucket
        config[f"{prefix}__OBJECT_STORE"] = "gcs"
        # Optional entries are only emitted when a value is configured.
        if target_bucket:
            config[f"{prefix}__BUCKET"] = target_bucket
        if storage_settings.gcs_base64_creds:
            config[f"{prefix}__BASE64_CREDS"] = storage_settings.gcs_base64_creds
        if storage_settings.gcs_endpoint_url:
            config[f"{prefix}__ENDPOINT"] = storage_settings.gcs_endpoint_url
        return config

    if backend == FileBackendConfig.S3:
        target_bucket = bucket or extended_storage_settings.s3_indexing_bucket
        config[f"{prefix}__OBJECT_STORE"] = "s3"
        if target_bucket:
            config[f"{prefix}__BUCKET"] = target_bucket
        # Credentials and region are always present, defaulting to "".
        config[f"{prefix}__CLIENT_ID"] = storage_settings.s3_client_id or ""
        config[f"{prefix}__CLIENT_SECRET"] = storage_settings.s3_client_secret or ""
        config[f"{prefix}__REGION_NAME"] = storage_settings.s3_region_name or ""
        if storage_settings.s3_endpoint:
            config[f"{prefix}__ENDPOINT"] = storage_settings.s3_endpoint

    return config
|
90
|
+
|
91
|
+
|
92
|
+
class NidxBindingUtility(NidxUtility):
    """Nidx utility implemented on top of the ``nidx_binding`` module."""

    def __init__(self):
        """Assemble the configuration handed to the binding.

        Raises:
            ValueError: if the ingest driver is not ``DriverConfig.PG``.
        """
        if ingest_settings.driver != DriverConfig.PG:
            raise ValueError("nidx_binding requires DRIVER=pg")

        binding_config = {"METADATA__DATABASE_URL": ingest_settings.driver_pg_url}
        binding_config.update(_storage_config("INDEXER", None))
        binding_config.update(_storage_config("STORAGE", "nidx"))
        self.config = binding_config

    async def initialize(self):
        """Create the binding and gRPC stubs pointing at its local ports."""
        # Local import: the binding module is only needed by this utility.
        import nidx_binding  # type: ignore

        self.binding = nidx_binding.NidxBinding(self.config)

        api_channel = get_traced_grpc_channel(f"localhost:{self.binding.api_port}", "nidx_api")
        self.api_client = NidxApiStub(api_channel)

        searcher_channel = get_traced_grpc_channel(
            f"localhost:{self.binding.searcher_port}", "nidx_searcher"
        )
        self.searcher_client = NidxSearcherStub(searcher_channel)

    async def finalize(self):
        """Drop this utility's reference to the binding."""
        del self.binding

    async def index(self, msg: IndexMessage) -> int:
        """Pass the serialized message to the binding and return its result."""
        serialized = msg.SerializeToString()
        return self.binding.index(serialized)

    def wait_for_sync(self):
        """Delegate to the binding's ``wait_for_sync``."""
        self.binding.wait_for_sync()
|
124
|
+
|
125
|
+
|
126
|
+
class NidxServiceUtility(NidxUtility):
    """Nidx utility that talks to nidx running as a network service."""

    def __init__(self):
        """Validate settings and prepare the NATS connection manager.

        Raises:
            ValueError: if the nidx indexing subject is unset, or if either
                the nidx API address or the searcher address is missing.
        """
        subject = indexing_settings.index_nidx_subject
        if subject is None:
            raise ValueError("INDEX_NIDX_SUBJECT needed for nidx utility")

        if not (settings.nidx_api_address and settings.nidx_searcher_address):
            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")

        self.nats_connection_manager = NatsConnectionManager(
            service_name="NidxIndexer",
            nats_servers=indexing_settings.index_jetstream_servers,
            nats_creds=indexing_settings.index_jetstream_auth,
        )
        self.subject = subject

    async def initialize(self):
        """Initialize the NATS connection and create both gRPC stubs."""
        await self.nats_connection_manager.initialize()

        api_channel = get_traced_grpc_channel(settings.nidx_api_address, "nidx_api")
        self.api_client = NidxApiStub(api_channel)

        searcher_channel = get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
        self.searcher_client = NidxSearcherStub(searcher_channel)

    async def finalize(self):
        """Finalize the NATS connection manager."""
        await self.nats_connection_manager.finalize()

    async def index(self, writer: IndexMessage) -> int:
        """Publish the serialized message on the configured subject.

        Returns:
            The ``seq`` value of the publish result.
        """
        payload = writer.SerializeToString()
        res = await self.nats_connection_manager.js.publish(self.subject, payload)
        logger.info(
            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {res.seq}"  # noqa
        )
        return res.seq
|
159
|
+
|
160
|
+
|
161
|
+
async def start_nidx_utility() -> Optional[NidxUtility]:
    """Create, initialize and register the nidx utility.

    Returns:
        ``None`` when ``NIDX_ENABLED`` is false. Otherwise the utility that is
        already registered, or a newly initialized one: the binding flavour in
        standalone mode, the service flavour in every other case.
    """
    if not NIDX_ENABLED:
        return None

    already_registered = get_nidx()
    if already_registered:
        return already_registered

    utility_cls = NidxBindingUtility if settings.standalone_mode else NidxServiceUtility
    nidx_utility: NidxUtility = utility_cls()

    # Register only after initialization has succeeded.
    await nidx_utility.initialize()
    set_utility(Utility.NIDX, nidx_utility)
    return nidx_utility
|
178
|
+
|
179
|
+
|
180
|
+
async def stop_nidx_utility():
    """Unregister the nidx utility, if one is registered, and finalize it."""
    active = get_nidx()
    if not active:
        return

    # The utility is removed from the registry before it is finalized.
    clean_utility(Utility.NIDX)
    await active.finalize()
|
185
|
+
|
186
|
+
|
187
|
+
def get_nidx() -> Optional[NidxUtility]:
    """Return whatever is registered under ``Utility.NIDX``."""
    registered = get_utility(Utility.NIDX)
    return registered
|
189
|
+
|
190
|
+
|
191
|
+
def get_nidx_api_client() -> Optional["NidxApiStub"]:
    """Return the registered utility's API stub, or ``None`` without a utility."""
    utility = get_nidx()
    return utility.api_client if utility else None
|
197
|
+
|
198
|
+
|
199
|
+
def get_nidx_searcher_client() -> Optional["NidxSearcherStub"]:
    """Return the registered utility's searcher stub, or ``None`` without a utility."""
    utility = get_nidx()
    return utility.searcher_client if utility else None
|
205
|
+
|
206
|
+
|
207
|
+
# TODO: Remove the index node abstraction
|
208
|
+
class NodeNidxAdapter:
    """Expose selected API and searcher client methods on a single object.

    Each listed name is read from the corresponding client at construction
    time and stored as an instance attribute of the same name.
    """

    # Names copied from the API client.
    _API_METHODS = (
        "GetShard",
        "NewShard",
        "DeleteShard",
        "ListShards",
        "AddVectorSet",
        "RemoveVectorSet",
        "ListVectorSets",
        "GetMetadata",
    )
    # Names copied from the searcher client.
    _SEARCHER_METHODS = (
        "Search",
        "Suggest",
        "Paragraphs",
        "Documents",
    )

    def __init__(self, api_client, searcher_client):
        bindings = (
            (api_client, self._API_METHODS),
            (searcher_client, self._SEARCHER_METHODS),
        )
        for client, method_names in bindings:
            for method_name in method_names:
                setattr(self, method_name, getattr(client, method_name))
|
225
|
+
|
226
|
+
|
227
|
+
class FakeNode(AbstractIndexNode):
    """Index-node facade whose reader and writer are the same nidx adapter.

    Every identity property returns the constant string ``"nidx"``.
    """

    def __init__(self, api_client, searcher_client):
        """Wrap both nidx clients in a single ``NodeNidxAdapter``."""
        self.client = NodeNidxAdapter(api_client, searcher_client)

    @property
    def reader(self):
        """The adapter; the same object as ``writer``."""
        return self.client

    @property
    def writer(self):
        """The adapter; the same object as ``reader``."""
        return self.client

    def is_read_replica(self):
        """Always ``False``."""
        return False

    @property
    def id(self):
        return "nidx"

    @property
    def address(self):
        return "nidx"

    @property
    def primary_id(self):
        return "nidx"
|
253
|
+
|
254
|
+
|
255
|
+
def get_nidx_fake_node() -> Optional[FakeNode]:
|
256
|
+
nidx = get_nidx()
|
257
|
+
if nidx:
|
258
|
+
return FakeNode(nidx.api_client, nidx.searcher_client)
|
259
|
+
else:
|
260
|
+
return None
|
@@ -17,6 +17,7 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import json
|
20
21
|
from datetime import datetime
|
21
22
|
from typing import AsyncGenerator, Union
|
22
23
|
|
@@ -53,25 +54,38 @@ class ExportImportDataManager:
|
|
53
54
|
|
54
55
|
async def get_metadata(self, type: str, kbid: str, id: str) -> Metadata:
|
55
56
|
key = self._get_maindb_metadata_key(type, kbid, id)
|
56
|
-
async with self.driver.transaction() as txn:
|
57
|
+
async with self.driver.transaction(read_only=True) as txn:
|
57
58
|
data = await txn.get(key)
|
58
59
|
if data is None or data == b"":
|
59
60
|
raise MetadataNotFound()
|
60
61
|
decoded = data.decode("utf-8")
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
62
|
+
if type == "export":
|
63
|
+
model_type = ExportMetadata
|
64
|
+
elif type == "import":
|
65
|
+
model_type = ImportMetadata # type: ignore
|
66
|
+
else:
|
67
|
+
raise ValueError(f"Invalid type: {type}")
|
68
|
+
json_decoded = json.loads(decoded)
|
69
|
+
|
70
|
+
# For some reason, the total and processed fields are not always present in the metadata.
|
71
|
+
# This is to unblock already created exports that hit this bug.
|
72
|
+
if json_decoded.get("total") is None:
|
73
|
+
json_decoded["total"] = 0
|
74
|
+
if json_decoded.get("processed") is None:
|
75
|
+
json_decoded["processed"] = 0
|
76
|
+
|
77
|
+
return model_type.model_validate(json_decoded)
|
66
78
|
|
67
79
|
async def set_metadata(
|
68
80
|
self,
|
69
81
|
type: str,
|
70
82
|
metadata: Metadata,
|
71
83
|
):
|
84
|
+
metadata.processed = metadata.processed or 0
|
85
|
+
metadata.total = metadata.total or 0
|
72
86
|
metadata.modified = datetime.utcnow()
|
73
87
|
key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
|
74
|
-
data = metadata.
|
88
|
+
data = metadata.model_dump_json().encode("utf-8")
|
75
89
|
async with self.driver.transaction() as txn:
|
76
90
|
await txn.set(key, data)
|
77
91
|
await txn.commit()
|
@@ -97,9 +111,7 @@ class ExportImportDataManager:
|
|
97
111
|
await self.storage.uploaditerator(export_bytes, field, cf)
|
98
112
|
return cf.size
|
99
113
|
|
100
|
-
async def download_export(
|
101
|
-
self, kbid: str, export_id: str
|
102
|
-
) -> AsyncGenerator[bytes, None]:
|
114
|
+
async def download_export(self, kbid: str, export_id: str) -> AsyncGenerator[bytes, None]:
|
103
115
|
key = STORAGE_EXPORT_KEY.format(export_id=export_id)
|
104
116
|
bucket = self.storage.get_bucket_name(kbid)
|
105
117
|
async for chunk in self.storage.download(bucket, key):
|
@@ -125,13 +137,9 @@ class ExportImportDataManager:
|
|
125
137
|
async for chunk in self.storage.download(bucket, key):
|
126
138
|
yield chunk
|
127
139
|
|
128
|
-
def _get_storage_field(
|
129
|
-
self, kbid: str, key: str, cf: resources_pb2.CloudFile
|
130
|
-
) -> StorageField:
|
140
|
+
def _get_storage_field(self, kbid: str, key: str, cf: resources_pb2.CloudFile) -> StorageField:
|
131
141
|
bucket = self.storage.get_bucket_name(kbid)
|
132
|
-
return self.storage.field_klass(
|
133
|
-
storage=self.storage, bucket=bucket, fullkey=key, field=cf
|
134
|
-
)
|
142
|
+
return self.storage.field_klass(storage=self.storage, bucket=bucket, fullkey=key, field=cf)
|
135
143
|
|
136
144
|
async def delete_import(self, kbid: str, import_id: str):
|
137
145
|
key = STORAGE_IMPORT_KEY.format(import_id=import_id)
|
@@ -151,6 +159,4 @@ class ExportImportDataManager:
|
|
151
159
|
await func(kbid, id)
|
152
160
|
except Exception as ex:
|
153
161
|
errors.capture_exception(ex)
|
154
|
-
logger.exception(
|
155
|
-
f"Could not delete {type} {id} from storage", extra={"kbid": kbid}
|
156
|
-
)
|
162
|
+
logger.exception(f"Could not delete {type} {id} from storage", extra={"kbid": kbid})
|
@@ -70,9 +70,7 @@ async def export_kb(
|
|
70
70
|
yield chunk
|
71
71
|
|
72
72
|
|
73
|
-
async def export_kb_to_blob_storage(
|
74
|
-
context: ApplicationContext, msg: NatsTaskMessage
|
75
|
-
) -> None:
|
73
|
+
async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMessage) -> None:
|
76
74
|
"""
|
77
75
|
Exports the data of a knowledgebox to the blob storage service.
|
78
76
|
"""
|
@@ -90,7 +88,7 @@ async def export_kb_to_blob_storage(
|
|
90
88
|
export_size = await upload_export_retried(iterator, kbid, export_id)
|
91
89
|
|
92
90
|
# Store export size
|
93
|
-
metadata.total = metadata.processed = export_size
|
91
|
+
metadata.total = metadata.processed = export_size or 0
|
94
92
|
await dm.set_metadata("export", metadata)
|
95
93
|
|
96
94
|
|
@@ -107,9 +105,7 @@ async def export_resources(
|
|
107
105
|
yield chunk
|
108
106
|
|
109
107
|
|
110
|
-
async def export_resources_resumable(
|
111
|
-
context, metadata: ExportMetadata
|
112
|
-
) -> AsyncGenerator[bytes, None]:
|
108
|
+
async def export_resources_resumable(context, metadata: ExportMetadata) -> AsyncGenerator[bytes, None]:
|
113
109
|
dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
|
114
110
|
|
115
111
|
kbid = metadata.kbid
|
@@ -200,11 +196,9 @@ async def export_learning_config(
|
|
200
196
|
) -> AsyncGenerator[bytes, None]:
|
201
197
|
lconfig = await get_learning_config(kbid)
|
202
198
|
if lconfig is None:
|
203
|
-
logger.warning(
|
204
|
-
f"No learning configuration found for kbid", extra={"kbid": kbid}
|
205
|
-
)
|
199
|
+
logger.warning(f"No learning configuration found for kbid", extra={"kbid": kbid})
|
206
200
|
return
|
207
|
-
data = lconfig.
|
201
|
+
data = lconfig.model_dump_json().encode("utf-8")
|
208
202
|
yield ExportedItemType.LEARNING_CONFIG.encode("utf-8")
|
209
203
|
yield len(data).to_bytes(4, byteorder="big")
|
210
204
|
yield data
|
@@ -89,27 +89,25 @@ async def import_kb(
|
|
89
89
|
await dm.set_metadata("import", metadata)
|
90
90
|
|
91
91
|
if metadata is not None:
|
92
|
-
metadata.processed = stream_reader.read_bytes
|
92
|
+
metadata.processed = stream_reader.read_bytes or 0
|
93
93
|
await dm.set_metadata("import", metadata)
|
94
94
|
|
95
95
|
|
96
|
-
async def import_kb_from_blob_storage(
|
97
|
-
context: ApplicationContext, msg: NatsTaskMessage
|
98
|
-
):
|
96
|
+
async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTaskMessage):
|
99
97
|
"""
|
100
98
|
Imports to a knowledgebox from an export stored in the blob storage service.
|
101
99
|
"""
|
102
100
|
kbid, import_id = msg.kbid, msg.id
|
103
101
|
dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
|
104
102
|
metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
|
105
|
-
stream = dm.download_import(kbid, import_id)
|
106
103
|
|
107
104
|
retry_handler = TaskRetryHandler("import", dm, metadata)
|
108
105
|
|
109
106
|
@retry_handler.wrap
|
110
|
-
async def import_kb_retried(context, kbid,
|
107
|
+
async def import_kb_retried(context, kbid, metadata):
|
108
|
+
stream = dm.download_import(kbid, import_id)
|
111
109
|
await import_kb(context, kbid, stream, metadata)
|
112
110
|
|
113
|
-
await import_kb_retried(context, kbid,
|
111
|
+
await import_kb_retried(context, kbid, metadata)
|
114
112
|
|
115
113
|
await dm.try_delete_from_storage("import", kbid, import_id)
|
nucliadb/export_import/models.py
CHANGED
@@ -17,7 +17,7 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
|
20
|
+
import datetime
|
21
21
|
from enum import Enum
|
22
22
|
from typing import Any
|
23
23
|
|
@@ -57,8 +57,8 @@ class Metadata(BaseModel):
|
|
57
57
|
task: TaskMetadata = TaskMetadata(status=Status.SCHEDULED)
|
58
58
|
total: int = 0
|
59
59
|
processed: int = 0
|
60
|
-
created: datetime = datetime.
|
61
|
-
modified: datetime = datetime.
|
60
|
+
created: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
|
61
|
+
modified: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
|
62
62
|
|
63
63
|
|
64
64
|
class ExportMetadata(Metadata):
|
nucliadb/export_import/tasks.py
CHANGED
@@ -32,7 +32,7 @@ def get_exports_consumer() -> NatsTaskConsumer:
|
|
32
32
|
name="exports_consumer",
|
33
33
|
stream=const.Streams.KB_EXPORTS, # type: ignore
|
34
34
|
callback=export_kb_to_blob_storage, # type: ignore
|
35
|
-
msg_type=NatsTaskMessage,
|
35
|
+
msg_type=NatsTaskMessage,
|
36
36
|
max_concurrent_messages=10,
|
37
37
|
)
|
38
38
|
|
@@ -41,7 +41,7 @@ async def get_exports_producer(context: ApplicationContext) -> NatsTaskProducer:
|
|
41
41
|
producer = create_producer(
|
42
42
|
name="exports_producer",
|
43
43
|
stream=const.Streams.KB_EXPORTS, # type: ignore
|
44
|
-
msg_type=NatsTaskMessage,
|
44
|
+
msg_type=NatsTaskMessage,
|
45
45
|
)
|
46
46
|
await producer.initialize(context)
|
47
47
|
return producer
|
@@ -52,7 +52,7 @@ def get_imports_consumer() -> NatsTaskConsumer:
|
|
52
52
|
name="imports_consumer",
|
53
53
|
stream=const.Streams.KB_IMPORTS, # type: ignore
|
54
54
|
callback=import_kb_from_blob_storage, # type: ignore
|
55
|
-
msg_type=NatsTaskMessage,
|
55
|
+
msg_type=NatsTaskMessage,
|
56
56
|
max_concurrent_messages=10,
|
57
57
|
)
|
58
58
|
|
@@ -61,7 +61,7 @@ async def get_imports_producer(context: ApplicationContext) -> NatsTaskProducer:
|
|
61
61
|
producer = create_producer(
|
62
62
|
name="imports_producer",
|
63
63
|
stream=const.Streams.KB_IMPORTS, # type: ignore
|
64
|
-
msg_type=NatsTaskMessage,
|
64
|
+
msg_type=NatsTaskMessage,
|
65
65
|
)
|
66
66
|
await producer.initialize(context)
|
67
67
|
return producer
|
nucliadb/export_import/utils.py
CHANGED
@@ -20,7 +20,6 @@
|
|
20
20
|
import functools
|
21
21
|
from typing import AsyncGenerator, AsyncIterator, Callable, Optional
|
22
22
|
|
23
|
-
import nats.errors
|
24
23
|
from google.protobuf.message import DecodeError as ProtobufDecodeError
|
25
24
|
|
26
25
|
from nucliadb import learning_proxy
|
@@ -34,10 +33,12 @@ from nucliadb.export_import.exceptions import (
|
|
34
33
|
WrongExportStreamFormat,
|
35
34
|
)
|
36
35
|
from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
|
36
|
+
from nucliadb.ingest.orm.broker_message import generate_broker_message
|
37
37
|
from nucliadb_models.export_import import Status
|
38
38
|
from nucliadb_protos import knowledgebox_pb2 as kb_pb2
|
39
39
|
from nucliadb_protos import resources_pb2, writer_pb2
|
40
40
|
from nucliadb_utils.const import Streams
|
41
|
+
from nucliadb_utils.transaction import MaxTransactionSizeExceededError
|
41
42
|
|
42
43
|
BinaryStream = AsyncGenerator[bytes, None]
|
43
44
|
BinaryStreamGenerator = Callable[[int], BinaryStream]
|
@@ -59,9 +60,6 @@ WRITER_BM_FIELDS = [
|
|
59
60
|
"files",
|
60
61
|
"texts",
|
61
62
|
"conversations",
|
62
|
-
"layouts",
|
63
|
-
"keywordsets",
|
64
|
-
"datetimes",
|
65
63
|
]
|
66
64
|
|
67
65
|
|
@@ -88,7 +86,7 @@ async def transaction_commit(
|
|
88
86
|
wait=False,
|
89
87
|
target_subject=Streams.INGEST_PROCESSED.subject,
|
90
88
|
)
|
91
|
-
except
|
89
|
+
except MaxTransactionSizeExceededError:
|
92
90
|
stored_key = await context.blob_storage.set_stream_message(
|
93
91
|
kbid=bm.kbid, rid=bm.uuid, data=bm.SerializeToString()
|
94
92
|
)
|
@@ -151,23 +149,17 @@ async def set_entities_groups(
|
|
151
149
|
context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
|
152
150
|
) -> None:
|
153
151
|
async with datamanagers.with_transaction() as txn:
|
154
|
-
await datamanagers.entities.set_entities_groups(
|
155
|
-
txn, kbid=kbid, entities_groups=entities_groups
|
156
|
-
)
|
152
|
+
await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
|
157
153
|
await txn.commit()
|
158
154
|
|
159
155
|
|
160
|
-
async def set_labels(
|
161
|
-
context: ApplicationContext, kbid: str, labels: kb_pb2.Labels
|
162
|
-
) -> None:
|
156
|
+
async def set_labels(context: ApplicationContext, kbid: str, labels: kb_pb2.Labels) -> None:
|
163
157
|
async with datamanagers.with_transaction() as txn:
|
164
158
|
await datamanagers.labels.set_labels(txn, kbid=kbid, labels=labels)
|
165
159
|
await txn.commit()
|
166
160
|
|
167
161
|
|
168
|
-
async def iter_kb_resource_uuids(
|
169
|
-
context: ApplicationContext, kbid: str
|
170
|
-
) -> AsyncGenerator[str, None]:
|
162
|
+
async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> AsyncGenerator[str, None]:
|
171
163
|
async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
|
172
164
|
yield rid
|
173
165
|
|
@@ -175,8 +167,13 @@ async def iter_kb_resource_uuids(
|
|
175
167
|
async def get_broker_message(
|
176
168
|
context: ApplicationContext, kbid: str, rid: str
|
177
169
|
) -> Optional[writer_pb2.BrokerMessage]:
|
178
|
-
async with datamanagers.
|
179
|
-
|
170
|
+
async with datamanagers.with_ro_transaction() as txn:
|
171
|
+
resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
|
172
|
+
if resource is None:
|
173
|
+
return None
|
174
|
+
resource.disable_vectors = False
|
175
|
+
resource.txn = txn
|
176
|
+
return await generate_broker_message(resource)
|
180
177
|
|
181
178
|
|
182
179
|
def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFile]:
|
@@ -184,6 +181,10 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
184
181
|
binaries: list[resources_pb2.CloudFile] = []
|
185
182
|
for file_field in bm.files.values():
|
186
183
|
if file_field.HasField("file"):
|
184
|
+
if file_field.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
|
185
|
+
# Binaries of externally hosted files are not
|
186
|
+
# to be downloaded and stored in the export file
|
187
|
+
continue
|
187
188
|
_clone_collect_cf(binaries, file_field.file)
|
188
189
|
|
189
190
|
for conversation in bm.conversations.values():
|
@@ -191,11 +192,6 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
191
192
|
for attachment in message.content.attachments:
|
192
193
|
_clone_collect_cf(binaries, attachment)
|
193
194
|
|
194
|
-
for layout in bm.layouts.values():
|
195
|
-
for block in layout.body.blocks.values():
|
196
|
-
if block.HasField("file"):
|
197
|
-
_clone_collect_cf(binaries, block.file)
|
198
|
-
|
199
195
|
for field_extracted_data in bm.file_extracted_data:
|
200
196
|
if field_extracted_data.HasField("file_thumbnail"):
|
201
197
|
_clone_collect_cf(binaries, field_extracted_data.file_thumbnail)
|
@@ -227,9 +223,7 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
227
223
|
return binaries
|
228
224
|
|
229
225
|
|
230
|
-
def _clone_collect_cf(
|
231
|
-
binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile
|
232
|
-
):
|
226
|
+
def _clone_collect_cf(binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile):
|
233
227
|
cf = resources_pb2.CloudFile()
|
234
228
|
cf.CopyFrom(origin)
|
235
229
|
# Mark the cloud file of the broker message being exported as export source
|
@@ -246,12 +240,12 @@ async def download_binary(
|
|
246
240
|
|
247
241
|
|
248
242
|
async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
|
249
|
-
async with datamanagers.
|
243
|
+
async with datamanagers.with_ro_transaction() as txn:
|
250
244
|
return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
|
251
245
|
|
252
246
|
|
253
247
|
async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
|
254
|
-
async with datamanagers.
|
248
|
+
async with datamanagers.with_ro_transaction() as txn:
|
255
249
|
return await datamanagers.labels.get_labels(txn, kbid=kbid)
|
256
250
|
|
257
251
|
|
@@ -398,7 +392,7 @@ class ExportStreamReader:
|
|
398
392
|
# Backward compatible code for old exports that don't have a learning config.
|
399
393
|
return None, type_bytes + self.stream.buffer
|
400
394
|
data = await self.read_item()
|
401
|
-
lconfig = learning_proxy.LearningConfiguration.
|
395
|
+
lconfig = learning_proxy.LearningConfiguration.model_validate_json(data)
|
402
396
|
return lconfig, self.stream.buffer
|
403
397
|
|
404
398
|
async def iter_items(self) -> AsyncGenerator[ExportItem, None]:
|
@@ -411,7 +405,7 @@ class ExportStreamReader:
|
|
411
405
|
ExportedItemType.ENTITIES: self.read_entities,
|
412
406
|
ExportedItemType.LABELS: self.read_labels,
|
413
407
|
}[item_type]
|
414
|
-
data = await read_data_func()
|
408
|
+
data = await read_data_func()
|
415
409
|
yield item_type, data
|
416
410
|
except ExportStreamExhausted:
|
417
411
|
break
|
@@ -506,21 +500,15 @@ def stream_compatible_with_kb(
|
|
506
500
|
return wrapped()
|
507
501
|
|
508
502
|
|
509
|
-
async def _check_semantic_model_compatibility(
|
510
|
-
kbid: str, stream: AsyncGenerator[bytes, None]
|
511
|
-
) -> bytes:
|
503
|
+
async def _check_semantic_model_compatibility(kbid: str, stream: AsyncGenerator[bytes, None]) -> bytes:
|
512
504
|
stream_reader = ExportStreamReader(stream)
|
513
505
|
lconfig, leftover_bytes = await stream_reader.maybe_read_learning_config()
|
514
506
|
if lconfig is None:
|
515
|
-
logger.warning(
|
516
|
-
"Learning config not found on the export stream. Export may be incompatible."
|
517
|
-
)
|
507
|
+
logger.warning("Learning config not found on the export stream. Export may be incompatible.")
|
518
508
|
return leftover_bytes
|
519
509
|
kb_lconfig = await get_learning_config(kbid)
|
520
510
|
if kb_lconfig is None:
|
521
|
-
logger.warning(
|
522
|
-
"No learning config found on the knowledge box. Export may be incompatible."
|
523
|
-
)
|
511
|
+
logger.warning("No learning config found on the knowledge box. Export may be incompatible.")
|
524
512
|
return leftover_bytes
|
525
513
|
if kb_lconfig.semantic_model == lconfig.semantic_model:
|
526
514
|
logger.info(f"Semantic model match: {kb_lconfig.semantic_model}")
|
nucliadb/health.py
CHANGED
@@ -78,9 +78,7 @@ async def grpc_health_check(health_servicer) -> None:
|
|
78
78
|
for check in _health_checks:
|
79
79
|
if not check():
|
80
80
|
logger.info(f"Health check failed on {check.__name__}")
|
81
|
-
await health_servicer.set(
|
82
|
-
"", health_pb2.HealthCheckResponse.NOT_SERVING
|
83
|
-
)
|
81
|
+
await health_servicer.set("", health_pb2.HealthCheckResponse.NOT_SERVING)
|
84
82
|
break
|
85
83
|
else:
|
86
84
|
await health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
|