nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/export_import/utils.py
CHANGED
@@ -18,25 +18,27 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import functools
|
21
|
-
from io import BytesIO
|
22
21
|
from typing import AsyncGenerator, AsyncIterator, Callable, Optional
|
23
22
|
|
24
|
-
import nats.errors
|
25
23
|
from google.protobuf.message import DecodeError as ProtobufDecodeError
|
26
24
|
|
25
|
+
from nucliadb import learning_proxy
|
27
26
|
from nucliadb.common import datamanagers
|
28
27
|
from nucliadb.common.context import ApplicationContext
|
29
28
|
from nucliadb.export_import import logger
|
30
29
|
from nucliadb.export_import.datamanager import ExportImportDataManager
|
31
30
|
from nucliadb.export_import.exceptions import (
|
32
31
|
ExportStreamExhausted,
|
32
|
+
IncompatibleExport,
|
33
33
|
WrongExportStreamFormat,
|
34
34
|
)
|
35
35
|
from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
|
36
|
+
from nucliadb.ingest.orm.broker_message import generate_broker_message
|
36
37
|
from nucliadb_models.export_import import Status
|
37
38
|
from nucliadb_protos import knowledgebox_pb2 as kb_pb2
|
38
39
|
from nucliadb_protos import resources_pb2, writer_pb2
|
39
40
|
from nucliadb_utils.const import Streams
|
41
|
+
from nucliadb_utils.transaction import MaxTransactionSizeExceededError
|
40
42
|
|
41
43
|
BinaryStream = AsyncGenerator[bytes, None]
|
42
44
|
BinaryStreamGenerator = Callable[[int], BinaryStream]
|
@@ -50,7 +52,6 @@ PROCESSING_BM_FIELDS = [
|
|
50
52
|
"field_metadata",
|
51
53
|
"field_vectors",
|
52
54
|
"field_large_metadata",
|
53
|
-
"user_vectors",
|
54
55
|
]
|
55
56
|
|
56
57
|
# Broker message fields that are populated by the nucliadb writer component
|
@@ -59,9 +60,6 @@ WRITER_BM_FIELDS = [
|
|
59
60
|
"files",
|
60
61
|
"texts",
|
61
62
|
"conversations",
|
62
|
-
"layouts",
|
63
|
-
"keywordsets",
|
64
|
-
"datetimes",
|
65
63
|
]
|
66
64
|
|
67
65
|
|
@@ -88,7 +86,7 @@ async def transaction_commit(
|
|
88
86
|
wait=False,
|
89
87
|
target_subject=Streams.INGEST_PROCESSED.subject,
|
90
88
|
)
|
91
|
-
except
|
89
|
+
except MaxTransactionSizeExceededError:
|
92
90
|
stored_key = await context.blob_storage.set_stream_message(
|
93
91
|
kbid=bm.kbid, rid=bm.uuid, data=bm.SerializeToString()
|
94
92
|
)
|
@@ -151,23 +149,17 @@ async def set_entities_groups(
|
|
151
149
|
context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
|
152
150
|
) -> None:
|
153
151
|
async with datamanagers.with_transaction() as txn:
|
154
|
-
await datamanagers.entities.set_entities_groups(
|
155
|
-
txn, kbid=kbid, entities_groups=entities_groups
|
156
|
-
)
|
152
|
+
await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
|
157
153
|
await txn.commit()
|
158
154
|
|
159
155
|
|
160
|
-
async def set_labels(
|
161
|
-
context: ApplicationContext, kbid: str, labels: kb_pb2.Labels
|
162
|
-
) -> None:
|
156
|
+
async def set_labels(context: ApplicationContext, kbid: str, labels: kb_pb2.Labels) -> None:
|
163
157
|
async with datamanagers.with_transaction() as txn:
|
164
158
|
await datamanagers.labels.set_labels(txn, kbid=kbid, labels=labels)
|
165
159
|
await txn.commit()
|
166
160
|
|
167
161
|
|
168
|
-
async def iter_kb_resource_uuids(
|
169
|
-
context: ApplicationContext, kbid: str
|
170
|
-
) -> AsyncGenerator[str, None]:
|
162
|
+
async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> AsyncGenerator[str, None]:
|
171
163
|
async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
|
172
164
|
yield rid
|
173
165
|
|
@@ -175,8 +167,13 @@ async def iter_kb_resource_uuids(
|
|
175
167
|
async def get_broker_message(
|
176
168
|
context: ApplicationContext, kbid: str, rid: str
|
177
169
|
) -> Optional[writer_pb2.BrokerMessage]:
|
178
|
-
async with datamanagers.
|
179
|
-
|
170
|
+
async with datamanagers.with_ro_transaction() as txn:
|
171
|
+
resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
|
172
|
+
if resource is None:
|
173
|
+
return None
|
174
|
+
resource.disable_vectors = False
|
175
|
+
resource.txn = txn
|
176
|
+
return await generate_broker_message(resource)
|
180
177
|
|
181
178
|
|
182
179
|
def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFile]:
|
@@ -184,6 +181,10 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
184
181
|
binaries: list[resources_pb2.CloudFile] = []
|
185
182
|
for file_field in bm.files.values():
|
186
183
|
if file_field.HasField("file"):
|
184
|
+
if file_field.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
|
185
|
+
# Binaries of externally hosted files are not
|
186
|
+
# to be downloaded and stored in the export file
|
187
|
+
continue
|
187
188
|
_clone_collect_cf(binaries, file_field.file)
|
188
189
|
|
189
190
|
for conversation in bm.conversations.values():
|
@@ -191,11 +192,6 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
191
192
|
for attachment in message.content.attachments:
|
192
193
|
_clone_collect_cf(binaries, attachment)
|
193
194
|
|
194
|
-
for layout in bm.layouts.values():
|
195
|
-
for block in layout.body.blocks.values():
|
196
|
-
if block.HasField("file"):
|
197
|
-
_clone_collect_cf(binaries, block.file)
|
198
|
-
|
199
195
|
for field_extracted_data in bm.file_extracted_data:
|
200
196
|
if field_extracted_data.HasField("file_thumbnail"):
|
201
197
|
_clone_collect_cf(binaries, field_extracted_data.file_thumbnail)
|
@@ -213,6 +209,8 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
213
209
|
_clone_collect_cf(binaries, link_extracted_data.link_preview)
|
214
210
|
if link_extracted_data.HasField("link_image"):
|
215
211
|
_clone_collect_cf(binaries, link_extracted_data.link_image)
|
212
|
+
for file_generated in link_extracted_data.file_generated.values():
|
213
|
+
_clone_collect_cf(binaries, file_generated)
|
216
214
|
|
217
215
|
for field_metadata in bm.field_metadata:
|
218
216
|
if field_metadata.metadata.metadata.HasField("thumbnail"):
|
@@ -225,9 +223,7 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
|
|
225
223
|
return binaries
|
226
224
|
|
227
225
|
|
228
|
-
def _clone_collect_cf(
|
229
|
-
binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile
|
230
|
-
):
|
226
|
+
def _clone_collect_cf(binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile):
|
231
227
|
cf = resources_pb2.CloudFile()
|
232
228
|
cf.CopyFrom(origin)
|
233
229
|
# Mark the cloud file of the broker message being exported as export source
|
@@ -244,43 +240,21 @@ async def download_binary(
|
|
244
240
|
|
245
241
|
|
246
242
|
async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
|
247
|
-
async with datamanagers.
|
243
|
+
async with datamanagers.with_ro_transaction() as txn:
|
248
244
|
return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
|
249
245
|
|
250
246
|
|
251
247
|
async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
|
252
|
-
async with datamanagers.
|
248
|
+
async with datamanagers.with_ro_transaction() as txn:
|
253
249
|
return await datamanagers.labels.get_labels(txn, kbid=kbid)
|
254
250
|
|
255
251
|
|
256
|
-
class EndOfStream(Exception):
|
257
|
-
...
|
252
|
+
class EndOfStream(Exception): ...
|
258
253
|
|
259
254
|
|
260
255
|
class ExportStream:
|
261
256
|
"""
|
262
257
|
Models a stream of export bytes that can be read from asynchronously.
|
263
|
-
"""
|
264
|
-
|
265
|
-
def __init__(self, export: BytesIO):
|
266
|
-
self.export = export
|
267
|
-
self.read_bytes = 0
|
268
|
-
self._length = len(export.getvalue())
|
269
|
-
|
270
|
-
async def read(self, n_bytes):
|
271
|
-
"""
|
272
|
-
Reads n_bytes from the export stream.
|
273
|
-
Raises ExportStreamExhausted if there are no more bytes to read.
|
274
|
-
"""
|
275
|
-
if self.read_bytes == self._length:
|
276
|
-
raise ExportStreamExhausted()
|
277
|
-
chunk = self.export.read(n_bytes)
|
278
|
-
self.read_bytes += len(chunk)
|
279
|
-
return chunk
|
280
|
-
|
281
|
-
|
282
|
-
class IteratorExportStream(ExportStream):
|
283
|
-
"""
|
284
258
|
Adapts the parent class to be able to read bytes yielded from an async iterator.
|
285
259
|
"""
|
286
260
|
|
@@ -325,8 +299,8 @@ class ExportStreamReader:
|
|
325
299
|
yields the deserialized export items ready to be imported.
|
326
300
|
"""
|
327
301
|
|
328
|
-
def __init__(self,
|
329
|
-
self.stream =
|
302
|
+
def __init__(self, stream: AsyncGenerator[bytes, None]):
|
303
|
+
self.stream = ExportStream(stream)
|
330
304
|
|
331
305
|
@property
|
332
306
|
def read_bytes(self) -> int:
|
@@ -401,6 +375,26 @@ class ExportStreamReader:
|
|
401
375
|
raise WrongExportStreamFormat() from ex
|
402
376
|
return labels
|
403
377
|
|
378
|
+
async def maybe_read_learning_config(
|
379
|
+
self,
|
380
|
+
) -> tuple[Optional[learning_proxy.LearningConfiguration], bytes]:
|
381
|
+
"""
|
382
|
+
Tries to read a learning config from the beginning of the stream.
|
383
|
+
Returns the learning config if found. It also returns any leftover bytes that
|
384
|
+
may have been read from the network into memory that need to be yielded and imported.
|
385
|
+
"""
|
386
|
+
# We assume that the learning config is the first item in the export stream.
|
387
|
+
try:
|
388
|
+
type_bytes = await self.stream.read(3)
|
389
|
+
except ExportStreamExhausted:
|
390
|
+
return None, self.stream.buffer
|
391
|
+
if type_bytes != ExportedItemType.LEARNING_CONFIG.value.encode():
|
392
|
+
# Backward compatible code for old exports that don't have a learning config.
|
393
|
+
return None, type_bytes + self.stream.buffer
|
394
|
+
data = await self.read_item()
|
395
|
+
lconfig = learning_proxy.LearningConfiguration.model_validate_json(data)
|
396
|
+
return lconfig, self.stream.buffer
|
397
|
+
|
404
398
|
async def iter_items(self) -> AsyncGenerator[ExportItem, None]:
|
405
399
|
while True:
|
406
400
|
try:
|
@@ -411,7 +405,7 @@ class ExportStreamReader:
|
|
411
405
|
ExportedItemType.ENTITIES: self.read_entities,
|
412
406
|
ExportedItemType.LABELS: self.read_labels,
|
413
407
|
}[item_type]
|
414
|
-
data = await read_data_func()
|
408
|
+
data = await read_data_func()
|
415
409
|
yield item_type, data
|
416
410
|
except ExportStreamExhausted:
|
417
411
|
break
|
@@ -476,3 +470,49 @@ class TaskRetryHandler:
|
|
476
470
|
await self.dm.set_metadata(self.type, metadata)
|
477
471
|
|
478
472
|
return wrapper
|
473
|
+
|
474
|
+
|
475
|
+
async def get_learning_config(
|
476
|
+
kbid: str,
|
477
|
+
) -> Optional[learning_proxy.LearningConfiguration]:
|
478
|
+
return await learning_proxy.get_configuration(kbid)
|
479
|
+
|
480
|
+
|
481
|
+
def stream_compatible_with_kb(
|
482
|
+
kbid: str, stream: AsyncGenerator[bytes, None]
|
483
|
+
) -> AsyncGenerator[bytes, None]:
|
484
|
+
"""
|
485
|
+
Wrapper around an export stream that checks if the export is compatible with the destination knowledge box.
|
486
|
+
"""
|
487
|
+
|
488
|
+
async def wrapped() -> AsyncGenerator[bytes, None]:
|
489
|
+
# Read a few bytes from the beginning of the stream to check the semantic model.
|
490
|
+
# If the semantic model is not compatible, raise an exception.
|
491
|
+
# If there are leftover bytes, yield them.
|
492
|
+
leftover_bytes = await _check_semantic_model_compatibility(kbid, stream)
|
493
|
+
if len(leftover_bytes) > 0:
|
494
|
+
yield leftover_bytes
|
495
|
+
|
496
|
+
# Now yield the rest of the stream
|
497
|
+
async for chunk in stream:
|
498
|
+
yield chunk
|
499
|
+
|
500
|
+
return wrapped()
|
501
|
+
|
502
|
+
|
503
|
+
async def _check_semantic_model_compatibility(kbid: str, stream: AsyncGenerator[bytes, None]) -> bytes:
|
504
|
+
stream_reader = ExportStreamReader(stream)
|
505
|
+
lconfig, leftover_bytes = await stream_reader.maybe_read_learning_config()
|
506
|
+
if lconfig is None:
|
507
|
+
logger.warning("Learning config not found on the export stream. Export may be incompatible.")
|
508
|
+
return leftover_bytes
|
509
|
+
kb_lconfig = await get_learning_config(kbid)
|
510
|
+
if kb_lconfig is None:
|
511
|
+
logger.warning("No learning config found on the knowledge box. Export may be incompatible.")
|
512
|
+
return leftover_bytes
|
513
|
+
if kb_lconfig.semantic_model == lconfig.semantic_model:
|
514
|
+
logger.info(f"Semantic model match: {kb_lconfig.semantic_model}")
|
515
|
+
return leftover_bytes
|
516
|
+
raise IncompatibleExport(
|
517
|
+
f"Cannot import. Semantic model mismatch: {kb_lconfig.semantic_model} != {lconfig.semantic_model}"
|
518
|
+
)
|
nucliadb/health.py
CHANGED
@@ -78,9 +78,7 @@ async def grpc_health_check(health_servicer) -> None:
|
|
78
78
|
for check in _health_checks:
|
79
79
|
if not check():
|
80
80
|
logger.info(f"Health check failed on {check.__name__}")
|
81
|
-
await health_servicer.set(
|
82
|
-
"", health_pb2.HealthCheckResponse.NOT_SERVING
|
83
|
-
)
|
81
|
+
await health_servicer.set("", health_pb2.HealthCheckResponse.NOT_SERVING)
|
84
82
|
break
|
85
83
|
else:
|
86
84
|
await health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
|
nucliadb/ingest/app.py
CHANGED
@@ -18,10 +18,9 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import asyncio
|
21
|
+
import importlib.metadata
|
21
22
|
from typing import Awaitable, Callable
|
22
23
|
|
23
|
-
import pkg_resources
|
24
|
-
|
25
24
|
from nucliadb import health
|
26
25
|
from nucliadb.common.cluster.discovery.utils import (
|
27
26
|
setup_cluster_discovery,
|
@@ -30,10 +29,12 @@ from nucliadb.common.cluster.discovery.utils import (
|
|
30
29
|
from nucliadb.common.cluster.settings import settings as cluster_settings
|
31
30
|
from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
|
32
31
|
from nucliadb.common.context import ApplicationContext
|
32
|
+
from nucliadb.common.nidx import start_nidx_utility
|
33
33
|
from nucliadb.export_import.tasks import get_exports_consumer, get_imports_consumer
|
34
34
|
from nucliadb.ingest import SERVICE_NAME
|
35
35
|
from nucliadb.ingest.consumer import service as consumer_service
|
36
36
|
from nucliadb.ingest.partitions import assign_partitions
|
37
|
+
from nucliadb.ingest.processing import start_processing_engine, stop_processing_engine
|
37
38
|
from nucliadb.ingest.service import start_grpc
|
38
39
|
from nucliadb.ingest.settings import settings
|
39
40
|
from nucliadb_telemetry import errors
|
@@ -46,10 +47,12 @@ from nucliadb_utils.utilities import (
|
|
46
47
|
start_audit_utility,
|
47
48
|
start_indexing_utility,
|
48
49
|
start_nats_manager,
|
50
|
+
start_partitioning_utility,
|
49
51
|
start_transaction_utility,
|
50
52
|
stop_audit_utility,
|
51
53
|
stop_indexing_utility,
|
52
54
|
stop_nats_manager,
|
55
|
+
stop_partitioning_utility,
|
53
56
|
stop_transaction_utility,
|
54
57
|
)
|
55
58
|
|
@@ -59,15 +62,17 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
|
|
59
62
|
|
60
63
|
await setup_cluster()
|
61
64
|
await start_transaction_utility(SERVICE_NAME)
|
62
|
-
if
|
63
|
-
not cluster_settings.standalone_mode
|
64
|
-
and indexing_settings.index_jetstream_servers is not None
|
65
|
-
):
|
65
|
+
if not cluster_settings.standalone_mode and indexing_settings.index_jetstream_servers is not None:
|
66
66
|
await start_indexing_utility(SERVICE_NAME)
|
67
67
|
|
68
|
+
start_partitioning_utility()
|
69
|
+
|
70
|
+
await start_nidx_utility()
|
71
|
+
|
68
72
|
await start_audit_utility(SERVICE_NAME)
|
69
73
|
|
70
74
|
finalizers = [
|
75
|
+
stop_partitioning_utility,
|
71
76
|
stop_transaction_utility,
|
72
77
|
stop_indexing_utility,
|
73
78
|
stop_audit_utility,
|
@@ -123,8 +128,7 @@ async def main_consumer(): # pragma: no cover
|
|
123
128
|
ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
|
124
129
|
|
125
130
|
await run_until_exit(
|
126
|
-
[grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown]
|
127
|
-
+ finalizers
|
131
|
+
[grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown] + finalizers
|
128
132
|
)
|
129
133
|
|
130
134
|
|
@@ -138,12 +142,13 @@ async def main_orm_grpc(): # pragma: no cover
|
|
138
142
|
async def main_ingest_processed_consumer(): # pragma: no cover
|
139
143
|
finalizers = await initialize()
|
140
144
|
|
145
|
+
await start_processing_engine()
|
141
146
|
metrics_server = await serve_metrics()
|
142
147
|
grpc_health_finalizer = await health.start_grpc_health_service(settings.grpc_port)
|
143
148
|
consumer = await consumer_service.start_ingest_processed_consumer(SERVICE_NAME)
|
144
149
|
|
145
150
|
await run_until_exit(
|
146
|
-
[grpc_health_finalizer, consumer, metrics_server.shutdown] + finalizers
|
151
|
+
[grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
|
147
152
|
)
|
148
153
|
|
149
154
|
|
@@ -181,10 +186,9 @@ async def main_subscriber_workers(): # pragma: no cover
|
|
181
186
|
|
182
187
|
def setup_configuration(): # pragma: no cover
|
183
188
|
setup_logging()
|
184
|
-
|
185
189
|
assign_partitions(settings)
|
186
190
|
|
187
|
-
errors.setup_error_handling(
|
191
|
+
errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
188
192
|
|
189
193
|
if asyncio._get_running_loop() is not None:
|
190
194
|
raise RuntimeError("cannot be called from a running event loop")
|
@@ -23,14 +23,11 @@ import logging
|
|
23
23
|
import uuid
|
24
24
|
from functools import partial
|
25
25
|
|
26
|
-
from
|
27
|
-
|
26
|
+
from nucliadb.common import datamanagers
|
28
27
|
from nucliadb.common.cluster.exceptions import ShardsNotFound
|
29
28
|
from nucliadb.common.cluster.manager import choose_node
|
30
29
|
from nucliadb.common.cluster.utils import get_shard_manager
|
31
|
-
from nucliadb.common.
|
32
|
-
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
33
|
-
from nucliadb.ingest.orm.resource import Resource
|
30
|
+
from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
|
34
31
|
from nucliadb_protos import audit_pb2, nodereader_pb2, noderesources_pb2, writer_pb2
|
35
32
|
from nucliadb_utils import const
|
36
33
|
from nucliadb_utils.audit.audit import AuditStorage
|
@@ -62,12 +59,10 @@ class IndexAuditHandler:
|
|
62
59
|
def __init__(
|
63
60
|
self,
|
64
61
|
*,
|
65
|
-
driver: Driver,
|
66
62
|
audit: AuditStorage,
|
67
63
|
pubsub: PubSubDriver,
|
68
64
|
check_delay: float = 5.0,
|
69
65
|
):
|
70
|
-
self.driver = driver
|
71
66
|
self.audit = audit
|
72
67
|
self.pubsub = pubsub
|
73
68
|
self.shard_manager = get_shard_manager()
|
@@ -98,17 +93,15 @@ class IndexAuditHandler:
|
|
98
93
|
metrics.total_messages.inc({"action": "ignored", "type": "audit_counter"})
|
99
94
|
return
|
100
95
|
|
101
|
-
self.task_handler.schedule(
|
102
|
-
notification.kbid, partial(self.process_kb, notification.kbid)
|
103
|
-
)
|
96
|
+
self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
|
104
97
|
metrics.total_messages.inc({"action": "scheduled", "type": "audit_counter"})
|
105
98
|
|
106
99
|
@metrics.handler_histo.wrap({"type": "audit_counter"})
|
107
100
|
async def process_kb(self, kbid: str) -> None:
|
108
101
|
try:
|
109
|
-
shard_groups: list[
|
110
|
-
|
111
|
-
|
102
|
+
shard_groups: list[writer_pb2.ShardObject] = await self.shard_manager.get_shards_by_kbid(
|
103
|
+
kbid
|
104
|
+
)
|
112
105
|
except ShardsNotFound:
|
113
106
|
logger.warning(f"No shards found for kbid {kbid}, skipping")
|
114
107
|
return
|
@@ -119,7 +112,8 @@ class IndexAuditHandler:
|
|
119
112
|
total_paragraphs = 0
|
120
113
|
|
121
114
|
for shard_obj in shard_groups:
|
122
|
-
node,
|
115
|
+
# TODO: Uses node for auditing, don't want to suddenly change metrics
|
116
|
+
node, shard_id = choose_node(shard_obj, use_nidx=False)
|
123
117
|
shard: nodereader_pb2.Shard = await node.reader.GetShard(
|
124
118
|
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
|
125
119
|
)
|
@@ -127,12 +121,18 @@ class IndexAuditHandler:
|
|
127
121
|
total_fields += shard.fields
|
128
122
|
total_paragraphs += shard.paragraphs
|
129
123
|
|
130
|
-
|
124
|
+
async with datamanagers.with_ro_transaction() as txn:
|
125
|
+
num_vectorsets = (
|
126
|
+
len([vs async for vs in datamanagers.vectorsets.iter(txn=txn, kbid=kbid)]) or 1
|
127
|
+
)
|
128
|
+
|
129
|
+
self.audit.report_storage(
|
131
130
|
kbid=kbid,
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
131
|
+
paragraphs=total_paragraphs,
|
132
|
+
fields=total_fields,
|
133
|
+
bytes=total_paragraphs # This is an estimation of bytes stored in a KB
|
134
|
+
* AVG_PARAGRAPH_SIZE_BYTES
|
135
|
+
* num_vectorsets,
|
136
136
|
)
|
137
137
|
|
138
138
|
|
@@ -147,12 +147,10 @@ class ResourceWritesAuditHandler:
|
|
147
147
|
def __init__(
|
148
148
|
self,
|
149
149
|
*,
|
150
|
-
driver: Driver,
|
151
150
|
storage: Storage,
|
152
151
|
audit: AuditStorage,
|
153
152
|
pubsub: PubSubDriver,
|
154
153
|
):
|
155
|
-
self.driver = driver
|
156
154
|
self.storage = storage
|
157
155
|
self.audit = audit
|
158
156
|
self.pubsub = pubsub
|
@@ -169,117 +167,6 @@ class ResourceWritesAuditHandler:
|
|
169
167
|
async def finalize(self) -> None:
|
170
168
|
await self.pubsub.unsubscribe(self.subscription_id)
|
171
169
|
|
172
|
-
def iterate_auditable_fields(
|
173
|
-
self,
|
174
|
-
resource_keys: list[tuple[FieldType.ValueType, str]],
|
175
|
-
message: writer_pb2.BrokerMessage,
|
176
|
-
):
|
177
|
-
"""
|
178
|
-
Generator that emits the combined list of field ids from both
|
179
|
-
the existing resource and message that needs to be considered
|
180
|
-
in the audit of fields.
|
181
|
-
"""
|
182
|
-
yielded = set()
|
183
|
-
|
184
|
-
# Include all fields present in the message we are processing
|
185
|
-
for field_id in message.files.keys():
|
186
|
-
key = (field_id, writer_pb2.FieldType.FILE)
|
187
|
-
yield key
|
188
|
-
yielded.add(key)
|
189
|
-
|
190
|
-
for field_id in message.conversations.keys():
|
191
|
-
key = (field_id, writer_pb2.FieldType.CONVERSATION)
|
192
|
-
yield key
|
193
|
-
yielded.add(key)
|
194
|
-
|
195
|
-
for field_id in message.layouts.keys():
|
196
|
-
key = (field_id, writer_pb2.FieldType.LAYOUT)
|
197
|
-
yield key
|
198
|
-
yielded.add(key)
|
199
|
-
|
200
|
-
for field_id in message.texts.keys():
|
201
|
-
key = (field_id, writer_pb2.FieldType.TEXT)
|
202
|
-
yield key
|
203
|
-
yielded.add(key)
|
204
|
-
|
205
|
-
for field_id in message.keywordsets.keys():
|
206
|
-
key = (field_id, writer_pb2.FieldType.KEYWORDSET)
|
207
|
-
yield key
|
208
|
-
yielded.add(key)
|
209
|
-
|
210
|
-
for field_id in message.datetimes.keys():
|
211
|
-
key = (field_id, writer_pb2.FieldType.DATETIME)
|
212
|
-
yield key
|
213
|
-
yielded.add(key)
|
214
|
-
|
215
|
-
for field_id in message.links.keys():
|
216
|
-
key = (field_id, writer_pb2.FieldType.LINK)
|
217
|
-
yield key
|
218
|
-
yielded.add(key)
|
219
|
-
|
220
|
-
for field_type, field_id in resource_keys:
|
221
|
-
if field_type is writer_pb2.FieldType.GENERIC:
|
222
|
-
continue
|
223
|
-
|
224
|
-
if not (
|
225
|
-
field_id in message.files
|
226
|
-
or message.type is writer_pb2.BrokerMessage.MessageType.DELETE
|
227
|
-
):
|
228
|
-
continue
|
229
|
-
|
230
|
-
# Avoid duplicates
|
231
|
-
if (field_type, field_id) in yielded:
|
232
|
-
continue
|
233
|
-
|
234
|
-
yield (field_id, field_type)
|
235
|
-
|
236
|
-
async def collect_audit_fields(
|
237
|
-
self, message: writer_pb2.BrokerMessage
|
238
|
-
) -> list[audit_pb2.AuditField]:
|
239
|
-
if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
|
240
|
-
# If we are fully deleting a resource we won't iterate the delete_fields (if any).
|
241
|
-
# Make no sense as we already collected all resource fields as deleted
|
242
|
-
return []
|
243
|
-
|
244
|
-
audit_storage_fields: list[audit_pb2.AuditField] = []
|
245
|
-
async with self.driver.transaction() as txn:
|
246
|
-
kb = KnowledgeBox(txn, self.storage, message.kbid)
|
247
|
-
resource = Resource(txn, self.storage, kb, message.uuid)
|
248
|
-
field_keys = await resource.get_fields_ids()
|
249
|
-
|
250
|
-
for field_id, field_type in self.iterate_auditable_fields(
|
251
|
-
field_keys, message
|
252
|
-
):
|
253
|
-
auditfield = audit_pb2.AuditField()
|
254
|
-
auditfield.field_type = field_type
|
255
|
-
auditfield.field_id = field_id
|
256
|
-
if field_type is writer_pb2.FieldType.FILE:
|
257
|
-
auditfield.filename = message.files[field_id].file.filename
|
258
|
-
# The field did exist, so we are overwriting it, with a modified file
|
259
|
-
# in case of a file
|
260
|
-
auditfield.action = audit_pb2.AuditField.FieldAction.MODIFIED
|
261
|
-
if field_type is writer_pb2.FieldType.FILE:
|
262
|
-
auditfield.size = message.files[field_id].file.size
|
263
|
-
|
264
|
-
audit_storage_fields.append(auditfield)
|
265
|
-
|
266
|
-
for fieldid in message.delete_fields or []:
|
267
|
-
field = await resource.get_field(
|
268
|
-
fieldid.field, writer_pb2.FieldType.FILE, load=True
|
269
|
-
)
|
270
|
-
audit_field = audit_pb2.AuditField()
|
271
|
-
audit_field.action = audit_pb2.AuditField.FieldAction.DELETED
|
272
|
-
audit_field.field_id = fieldid.field
|
273
|
-
audit_field.field_type = fieldid.field_type
|
274
|
-
if fieldid.field_type is writer_pb2.FieldType.FILE:
|
275
|
-
val = await field.get_value()
|
276
|
-
audit_field.size = 0
|
277
|
-
if val is not None:
|
278
|
-
audit_field.filename = val.file.filename
|
279
|
-
audit_storage_fields.append(audit_field)
|
280
|
-
|
281
|
-
return audit_storage_fields
|
282
|
-
|
283
170
|
async def handle_message(self, raw_data) -> None:
|
284
171
|
data = self.pubsub.parse(raw_data)
|
285
172
|
notification = writer_pb2.Notification()
|
@@ -289,27 +176,23 @@ class ResourceWritesAuditHandler:
|
|
289
176
|
metrics.total_messages.inc({"action": "ignored", "type": "audit_fields"})
|
290
177
|
return
|
291
178
|
|
292
|
-
|
293
|
-
if
|
179
|
+
message_audit: writer_pb2.Audit = notification.message_audit
|
180
|
+
if message_audit.message_source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
294
181
|
metrics.total_messages.inc({"action": "ignored", "type": "audit_fields"})
|
295
182
|
return
|
296
183
|
|
297
|
-
logger.info(
|
298
|
-
{"message": "Processing field audit for kbid", "kbid": notification.kbid}
|
299
|
-
)
|
184
|
+
logger.info({"message": "Processing field audit for kbid", "kbid": notification.kbid})
|
300
185
|
|
301
186
|
metrics.total_messages.inc({"action": "scheduled", "type": "audit_fields"})
|
302
187
|
with metrics.handler_histo({"type": "audit_fields"}):
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
await self.audit.report(
|
307
|
-
kbid=message.kbid,
|
188
|
+
when = message_audit.when if message_audit.HasField("when") else None
|
189
|
+
self.audit.report_and_send(
|
190
|
+
kbid=message_audit.kbid,
|
308
191
|
when=when,
|
309
|
-
user=
|
310
|
-
rid=
|
311
|
-
origin=
|
312
|
-
field_metadata=field_metadata,
|
192
|
+
user=message_audit.user,
|
193
|
+
rid=message_audit.uuid,
|
194
|
+
origin=message_audit.origin,
|
195
|
+
field_metadata=list(message_audit.field_metadata),
|
313
196
|
audit_type=AUDIT_TYPES.get(notification.write_type),
|
314
|
-
audit_fields=audit_fields,
|
197
|
+
audit_fields=list(message_audit.audit_fields),
|
315
198
|
)
|