nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -0,0 +1,752 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
import asyncio
|
21
|
+
import logging
|
22
|
+
from typing import Optional
|
23
|
+
|
24
|
+
import aiohttp.client_exceptions
|
25
|
+
import nats.errors
|
26
|
+
import nats.js.errors
|
27
|
+
|
28
|
+
from nucliadb.common import datamanagers, locking
|
29
|
+
from nucliadb.common.cluster.settings import settings as cluster_settings
|
30
|
+
from nucliadb.common.cluster.utils import get_shard_manager
|
31
|
+
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
32
|
+
from nucliadb.common.external_index_providers.manager import get_external_index_manager
|
33
|
+
from nucliadb.common.maindb.driver import Driver, Transaction
|
34
|
+
from nucliadb.common.maindb.exceptions import ConflictError, MaindbServerError
|
35
|
+
from nucliadb.ingest.orm.exceptions import (
|
36
|
+
DeadletteredError,
|
37
|
+
InvalidBrokerMessage,
|
38
|
+
ResourceNotIndexable,
|
39
|
+
SequenceOrderViolation,
|
40
|
+
)
|
41
|
+
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
42
|
+
from nucliadb.ingest.orm.metrics import processor_observer
|
43
|
+
from nucliadb.ingest.orm.processor import sequence_manager
|
44
|
+
from nucliadb.ingest.orm.processor.auditing import collect_audit_fields
|
45
|
+
from nucliadb.ingest.orm.processor.data_augmentation import (
|
46
|
+
get_generated_fields,
|
47
|
+
send_generated_fields_to_process,
|
48
|
+
)
|
49
|
+
from nucliadb.ingest.orm.resource import Resource
|
50
|
+
from nucliadb_protos import (
|
51
|
+
knowledgebox_pb2,
|
52
|
+
noderesources_pb2,
|
53
|
+
nodewriter_pb2,
|
54
|
+
resources_pb2,
|
55
|
+
writer_pb2,
|
56
|
+
)
|
57
|
+
from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
|
58
|
+
from nucliadb_telemetry import errors
|
59
|
+
from nucliadb_utils import const
|
60
|
+
from nucliadb_utils.cache.pubsub import PubSubDriver
|
61
|
+
from nucliadb_utils.storages.storage import Storage
|
62
|
+
from nucliadb_utils.utilities import get_storage, has_feature
|
63
|
+
|
64
|
+
from .pgcatalog import pgcatalog_delete, pgcatalog_update
|
65
|
+
|
66
|
+
logger = logging.getLogger("ingest-processor")
|
67
|
+
|
68
|
+
MESSAGE_TO_NOTIFICATION_SOURCE = {
|
69
|
+
writer_pb2.BrokerMessage.MessageSource.WRITER: writer_pb2.NotificationSource.WRITER,
|
70
|
+
writer_pb2.BrokerMessage.MessageSource.PROCESSOR: writer_pb2.NotificationSource.PROCESSOR,
|
71
|
+
}
|
72
|
+
|
73
|
+
|
74
|
+
def validate_indexable_resource(resource: noderesources_pb2.Resource) -> None:
|
75
|
+
"""
|
76
|
+
It would be more optimal to move this to another layer but it'd also make the code
|
77
|
+
more difficult to grok and test because we'd need to move processable check and throw
|
78
|
+
an exception in the middle of a bunch of processing logic.
|
79
|
+
|
80
|
+
As it is implemented right now, we just do the check if a resource is indexable right
|
81
|
+
before we actually try to index it and not buried it somewhere else in the code base.
|
82
|
+
|
83
|
+
This is still an edge case.
|
84
|
+
"""
|
85
|
+
num_paragraphs = 0
|
86
|
+
for _, fparagraph in resource.paragraphs.items():
|
87
|
+
# this count should not be very expensive to do since we don't have
|
88
|
+
# a lot of different fields and we just do a count on a dict
|
89
|
+
num_paragraphs += len(fparagraph.paragraphs)
|
90
|
+
|
91
|
+
if num_paragraphs > cluster_settings.max_resource_paragraphs:
|
92
|
+
raise ResourceNotIndexable(
|
93
|
+
"Resource has too many paragraphs. "
|
94
|
+
f"Supported: {cluster_settings.max_resource_paragraphs} , Number: {num_paragraphs}"
|
95
|
+
)
|
96
|
+
|
97
|
+
|
98
|
+
class Processor:
|
99
|
+
"""
|
100
|
+
This class is responsible for processing messages from the broker
|
101
|
+
and attempts to manage sequencing correctly with a txn id implementation.
|
102
|
+
|
103
|
+
The "txn" in this implementation is oriented around the sequence id of
|
104
|
+
messages coming through the message broker.
|
105
|
+
|
106
|
+
Not all writes are going to have a transaction id. For example, writes
|
107
|
+
coming from processor can be coming through a different channel
|
108
|
+
and can not use the txn id
|
109
|
+
"""
|
110
|
+
|
111
|
+
messages: dict[str, list[writer_pb2.BrokerMessage]]
|
112
|
+
|
113
|
+
def __init__(
|
114
|
+
self,
|
115
|
+
driver: Driver,
|
116
|
+
storage: Storage,
|
117
|
+
pubsub: Optional[PubSubDriver] = None,
|
118
|
+
partition: Optional[str] = None,
|
119
|
+
):
|
120
|
+
self.messages = {}
|
121
|
+
self.driver = driver
|
122
|
+
self.storage = storage
|
123
|
+
self.partition = partition
|
124
|
+
self.pubsub = pubsub
|
125
|
+
self.index_node_shard_manager = get_shard_manager()
|
126
|
+
|
127
|
+
async def process(
|
128
|
+
self,
|
129
|
+
message: writer_pb2.BrokerMessage,
|
130
|
+
seqid: int,
|
131
|
+
partition: Optional[str] = None,
|
132
|
+
transaction_check: bool = True,
|
133
|
+
) -> None:
|
134
|
+
partition = partition if self.partition is None else self.partition
|
135
|
+
if partition is None:
|
136
|
+
raise AttributeError("Can't process message from unknown partition")
|
137
|
+
|
138
|
+
# When running in transactional mode, we need to check that
|
139
|
+
# that the current message doesn't violate the sequence order for the
|
140
|
+
# current partition
|
141
|
+
if transaction_check:
|
142
|
+
last_seqid = await sequence_manager.get_last_seqid(self.driver, partition)
|
143
|
+
if last_seqid is not None and seqid <= last_seqid:
|
144
|
+
raise SequenceOrderViolation(last_seqid)
|
145
|
+
|
146
|
+
if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
|
147
|
+
await self.delete_resource(message, seqid, partition, transaction_check)
|
148
|
+
elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
|
149
|
+
await self.txn([message], seqid, partition, transaction_check)
|
150
|
+
elif message.type == writer_pb2.BrokerMessage.MessageType.MULTI:
|
151
|
+
# XXX Not supported right now
|
152
|
+
# MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
|
153
|
+
# This concept is probably not tenable with current architecture because
|
154
|
+
# of how nats works and how we would need to manage rollbacks.
|
155
|
+
# XXX Should this be removed?
|
156
|
+
await self.multi(message, seqid)
|
157
|
+
elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
|
158
|
+
await self.commit(message, seqid, partition)
|
159
|
+
elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
|
160
|
+
await self.rollback(message, seqid, partition)
|
161
|
+
|
162
|
+
async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
|
163
|
+
if message.uuid is None:
|
164
|
+
uuid = await kb.get_resource_uuid_by_slug(message.slug)
|
165
|
+
else:
|
166
|
+
uuid = message.uuid
|
167
|
+
return uuid
|
168
|
+
|
169
|
+
@processor_observer.wrap({"type": "delete_resource"})
|
170
|
+
async def delete_resource(
|
171
|
+
self,
|
172
|
+
message: writer_pb2.BrokerMessage,
|
173
|
+
seqid: int,
|
174
|
+
partition: str,
|
175
|
+
transaction_check: bool = True,
|
176
|
+
) -> None:
|
177
|
+
async with self.driver.transaction() as txn:
|
178
|
+
try:
|
179
|
+
kb = KnowledgeBox(txn, self.storage, message.kbid)
|
180
|
+
|
181
|
+
uuid = await self.get_resource_uuid(kb, message)
|
182
|
+
async with locking.distributed_lock(
|
183
|
+
locking.RESOURCE_INDEX_LOCK.format(kbid=message.kbid, resource_id=uuid)
|
184
|
+
):
|
185
|
+
# we need to have a lock at indexing time because we don't know if
|
186
|
+
# a resource was in the process of being moved when a delete occurred
|
187
|
+
shard_id = await datamanagers.resources.get_resource_shard_id(
|
188
|
+
txn, kbid=message.kbid, rid=uuid
|
189
|
+
)
|
190
|
+
if shard_id is None:
|
191
|
+
logger.warning(f"Resource {uuid} does not exist")
|
192
|
+
else:
|
193
|
+
shard = await kb.get_resource_shard(shard_id)
|
194
|
+
if shard is None:
|
195
|
+
raise AttributeError("Shard not available")
|
196
|
+
await pgcatalog_delete(txn, message.kbid, uuid)
|
197
|
+
external_index_manager = await get_external_index_manager(kbid=message.kbid)
|
198
|
+
if external_index_manager is not None:
|
199
|
+
await self.external_index_delete_resource(external_index_manager, uuid)
|
200
|
+
else:
|
201
|
+
await self.index_node_shard_manager.delete_resource(
|
202
|
+
shard, message.uuid, seqid, partition, message.kbid
|
203
|
+
)
|
204
|
+
try:
|
205
|
+
await kb.delete_resource(message.uuid)
|
206
|
+
except Exception as exc:
|
207
|
+
await txn.abort()
|
208
|
+
await self.notify_abort(
|
209
|
+
partition=partition,
|
210
|
+
seqid=seqid,
|
211
|
+
multi=message.multiid,
|
212
|
+
kbid=message.kbid,
|
213
|
+
rid=message.uuid,
|
214
|
+
source=message.source,
|
215
|
+
)
|
216
|
+
raise exc
|
217
|
+
finally:
|
218
|
+
if txn.open:
|
219
|
+
if transaction_check:
|
220
|
+
await sequence_manager.set_last_seqid(txn, partition, seqid)
|
221
|
+
await txn.commit()
|
222
|
+
await self.notify_commit(
|
223
|
+
partition=partition,
|
224
|
+
seqid=seqid,
|
225
|
+
multi=message.multiid,
|
226
|
+
message=message,
|
227
|
+
write_type=writer_pb2.Notification.WriteType.DELETED,
|
228
|
+
)
|
229
|
+
|
230
|
+
@processor_observer.wrap({"type": "commit_slug"})
|
231
|
+
async def commit_slug(self, resource: Resource) -> None:
|
232
|
+
# Slug may have conflicts as its not partitioned properly,
|
233
|
+
# so we commit it in a different transaction to make it as short as possible
|
234
|
+
prev_txn = resource.txn
|
235
|
+
try:
|
236
|
+
async with self.driver.transaction() as txn:
|
237
|
+
resource.txn = txn
|
238
|
+
await resource.set_slug()
|
239
|
+
await txn.commit()
|
240
|
+
finally:
|
241
|
+
resource.txn = prev_txn
|
242
|
+
|
243
|
+
@processor_observer.wrap({"type": "txn"})
|
244
|
+
async def txn(
|
245
|
+
self,
|
246
|
+
messages: list[writer_pb2.BrokerMessage],
|
247
|
+
seqid: int,
|
248
|
+
partition: str,
|
249
|
+
transaction_check: bool = True,
|
250
|
+
) -> None:
|
251
|
+
if len(messages) == 0:
|
252
|
+
return None
|
253
|
+
|
254
|
+
kbid = messages[0].kbid
|
255
|
+
if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
|
256
|
+
logger.info(f"KB {kbid} is deleted: skiping txn")
|
257
|
+
if transaction_check:
|
258
|
+
async with datamanagers.with_rw_transaction() as txn:
|
259
|
+
await sequence_manager.set_last_seqid(txn, partition, seqid)
|
260
|
+
await txn.commit()
|
261
|
+
return None
|
262
|
+
|
263
|
+
async with self.driver.transaction() as txn:
|
264
|
+
try:
|
265
|
+
multi = messages[0].multiid
|
266
|
+
kb = KnowledgeBox(txn, self.storage, kbid)
|
267
|
+
uuid = await self.get_resource_uuid(kb, messages[0])
|
268
|
+
resource: Optional[Resource] = None
|
269
|
+
handled_exception = None
|
270
|
+
created = False
|
271
|
+
|
272
|
+
for message in messages:
|
273
|
+
if resource is not None:
|
274
|
+
assert resource.uuid == message.uuid
|
275
|
+
|
276
|
+
if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
|
277
|
+
resource = await kb.get(uuid)
|
278
|
+
if resource is None:
|
279
|
+
# It's a new resource
|
280
|
+
resource = await kb.add_resource(uuid, message.slug, message.basic)
|
281
|
+
created = True
|
282
|
+
else:
|
283
|
+
# It's an update from writer for an existing resource
|
284
|
+
...
|
285
|
+
|
286
|
+
elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
287
|
+
resource = await kb.get(uuid)
|
288
|
+
if resource is None:
|
289
|
+
logger.info(
|
290
|
+
f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
|
291
|
+
)
|
292
|
+
continue
|
293
|
+
else:
|
294
|
+
# It's an update from processor for an existing resource
|
295
|
+
...
|
296
|
+
|
297
|
+
generated_fields = await get_generated_fields(message, resource)
|
298
|
+
if generated_fields.is_not_empty():
|
299
|
+
await send_generated_fields_to_process(
|
300
|
+
kbid, resource, generated_fields, message
|
301
|
+
)
|
302
|
+
# TODO: remove this when processor sends the field set
|
303
|
+
for generated_text in generated_fields.texts:
|
304
|
+
message.texts[
|
305
|
+
generated_text
|
306
|
+
].generated_by.data_augmentation.SetInParent()
|
307
|
+
|
308
|
+
else:
|
309
|
+
raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")
|
310
|
+
|
311
|
+
# apply changes from the broker message to the resource
|
312
|
+
await self.apply_resource(message, resource, update=(not created))
|
313
|
+
|
314
|
+
# index message
|
315
|
+
|
316
|
+
if resource:
|
317
|
+
await resource.compute_global_text()
|
318
|
+
await resource.compute_global_tags(resource.indexer)
|
319
|
+
await resource.compute_security(resource.indexer)
|
320
|
+
if message.reindex:
|
321
|
+
# when reindexing, let's just generate full new index message
|
322
|
+
resource.replace_indexer(await resource.generate_index_message(reindex=True))
|
323
|
+
|
324
|
+
if resource and resource.modified:
|
325
|
+
await pgcatalog_update(txn, kbid, resource)
|
326
|
+
await self.index_resource( # noqa
|
327
|
+
resource=resource,
|
328
|
+
txn=txn,
|
329
|
+
uuid=uuid,
|
330
|
+
kbid=kbid,
|
331
|
+
seqid=seqid,
|
332
|
+
partition=partition,
|
333
|
+
kb=kb,
|
334
|
+
source=messages_source(messages),
|
335
|
+
)
|
336
|
+
if transaction_check:
|
337
|
+
await sequence_manager.set_last_seqid(txn, partition, seqid)
|
338
|
+
await txn.commit()
|
339
|
+
|
340
|
+
if created:
|
341
|
+
await self.commit_slug(resource)
|
342
|
+
|
343
|
+
await self.notify_commit(
|
344
|
+
partition=partition,
|
345
|
+
seqid=seqid,
|
346
|
+
multi=multi,
|
347
|
+
message=message,
|
348
|
+
write_type=(
|
349
|
+
writer_pb2.Notification.WriteType.CREATED
|
350
|
+
if created
|
351
|
+
else writer_pb2.Notification.WriteType.MODIFIED
|
352
|
+
),
|
353
|
+
)
|
354
|
+
elif resource and resource.modified is False:
|
355
|
+
await txn.abort()
|
356
|
+
await self.notify_abort(
|
357
|
+
partition=partition,
|
358
|
+
seqid=seqid,
|
359
|
+
multi=multi,
|
360
|
+
kbid=kbid,
|
361
|
+
rid=uuid,
|
362
|
+
source=message.source,
|
363
|
+
)
|
364
|
+
logger.warning("This message did not modify the resource")
|
365
|
+
except (
|
366
|
+
asyncio.TimeoutError,
|
367
|
+
asyncio.CancelledError,
|
368
|
+
aiohttp.client_exceptions.ClientError,
|
369
|
+
ConflictError,
|
370
|
+
MaindbServerError,
|
371
|
+
nats.errors.NoRespondersError,
|
372
|
+
nats.js.errors.NoStreamResponseError,
|
373
|
+
): # pragma: no cover
|
374
|
+
# Unhandled exceptions here that should bubble and hard fail
|
375
|
+
# XXX We swallow too many exceptions here!
|
376
|
+
await self.notify_abort(
|
377
|
+
partition=partition,
|
378
|
+
seqid=seqid,
|
379
|
+
multi=multi,
|
380
|
+
kbid=kbid,
|
381
|
+
rid=uuid,
|
382
|
+
source=message.source,
|
383
|
+
)
|
384
|
+
raise
|
385
|
+
except Exception as exc:
|
386
|
+
# As we are in the middle of a transaction, we cannot let the exception raise directly
|
387
|
+
# as we need to do some cleanup. The exception will be reraised at the end of the function
|
388
|
+
# and then handled by the top caller, so errors can be handled in the same place.
|
389
|
+
await self.deadletter(messages, partition, seqid)
|
390
|
+
await self.notify_abort(
|
391
|
+
partition=partition,
|
392
|
+
seqid=seqid,
|
393
|
+
multi=multi,
|
394
|
+
kbid=kbid,
|
395
|
+
rid=uuid,
|
396
|
+
source=message.source,
|
397
|
+
)
|
398
|
+
handled_exception = exc
|
399
|
+
finally:
|
400
|
+
if resource is not None:
|
401
|
+
resource.clean()
|
402
|
+
# txn should be already commited or aborted, but in the event of an exception
|
403
|
+
# it could be left open. Make sure to close it if it's still open
|
404
|
+
if txn.open:
|
405
|
+
await txn.abort()
|
406
|
+
|
407
|
+
if handled_exception is not None:
|
408
|
+
if seqid == -1:
|
409
|
+
raise handled_exception
|
410
|
+
else:
|
411
|
+
if resource is not None:
|
412
|
+
await self._mark_resource_error(kb, resource, partition, seqid)
|
413
|
+
raise DeadletteredError() from handled_exception
|
414
|
+
|
415
|
+
return None
|
416
|
+
|
417
|
+
async def get_or_assign_resource_shard(
    self, txn: Transaction, kb: KnowledgeBox, uuid: str
) -> writer_pb2.ShardObject:
    """Return the shard a resource lives on, assigning one if it is new.

    Raises:
        AttributeError: the resource has a shard id assigned but the shard
            object cannot be resolved.
    """
    kbid = kb.kbid
    lock_key = locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=uuid)
    # We need a lock at indexing time because we don't know if the resource
    # was moved to another shard while it was being indexed.
    async with locking.distributed_lock(lock_key):
        assigned_shard_id = await datamanagers.resources.get_resource_shard_id(
            txn, kbid=kbid, rid=uuid
        )

        if assigned_shard_id is not None:
            # Resource already has a shard assigned; it must be resolvable.
            existing_shard = await kb.get_resource_shard(assigned_shard_id)
            if existing_shard is None:
                raise AttributeError("Shard not available")
            return existing_shard

        # New resource: place it on the KB's current active shard, creating
        # a fresh shard when none is available.
        shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
        if shard is None:
            shard = await self.index_node_shard_manager.create_shard_by_kbid(txn, kbid)
        await datamanagers.resources.set_resource_shard_id(
            txn, kbid=kbid, rid=uuid, shard=shard.shard
        )
        return shard
|
445
|
+
@processor_observer.wrap({"type": "index_resource"})
|
446
|
+
async def index_resource(
|
447
|
+
self,
|
448
|
+
resource: Resource,
|
449
|
+
txn: Transaction,
|
450
|
+
uuid: str,
|
451
|
+
kbid: str,
|
452
|
+
seqid: int,
|
453
|
+
partition: str,
|
454
|
+
kb: KnowledgeBox,
|
455
|
+
source: nodewriter_pb2.IndexMessageSource.ValueType,
|
456
|
+
) -> None:
|
457
|
+
validate_indexable_resource(resource.indexer.brain)
|
458
|
+
shard = await self.get_or_assign_resource_shard(txn, kb, uuid)
|
459
|
+
index_message = resource.indexer.brain
|
460
|
+
external_index_manager = await get_external_index_manager(kbid=kbid)
|
461
|
+
if external_index_manager is not None:
|
462
|
+
await self.external_index_add_resource(external_index_manager, uuid, index_message)
|
463
|
+
else:
|
464
|
+
await self.index_node_shard_manager.add_resource(
|
465
|
+
shard,
|
466
|
+
index_message,
|
467
|
+
seqid,
|
468
|
+
partition=partition,
|
469
|
+
kb=kbid,
|
470
|
+
source=source,
|
471
|
+
)
|
472
|
+
|
473
|
+
async def external_index_delete_resource(
    self, external_index_manager: ExternalIndexManager, resource_uuid: str
):
    """Delete a resource from the external index, unless the skip feature
    flag is enabled for this KB/provider."""
    if self.should_skip_external_index(external_index_manager):
        log_extra = {
            "kbid": external_index_manager.kbid,
            "rid": resource_uuid,
            "provider": external_index_manager.type.value,
        }
        logger.warning("Skipping external index delete resource", extra=log_extra)
        return
    await external_index_manager.delete_resource(resource_uuid=resource_uuid)
|
488
|
+
def should_skip_external_index(self, external_index_manager: ExternalIndexManager) -> bool:
    """
    This is a safety measure to skip external indexing in case that the external index provider is not working.
    As we don't want to block the ingestion pipeline, this is a temporary measure until we implement async consumers
    to index to external indexes.
    """
    feature_context = {
        "kbid": external_index_manager.kbid,
        "provider": external_index_manager.type.value,
    }
    return has_feature(
        const.Features.SKIP_EXTERNAL_INDEX,
        context=feature_context,
        default=False,
    )
|
502
|
+
async def external_index_add_resource(
    self,
    external_index_manager: ExternalIndexManager,
    resource_uuid: str,
    index_message: PBBrainResource,
):
    """Index a resource in the external index.

    No-op when the index message carries no vector operations or when the
    skip feature flag is enabled for this KB/provider.
    """
    if not has_vectors_operation(index_message):
        # Nothing vector-related to index or delete for this message.
        return
    if self.should_skip_external_index(external_index_manager):
        log_extra = {
            "kbid": external_index_manager.kbid,
            "rid": resource_uuid,
            "provider": external_index_manager.type.value,
        }
        logger.warning("Skipping external index for resource", extra=log_extra)
        return
    await external_index_manager.index_resource(
        resource_uuid=resource_uuid, resource_data=index_message
    )
|
524
|
+
async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
    """Buffer one message of a multi-message group until commit/rollback."""
    bucket = self.messages.setdefault(message.multiid, [])
    bucket.append(message)
|
527
|
+
async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
    """Close a multi-message group: process it if known, deadletter otherwise."""
    if message.multiid not in self.messages:
        # Error: the group was never opened (or was already closed).
        logger.error(f"Closed multi {message.multiid}")
        await self.deadletter([message], partition, seqid)
        return
    await self.txn(self.messages[message.multiid], seqid, partition)
|
535
|
+
async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
    """Discard a buffered multi-message group and publish an abort notification."""
    # Error path: the group is dropped without processing.
    logger.error(f"Closed multi {message.multiid}")
    self.messages.pop(message.multiid)
    await self.notify_abort(
        partition=partition,
        seqid=seqid,
        multi=message.multiid,
        kbid=message.kbid,
        rid=message.uuid,
        source=message.source,
    )
|
548
|
+
async def deadletter(
    self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
) -> None:
    """Store every message of a failed batch in deadletter storage,
    preserving their position within the batch."""
    for position, failed_message in enumerate(messages):
        await self.storage.deadletter(failed_message, position, seqid, partition)
|
554
|
+
@processor_observer.wrap({"type": "apply_resource"})
|
555
|
+
async def apply_resource(
|
556
|
+
self,
|
557
|
+
message: writer_pb2.BrokerMessage,
|
558
|
+
resource: Resource,
|
559
|
+
update: bool = False,
|
560
|
+
):
|
561
|
+
"""Apply broker message to resource object in the database"""
|
562
|
+
if update:
|
563
|
+
await self.maybe_update_resource_basic(resource, message)
|
564
|
+
|
565
|
+
if message.HasField("origin"):
|
566
|
+
await resource.set_origin(message.origin)
|
567
|
+
|
568
|
+
if message.HasField("extra"):
|
569
|
+
await resource.set_extra(message.extra)
|
570
|
+
|
571
|
+
if message.HasField("security"):
|
572
|
+
await resource.set_security(message.security)
|
573
|
+
|
574
|
+
await resource.apply_fields(message)
|
575
|
+
await resource.apply_extracted(message)
|
576
|
+
|
577
|
+
async def maybe_update_resource_basic(
    self, resource: Resource, message: writer_pb2.BrokerMessage
) -> None:
    """Write the resource's basic metadata only if the message changes it
    (carries a basic section or deletes fields)."""
    has_basic_update = message.HasField("basic")
    has_field_deletions = len(message.delete_fields) > 0
    if not has_basic_update and not has_field_deletions:
        # Nothing to persist.
        return

    await resource.set_basic(
        message.basic,
        deleted_fields=message.delete_fields,  # type: ignore
    )
|
590
|
+
async def get_extended_audit_data(self, message: writer_pb2.BrokerMessage) -> writer_pb2.Audit:
    """Build the audit payload for a message, enriching the embedded audit
    section with message context and collected field audits."""
    audit = writer_pb2.Audit()
    audit.CopyFrom(message.audit)
    audit.kbid = message.kbid
    audit.uuid = message.uuid
    audit.message_source = message.source
    audit.field_metadata.extend(fcmw.field for fcmw in message.field_metadata)
    collected = await collect_audit_fields(self.driver, self.storage, message)
    audit.audit_fields.extend(collected)
    return audit
|
601
|
+
async def notify_commit(
    self,
    *,
    partition: str,
    seqid: int,
    multi: str,
    message: writer_pb2.BrokerMessage,
    write_type: writer_pb2.Notification.WriteType.ValueType,
):
    """Publish a COMMIT notification (with extended audit data) on the
    KB's resource notification channel."""
    audit = await self.get_extended_audit_data(message)
    notification = writer_pb2.Notification(
        partition=int(partition),
        seqid=seqid,
        multi=multi,
        uuid=message.uuid,
        kbid=message.kbid,
        action=writer_pb2.Notification.Action.COMMIT,
        write_type=write_type,
        source=MESSAGE_TO_NOTIFICATION_SOURCE[message.source],
        processing_errors=len(message.errors) > 0,
        message_audit=audit,
    )
    channel = const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=message.kbid)
    await self.notify(channel, notification.SerializeToString())
|
629
|
+
async def notify_abort(
    self,
    *,
    partition: str,
    seqid: int,
    multi: str,
    kbid: str,
    rid: str,
    source: writer_pb2.BrokerMessage.MessageSource.ValueType,
):
    """Publish an ABORT notification on the KB's resource notification channel."""
    notification = writer_pb2.Notification(
        partition=int(partition),
        seqid=seqid,
        multi=multi,
        uuid=rid,
        kbid=kbid,
        action=writer_pb2.Notification.ABORT,
        source=MESSAGE_TO_NOTIFICATION_SOURCE[source],
    )
    channel = const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=kbid)
    await self.notify(channel, notification.SerializeToString())
|
653
|
+
async def notify(self, channel, payload: bytes):
    """Publish the payload on the channel, if a pubsub client is configured."""
    if self.pubsub is None:
        # Pubsub is optional; silently skip when not set up.
        return
    await self.pubsub.publish(channel, payload)
|
657
|
+
async def _mark_resource_error(
    self, kb: KnowledgeBox, resource: Optional[Resource], partition: str, seqid: int
) -> None:
    """
    Unhandled error processing, try to mark resource as error
    """
    # Best-effort: any failure here is logged and swallowed — we are already
    # on an error path and must not raise further.
    if resource is None or resource.basic is None:
        logger.info(f"Skip when resource does not even have basic metadata: {resource}")
        return
    try:
        async with self.driver.transaction() as txn:
            # Rebind the KB and resource to this fresh transaction.
            kb.txn = resource.txn = txn

            shard_id = await datamanagers.resources.get_resource_shard_id(
                txn, kbid=kb.kbid, rid=resource.uuid
            )
            shard = None
            if shard_id is not None:
                shard = await kb.get_resource_shard(shard_id)
            if shard is None:
                # Without a shard we cannot reindex the error status.
                logger.warning(
                    "Unable to mark resource as error, shard is None. "
                    "This should not happen so you did something special to get here."
                )
                return

            # Persist the ERROR status in the resource's basic metadata.
            resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
            await resource.set_basic(resource.basic)
            await txn.commit()

        # Reflect the new status in the index as well.
        resource.indexer.set_processing_status(
            basic=resource.basic, previous_status=resource._previous_status
        )
        await self.index_node_shard_manager.add_resource(
            shard, resource.indexer.brain, seqid, partition=partition, kb=kb.kbid
        )
    except Exception:
        logger.warning("Error while marking resource as error", exc_info=True)
|
696
|
+
# KB tools
# XXX: Why are these utility functions here?
async def get_kb_obj(
    self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
) -> Optional[KnowledgeBox]:
    """Resolve a KnowledgeBoxID (by uuid, falling back to slug) into a
    KnowledgeBox object, or None if it cannot be found."""
    resolved_uuid: Optional[str] = kbid.uuid
    if resolved_uuid == "":
        # No uuid provided: resolve it from the slug.
        resolved_uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)

    if resolved_uuid is None:
        return None

    if not (await datamanagers.kb.exists_kb(txn, kbid=resolved_uuid)):
        return None

    storage = await get_storage()
    return KnowledgeBox(txn, storage, resolved_uuid)
|
715
|
+
|
716
|
+
def messages_source(messages: list[writer_pb2.BrokerMessage]):
    """Derive the IndexMessageSource for a batch of broker messages.

    All messages in a txn are expected to share the same source; mixed
    batches are reported and treated as PROCESSOR.
    """
    writer = writer_pb2.BrokerMessage.MessageSource.WRITER
    processor = writer_pb2.BrokerMessage.MessageSource.PROCESSOR
    if all(message.source == writer for message in messages):
        return nodewriter_pb2.IndexMessageSource.WRITER
    if all(message.source == processor for message in messages):
        return nodewriter_pb2.IndexMessageSource.PROCESSOR
    # pragma: no cover — mixed sources should never happen within one txn
    msg = "Processor received multiple broker messages with different sources in the same txn!"
    logger.error(msg)
    errors.capture_exception(Exception(msg))
    return nodewriter_pb2.IndexMessageSource.PROCESSOR
|
734
|
+
|
735
|
+
def has_vectors_operation(index_message: PBBrainResource) -> bool:
    """
    Returns True if the index message has any vectors to index or to delete.

    Checks, in order: sentence/paragraph deletions, per-vectorset prefix
    deletions, and sentences (default or per-vectorset) in any paragraph.
    """
    if len(index_message.sentences_to_delete) > 0 or len(index_message.paragraphs_to_delete) > 0:
        return True
    # Use a lazy generator with an explicit comparison instead of
    # materializing a list of lengths inside any() (ruff C419/PERF401).
    if any(
        len(deletions.items) > 0
        for deletions in index_message.vector_prefixes_to_delete.values()
    ):
        return True
    for field_paragraphs in index_message.paragraphs.values():
        for paragraph in field_paragraphs.paragraphs.values():
            if len(paragraph.sentences) > 0:
                return True
            for vectorset_sentences in paragraph.vectorsets_sentences.values():
                if len(vectorset_sentences.sentences) > 0:
                    return True
    return False