nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -16,704 +16,6 @@
|
|
16
16
|
#
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import asyncio
|
21
|
-
import logging
|
22
|
-
from typing import Optional
|
23
|
-
|
24
|
-
import aiohttp.client_exceptions
|
25
|
-
|
26
|
-
from nucliadb.common import datamanagers, locking
|
27
|
-
from nucliadb.common.cluster.settings import settings as cluster_settings
|
28
|
-
from nucliadb.common.cluster.utils import get_shard_manager
|
29
|
-
from nucliadb.common.maindb.driver import Driver, Transaction
|
30
|
-
from nucliadb.common.maindb.exceptions import ConflictError
|
31
|
-
from nucliadb.ingest.orm.exceptions import (
|
32
|
-
DeadletteredError,
|
33
|
-
KnowledgeBoxConflict,
|
34
|
-
ResourceNotIndexable,
|
35
|
-
SequenceOrderViolation,
|
36
|
-
)
|
37
|
-
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
38
|
-
from nucliadb.ingest.orm.metrics import processor_observer
|
39
|
-
from nucliadb.ingest.orm.processor import sequence_manager
|
40
|
-
from nucliadb.ingest.orm.processor.auditing import collect_audit_fields
|
41
|
-
from nucliadb.ingest.orm.resource import Resource
|
42
|
-
from nucliadb_protos import (
|
43
|
-
knowledgebox_pb2,
|
44
|
-
noderesources_pb2,
|
45
|
-
nodewriter_pb2,
|
46
|
-
resources_pb2,
|
47
|
-
utils_pb2,
|
48
|
-
writer_pb2,
|
49
|
-
)
|
50
|
-
from nucliadb_telemetry import errors
|
51
|
-
from nucliadb_utils import const
|
52
|
-
from nucliadb_utils.cache.pubsub import PubSubDriver
|
53
|
-
from nucliadb_utils.storages.storage import Storage
|
54
|
-
from nucliadb_utils.utilities import get_storage
|
55
|
-
|
56
|
-
logger = logging.getLogger(__name__)
|
57
|
-
|
58
|
-
|
59
|
-
MESSAGE_TO_NOTIFICATION_SOURCE = {
|
60
|
-
writer_pb2.BrokerMessage.MessageSource.WRITER: writer_pb2.NotificationSource.WRITER,
|
61
|
-
writer_pb2.BrokerMessage.MessageSource.PROCESSOR: writer_pb2.NotificationSource.PROCESSOR,
|
62
|
-
}
|
63
|
-
|
64
|
-
|
65
|
-
def validate_indexable_resource(resource: noderesources_pb2.Resource) -> None:
|
66
|
-
"""
|
67
|
-
It would be more optimal to move this to another layer but it'd also make the code
|
68
|
-
more difficult to grok and test because we'd need to move processable check and throw
|
69
|
-
an exception in the middle of a bunch of processing logic.
|
70
|
-
|
71
|
-
As it is implemented right now, we just do the check if a resource is indexable right
|
72
|
-
before we actually try to index it and not buried it somewhere else in the code base.
|
73
|
-
|
74
|
-
This is still an edge case.
|
75
|
-
"""
|
76
|
-
num_paragraphs = 0
|
77
|
-
for _, fparagraph in resource.paragraphs.items():
|
78
|
-
# this count should not be very expensive to do since we don't have
|
79
|
-
# a lot of different fields and we just do a count on a dict
|
80
|
-
num_paragraphs += len(fparagraph.paragraphs)
|
81
|
-
|
82
|
-
if num_paragraphs > cluster_settings.max_resource_paragraphs:
|
83
|
-
raise ResourceNotIndexable(
|
84
|
-
"Resource has too many paragraphs. "
|
85
|
-
f"Supported: {cluster_settings.max_resource_paragraphs} , Number: {num_paragraphs}"
|
86
|
-
)
|
87
|
-
|
88
|
-
|
89
|
-
class Processor:
|
90
|
-
"""
|
91
|
-
This class is responsible for processing messages from the broker
|
92
|
-
and attempts to manage sequencing correctly with a txn id implementation.
|
93
|
-
|
94
|
-
The "txn" in this implementation is oriented around the sequence id of
|
95
|
-
messages coming through the message broker.
|
96
|
-
|
97
|
-
Not all writes are going to have a transaction id. For example, writes
|
98
|
-
coming from processor can be coming through a different channel
|
99
|
-
and can not use the txn id
|
100
|
-
"""
|
101
|
-
|
102
|
-
messages: dict[str, list[writer_pb2.BrokerMessage]]
|
103
|
-
|
104
|
-
def __init__(
|
105
|
-
self,
|
106
|
-
driver: Driver,
|
107
|
-
storage: Storage,
|
108
|
-
pubsub: Optional[PubSubDriver] = None,
|
109
|
-
partition: Optional[str] = None,
|
110
|
-
):
|
111
|
-
self.messages = {}
|
112
|
-
self.driver = driver
|
113
|
-
self.storage = storage
|
114
|
-
self.partition = partition
|
115
|
-
self.pubsub = pubsub
|
116
|
-
self.shard_manager = get_shard_manager()
|
117
|
-
|
118
|
-
async def process(
|
119
|
-
self,
|
120
|
-
message: writer_pb2.BrokerMessage,
|
121
|
-
seqid: int,
|
122
|
-
partition: Optional[str] = None,
|
123
|
-
transaction_check: bool = True,
|
124
|
-
) -> None:
|
125
|
-
partition = partition if self.partition is None else self.partition
|
126
|
-
if partition is None:
|
127
|
-
raise AttributeError("Can't process message from unknown partition")
|
128
|
-
|
129
|
-
# When running in transactional mode, we need to check that
|
130
|
-
# that the current message doesn't violate the sequence order for the
|
131
|
-
# current partition
|
132
|
-
if transaction_check:
|
133
|
-
last_seqid = await sequence_manager.get_last_seqid(self.driver, partition)
|
134
|
-
if last_seqid is not None and seqid <= last_seqid:
|
135
|
-
raise SequenceOrderViolation(last_seqid)
|
136
|
-
|
137
|
-
if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
|
138
|
-
await self.delete_resource(message, seqid, partition, transaction_check)
|
139
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
|
140
|
-
await self.txn([message], seqid, partition, transaction_check)
|
141
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.MULTI:
|
142
|
-
# XXX Not supported right now
|
143
|
-
# MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
|
144
|
-
# This concept is probably not tenable with current architecture because
|
145
|
-
# of how nats works and how we would need to manage rollbacks.
|
146
|
-
# XXX Should this be removed?
|
147
|
-
await self.multi(message, seqid)
|
148
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
|
149
|
-
await self.commit(message, seqid, partition)
|
150
|
-
elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
|
151
|
-
await self.rollback(message, seqid, partition)
|
152
|
-
|
153
|
-
async def get_resource_uuid(
|
154
|
-
self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage
|
155
|
-
) -> str:
|
156
|
-
if message.uuid is None:
|
157
|
-
uuid = await kb.get_resource_uuid_by_slug(message.slug)
|
158
|
-
else:
|
159
|
-
uuid = message.uuid
|
160
|
-
return uuid
|
161
|
-
|
162
|
-
@processor_observer.wrap({"type": "delete_resource"})
|
163
|
-
async def delete_resource(
|
164
|
-
self,
|
165
|
-
message: writer_pb2.BrokerMessage,
|
166
|
-
seqid: int,
|
167
|
-
partition: str,
|
168
|
-
transaction_check: bool = True,
|
169
|
-
) -> None:
|
170
|
-
txn = await self.driver.begin()
|
171
|
-
try:
|
172
|
-
kb = KnowledgeBox(txn, self.storage, message.kbid)
|
173
|
-
|
174
|
-
uuid = await self.get_resource_uuid(kb, message)
|
175
|
-
async with locking.distributed_lock(
|
176
|
-
locking.RESOURCE_INDEX_LOCK.format(kbid=message.kbid, resource_id=uuid)
|
177
|
-
):
|
178
|
-
# we need to have a lock at indexing time because we don't know if
|
179
|
-
# a resource was in the process of being moved when a delete occurred
|
180
|
-
shard_id = await datamanagers.resources.get_resource_shard_id(
|
181
|
-
txn, kbid=message.kbid, rid=uuid
|
182
|
-
)
|
183
|
-
if shard_id is None:
|
184
|
-
logger.warning(f"Resource {uuid} does not exist")
|
185
|
-
else:
|
186
|
-
shard = await kb.get_resource_shard(shard_id)
|
187
|
-
if shard is None:
|
188
|
-
raise AttributeError("Shard not available")
|
189
|
-
|
190
|
-
await self.shard_manager.delete_resource(
|
191
|
-
shard, message.uuid, seqid, partition, message.kbid
|
192
|
-
)
|
193
|
-
try:
|
194
|
-
await kb.delete_resource(message.uuid)
|
195
|
-
except Exception as exc:
|
196
|
-
await txn.abort()
|
197
|
-
await self.notify_abort(
|
198
|
-
partition=partition,
|
199
|
-
seqid=seqid,
|
200
|
-
multi=message.multiid,
|
201
|
-
kbid=message.kbid,
|
202
|
-
rid=message.uuid,
|
203
|
-
source=message.source,
|
204
|
-
)
|
205
|
-
raise exc
|
206
|
-
finally:
|
207
|
-
if txn.open:
|
208
|
-
if transaction_check:
|
209
|
-
await sequence_manager.set_last_seqid(txn, partition, seqid)
|
210
|
-
await txn.commit()
|
211
|
-
await self.notify_commit(
|
212
|
-
partition=partition,
|
213
|
-
seqid=seqid,
|
214
|
-
multi=message.multiid,
|
215
|
-
message=message,
|
216
|
-
write_type=writer_pb2.Notification.WriteType.DELETED,
|
217
|
-
)
|
218
|
-
|
219
|
-
@processor_observer.wrap({"type": "commit_slug"})
|
220
|
-
async def commit_slug(self, resource: Resource) -> None:
|
221
|
-
# Slug may have conflicts as its not partitioned properly,
|
222
|
-
# so we commit it in a different transaction to make it as short as possible
|
223
|
-
prev_txn = resource.txn
|
224
|
-
try:
|
225
|
-
async with self.driver.transaction() as txn:
|
226
|
-
resource.txn = txn
|
227
|
-
await resource.set_slug()
|
228
|
-
await txn.commit()
|
229
|
-
finally:
|
230
|
-
resource.txn = prev_txn
|
231
|
-
|
232
|
-
@processor_observer.wrap({"type": "txn"})
async def txn(
    self,
    messages: list[writer_pb2.BrokerMessage],
    seqid: int,
    partition: str,
    transaction_check: bool = True,
) -> None:
    """
    Apply a batch of broker messages for one resource inside one transaction.

    Each message is applied with ``apply_resource``. If the resulting resource
    is modified it is indexed, the transaction is committed and a commit
    notification is published. If a resource was loaded but not modified, the
    transaction is aborted and an abort notification is published instead.

    Args:
        messages: Broker messages to apply. The KB id and the multi id are
            taken from the first message. An empty list is a no-op.
        seqid: Sequence id of the transaction. When it is ``-1`` unexpected
            errors are re-raised unchanged instead of being wrapped in
            ``DeadletteredError``.
        partition: Partition the messages belong to.
        transaction_check: When true, the last seqid of the partition is
            stored as part of the same transaction.

    Raises:
        asyncio.TimeoutError, asyncio.CancelledError,
        aiohttp.client_exceptions.ClientError, ConflictError: re-raised as-is
            after an abort notification has been published.
        DeadletteredError: for any other exception when ``seqid != -1``. The
            messages have been deadlettered and the original exception is
            chained as the cause.
        Exception: the original exception when ``seqid == -1``.
    """
    if not messages:
        return None

    txn = await self.driver.begin()
    kbid = messages[0].kbid
    if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
        logger.info(f"KB {kbid} is deleted: skiping txn")
        if transaction_check:
            await sequence_manager.set_last_seqid(txn, partition, seqid)
        await txn.commit()
        return None

    # Everything the error handlers below read must be bound *before* the
    # try block. Otherwise a failure early in the try (e.g. while resolving
    # the resource uuid) would make the handlers themselves fail with
    # UnboundLocalError, losing the abort notification and masking the
    # original error.
    multi = messages[0].multiid
    message = messages[0]
    uuid = messages[0].uuid
    resource: Optional[Resource] = None
    handled_exception = None
    created = False

    try:
        kb = KnowledgeBox(txn, self.storage, kbid)
        uuid = await self.get_resource_uuid(kb, messages[0])

        for message in messages:
            if resource is not None:
                assert resource.uuid == message.uuid
            result = await self.apply_resource(message, kb, resource)

            if result is None:
                continue

            resource, _created = result
            created = created or _created

        if resource:
            await resource.compute_global_text()
            await resource.compute_global_tags(resource.indexer)
            await resource.compute_security(resource.indexer)
            # `message` is the last message of the batch at this point
            if message.reindex:
                # when reindexing, let's just generate full new index message
                resource.replace_indexer(await resource.generate_index_message())

        if resource and resource.modified:
            await self.index_resource(  # noqa
                resource=resource,
                txn=txn,
                uuid=uuid,
                kbid=kbid,
                seqid=seqid,
                partition=partition,
                kb=kb,
                source=messages_source(messages),
            )

            if transaction_check:
                await sequence_manager.set_last_seqid(txn, partition, seqid)
            await txn.commit()

            if created:
                await self.commit_slug(resource)

            await self.notify_commit(
                partition=partition,
                seqid=seqid,
                multi=multi,
                message=message,
                write_type=(
                    writer_pb2.Notification.WriteType.CREATED
                    if created
                    else writer_pb2.Notification.WriteType.MODIFIED
                ),
            )
        elif resource and resource.modified is False:
            await txn.abort()
            await self.notify_abort(
                partition=partition,
                seqid=seqid,
                multi=multi,
                kbid=kbid,
                rid=uuid,
                source=message.source,
            )
            logger.warning("This message did not modify the resource")
    except (
        asyncio.TimeoutError,
        asyncio.CancelledError,
        aiohttp.client_exceptions.ClientError,
        ConflictError,
    ):  # pragma: no cover
        # Unhandled exceptions here that should bubble and hard fail
        # XXX We swallow too many exceptions here!
        await self.notify_abort(
            partition=partition,
            seqid=seqid,
            multi=multi,
            kbid=kbid,
            rid=uuid,
            source=message.source,
        )
        raise
    except Exception as exc:
        # As we are in the middle of a transaction, we cannot let the exception raise directly
        # as we need to do some cleanup. The exception will be reraised at the end of the function
        # and then handled by the top caller, so errors can be handled in the same place.
        await self.deadletter(messages, partition, seqid)
        await self.notify_abort(
            partition=partition,
            seqid=seqid,
            multi=multi,
            kbid=kbid,
            rid=uuid,
            source=message.source,
        )
        handled_exception = exc
    finally:
        if resource is not None:
            resource.clean()
        # txn should be already commited or aborted, but in the event of an exception
        # it could be left open. Make sure to close it if it's still open
        if txn.open:
            await txn.abort()

    if handled_exception is not None:
        if seqid == -1:
            raise handled_exception
        else:
            if resource is not None:
                await self._mark_resource_error(kb, resource, partition, seqid)
            raise DeadletteredError() from handled_exception

    return None
|
368
|
-
|
369
|
-
@processor_observer.wrap({"type": "index_resource"})
async def index_resource(
    self,
    resource: Resource,
    txn: Transaction,
    uuid: str,
    kbid: str,
    seqid: int,
    partition: str,
    kb: KnowledgeBox,
    source: nodewriter_pb2.IndexMessageSource.ValueType,
) -> None:
    """
    Send the resource's index message to the shard that holds the resource.

    The shard is looked up from the stored resource -> shard mapping. When no
    shard can be resolved, the current active shard of the KB is used (one is
    created if there is none) and the mapping is stored in ``txn``.

    Raises:
        AttributeError: if no shard could be resolved for the resource.
    """
    validate_indexable_resource(resource.indexer.brain)

    lock_name = locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=uuid)
    async with locking.distributed_lock(lock_name):
        # The lock is required at indexing time: the resource could be moved
        # to another shard while it is being indexed
        stored_shard_id = await datamanagers.resources.get_resource_shard_id(
            txn, kbid=kbid, rid=uuid
        )

        if stored_shard_id is None:
            shard = None
        else:
            shard = await kb.get_resource_shard(stored_shard_id)

        if shard is None:
            # New resource: place it on the current active shard, creating a
            # shard when the KB has none available
            shard = await self.shard_manager.get_current_active_shard(txn, kbid)
            if shard is None:
                shard = await self.shard_manager.create_shard_by_kbid(txn, kbid)
            await datamanagers.resources.set_resource_shard_id(
                txn, kbid=kbid, rid=uuid, shard=shard.shard
            )

        if shard is None:
            raise AttributeError("Shard is not available")

        brain = resource.indexer.brain
        await self.shard_manager.add_resource(
            shard,
            brain,
            seqid,
            partition=partition,
            kb=kbid,
            source=source,
        )
|
419
|
-
|
420
|
-
async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
    """Queue ``message`` under its multi id until the multi is committed or rolled back."""
    pending = self.messages.setdefault(message.multiid, [])
    pending.append(message)
|
422
|
-
|
423
|
-
async def commit(
    self, message: writer_pb2.BrokerMessage, seqid: int, partition: str
) -> None:
    """
    Process all the messages queued under the multi id of ``message``.

    When nothing is queued for that multi id, the error is logged and the
    message is deadlettered instead.
    """
    if message.multiid in self.messages:
        await self.txn(self.messages[message.multiid], seqid, partition)
        return

    # Error: the multi is unknown to this processor
    logger.error(f"Closed multi {message.multiid}")
    await self.deadletter([message], partition, seqid)
|
432
|
-
|
433
|
-
async def rollback(
    self, message: writer_pb2.BrokerMessage, seqid: int, partition: str
) -> None:
    """
    Discard the messages queued under the multi id of ``message`` and publish
    an abort notification for it.

    An unknown multi id is tolerated: nothing is discarded, but the abort
    notification is still published.
    """
    # Error
    logger.error(f"Closed multi {message.multiid}")
    # pop() with a default instead of `del`: an unknown multi id must not
    # raise KeyError here, otherwise the abort notification below is never
    # sent. `commit` also tolerates unknown multi ids.
    self.messages.pop(message.multiid, None)
    await self.notify_abort(
        partition=partition,
        seqid=seqid,
        multi=message.multiid,
        kbid=message.kbid,
        rid=message.uuid,
        source=message.source,
    )
|
447
|
-
|
448
|
-
async def deadletter(
    self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
) -> None:
    """
    Hand every message over to the storage deadletter, one call per message,
    together with its position in the batch, the seqid and the partition.
    """
    for position, failed_message in enumerate(messages):
        await self.storage.deadletter(failed_message, position, seqid, partition)
|
453
|
-
|
454
|
-
@processor_observer.wrap({"type": "apply_resource"})
async def apply_resource(
    self,
    message: writer_pb2.BrokerMessage,
    kb: KnowledgeBox,
    resource: Optional[Resource] = None,
) -> Optional[tuple[Resource, bool]]:
    """
    Convert a broker message into a resource object, and apply it to the database

    Args:
        message: Broker message to apply.
        kb: Knowledge box the resource is loaded from or added to.
        resource: Resource already loaded by a previous message of the same
            batch, if any. When omitted it is loaded from ``kb``.

    Returns:
        ``(resource, created)`` where ``created`` tells whether the resource
        was added by this call, or ``None`` when there is no resource to
        apply the message to.
    """
    created = False

    if resource is None:
        # Make sure we load the resource in case it already exists on db
        # NOTE(review): if `message.uuid` is a protobuf string field it is
        # presumably never None, which would make the slug lookup
        # unreachable -- confirm before relying on it.
        if message.uuid is None and message.slug:
            uuid = await kb.get_resource_uuid_by_slug(message.slug)
        else:
            uuid = message.uuid
        resource = await kb.get(uuid)

    # Enum values are compared with `==`: identity (`is`) on integers only
    # works by accident of the interpreter's small-int caching.
    if resource is None and message.source == message.MessageSource.WRITER:
        # It's a new resource
        resource = await kb.add_resource(uuid, message.slug, message.basic)
        created = True
    elif resource is not None:
        # It's an update of an existing resource, can come either from writer or
        # from processing
        await self.maybe_update_resource_basic(resource, message)
    elif resource is None and message.source == message.MessageSource.PROCESSOR:
        # It's a new resource, and somehow we received the message coming from processing before
        # the "fast" one, this shouldn't happen
        logger.info(
            f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
        )
        return None

    if resource is None:
        return None

    if message.HasField("origin"):
        await resource.set_origin(message.origin)

    if message.HasField("extra"):
        await resource.set_extra(message.extra)

    if message.HasField("security"):
        await resource.set_security(message.security)

    await resource.apply_fields(message)
    await resource.apply_extracted(message)
    return (resource, created)
|
505
|
-
|
506
|
-
async def maybe_update_resource_basic(
    self, resource: Resource, message: writer_pb2.BrokerMessage
) -> None:
    """
    Update the basic metadata of ``resource`` when ``message`` carries a
    ``basic`` field or deletes at least one field; otherwise do nothing.
    """
    carries_basic = message.HasField("basic")
    removes_fields = len(message.delete_fields) > 0
    if carries_basic or removes_fields:
        await resource.set_basic(
            message.basic,
            deleted_fields=message.delete_fields,  # type: ignore
        )
|
518
|
-
|
519
|
-
async def get_extended_audit_data(
    self, message: writer_pb2.BrokerMessage
) -> writer_pb2.Audit:
    """
    Build an ``Audit`` message for ``message``.

    The result is a copy of ``message.audit`` extended with the kbid, uuid and
    source of the message, the fields of its field metadata, and the audit
    fields returned by ``collect_audit_fields``.
    """
    extended = writer_pb2.Audit()
    extended.CopyFrom(message.audit)
    extended.kbid = message.kbid
    extended.uuid = message.uuid
    extended.message_source = message.source

    metadata_fields = [wrapper.field for wrapper in message.field_metadata]
    extended.field_metadata.extend(metadata_fields)

    extended.audit_fields.extend(
        await collect_audit_fields(self.driver, self.storage, message)
    )
    return extended
|
533
|
-
|
534
|
-
async def notify_commit(
    self,
    *,
    partition: str,
    seqid: int,
    multi: str,
    message: writer_pb2.BrokerMessage,
    write_type: writer_pb2.Notification.WriteType.ValueType,
):
    """
    Publish a COMMIT notification for ``message`` on the resource
    notification channel of its knowledge box.

    The notification carries the extended audit data of the message and
    flags whether the message reported any processing errors.
    """
    audit = await self.get_extended_audit_data(message)
    commit_notification = writer_pb2.Notification(
        partition=int(partition),
        seqid=seqid,
        multi=multi,
        uuid=message.uuid,
        kbid=message.kbid,
        action=writer_pb2.Notification.Action.COMMIT,
        write_type=write_type,
        source=MESSAGE_TO_NOTIFICATION_SOURCE[message.source],
        processing_errors=len(message.errors) > 0,
        message_audit=audit,
    )

    channel = const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=message.kbid)
    payload = commit_notification.SerializeToString()
    await self.notify(channel, payload)
|
561
|
-
|
562
|
-
async def notify_abort(
    self,
    *,
    partition: str,
    seqid: int,
    multi: str,
    kbid: str,
    rid: str,
    source: writer_pb2.BrokerMessage.MessageSource.ValueType,
):
    """
    Publish an ABORT notification for resource ``rid`` on the resource
    notification channel of knowledge box ``kbid``.
    """
    abort_notification = writer_pb2.Notification(
        partition=int(partition),
        seqid=seqid,
        multi=multi,
        uuid=rid,
        kbid=kbid,
        action=writer_pb2.Notification.ABORT,
        source=MESSAGE_TO_NOTIFICATION_SOURCE[source],
    )
    channel = const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=kbid)
    payload = abort_notification.SerializeToString()
    await self.notify(channel, payload)
|
585
|
-
|
586
|
-
async def notify(self, channel, payload: bytes):
    """Publish ``payload`` on ``channel``; a no-op when no pubsub is configured."""
    if self.pubsub is None:
        return
    await self.pubsub.publish(channel, payload)
|
589
|
-
|
590
|
-
async def _mark_resource_error(
    self, kb: KnowledgeBox, resource: Optional[Resource], partition: str, seqid: int
) -> None:
    """
    Best-effort attempt to flag a resource as errored after an unhandled
    processing failure.

    The ERROR status is stored in the resource's basic metadata using a new
    transaction, and the resource is then re-sent to its shard with the
    updated processing status. Any exception raised along the way is logged
    and swallowed.
    """
    if resource is None or resource.basic is None:
        logger.info(
            f"Skip when resource does not even have basic metadata: {resource}"
        )
        return

    try:
        async with self.driver.transaction() as error_txn:
            # Both the kb and the resource must operate on the new transaction
            kb.txn = error_txn
            resource.txn = error_txn

            stored_shard_id = await datamanagers.resources.get_resource_shard_id(
                error_txn, kbid=kb.kbid, rid=resource.uuid
            )
            if stored_shard_id is None:
                shard = None
            else:
                shard = await kb.get_resource_shard(stored_shard_id)

            if shard is None:
                logger.warning(
                    "Unable to mark resource as error, shard is None. "
                    "This should not happen so you did something special to get here."
                )
                return

            resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
            await resource.set_basic(resource.basic)
            await error_txn.commit()

        # Outside of the transaction: propagate the new status to the index
        resource.indexer.set_processing_status(
            basic=resource.basic, previous_status=resource._previous_status
        )
        await self.shard_manager.add_resource(
            shard, resource.indexer.brain, seqid, partition=partition, kb=kb.kbid
        )
    except Exception:
        logger.warning("Error while marking resource as error", exc_info=True)
|
630
|
-
|
631
|
-
# KB tools
|
632
|
-
# XXX: Why are these utility functions here?
|
633
|
-
async def get_kb_obj(
    self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
) -> Optional[KnowledgeBox]:
    """
    Return a ``KnowledgeBox`` for ``kbid``, or ``None`` when it cannot be found.

    When ``kbid.uuid`` is empty, the uuid is resolved from ``kbid.slug``.
    """
    kb_uuid: Optional[str] = kbid.uuid
    if kb_uuid == "":
        kb_uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)

    if kb_uuid is None or not await datamanagers.kb.exists_kb(txn, kbid=kb_uuid):
        return None

    storage = await get_storage()
    return KnowledgeBox(txn, storage, kb_uuid)
|
649
|
-
|
650
|
-
@processor_observer.wrap({"type": "create_kb"})
async def create_kb(
    self,
    slug: str,
    config: Optional[knowledgebox_pb2.KnowledgeBoxConfig],
    semantic_model: knowledgebox_pb2.SemanticModelMetadata,
    forceuuid: Optional[str] = None,
    release_channel: utils_pb2.ReleaseChannel.ValueType = utils_pb2.ReleaseChannel.STABLE,
) -> str:
    """
    Create a knowledge box and return its uuid.

    Raises:
        KnowledgeBoxConflict: re-raised untouched, without being captured.
        Exception: any other failure (including a creation reported as
            failed) is reported through ``errors.capture_exception`` and
            re-raised.
    """
    async with self.driver.transaction() as creation_txn:
        try:
            kb_uuid, creation_failed = await KnowledgeBox.create(
                creation_txn,
                slug,
                semantic_model,
                uuid=forceuuid,
                config=config,
                release_channel=release_channel,
            )
            if creation_failed:
                raise Exception("Failed to create KB")
            await creation_txn.commit()
            return kb_uuid
        except KnowledgeBoxConflict:
            # Conflicts are an expected outcome: do not capture them
            raise
        except Exception as error:
            errors.capture_exception(error)
            raise error
|
678
|
-
|
679
|
-
async def update_kb(
    self,
    kbid: str,
    slug: str,
    config: Optional[knowledgebox_pb2.KnowledgeBoxConfig],
) -> str:
    """Update a knowledge box in its own transaction and return the uuid given back by the update."""
    async with self.driver.transaction() as update_txn:
        updated_uuid = await KnowledgeBox.update(update_txn, kbid, slug, config=config)
        await update_txn.commit()
    return updated_uuid
|
689
|
-
|
690
|
-
async def delete_kb(self, kbid: str) -> str:
    """Delete a knowledge box in its own transaction and return the uuid given back by the deletion."""
    async with self.driver.transaction() as delete_txn:
        deleted_uuid = await KnowledgeBox.delete_kb(delete_txn, kbid)
        await delete_txn.commit()
    return deleted_uuid
|
695
|
-
|
696
19
|
|
697
|
-
|
698
|
-
|
699
|
-
(
|
700
|
-
message.source == writer_pb2.BrokerMessage.MessageSource.WRITER
|
701
|
-
for message in messages
|
702
|
-
)
|
703
|
-
)
|
704
|
-
from_processor = all(
|
705
|
-
(
|
706
|
-
message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR
|
707
|
-
for message in messages
|
708
|
-
)
|
709
|
-
)
|
710
|
-
if from_writer:
|
711
|
-
source = nodewriter_pb2.IndexMessageSource.WRITER
|
712
|
-
elif from_processor:
|
713
|
-
source = nodewriter_pb2.IndexMessageSource.PROCESSOR
|
714
|
-
else: # pragma: nocover
|
715
|
-
msg = "Processor received multiple broker messages with different sources in the same txn!"
|
716
|
-
logger.error(msg)
|
717
|
-
errors.capture_exception(Exception(msg))
|
718
|
-
source = nodewriter_pb2.IndexMessageSource.PROCESSOR
|
719
|
-
return source
|
20
|
+
# reexports
|
21
|
+
from .processor import Processor, validate_indexable_resource # noqa: F401
|