nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/ingest/app.py
CHANGED
@@ -18,10 +18,9 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import asyncio
|
21
|
+
import importlib.metadata
|
21
22
|
from typing import Awaitable, Callable
|
22
23
|
|
23
|
-
import pkg_resources
|
24
|
-
|
25
24
|
from nucliadb import health
|
26
25
|
from nucliadb.common.cluster.discovery.utils import (
|
27
26
|
setup_cluster_discovery,
|
@@ -30,10 +29,12 @@ from nucliadb.common.cluster.discovery.utils import (
|
|
30
29
|
from nucliadb.common.cluster.settings import settings as cluster_settings
|
31
30
|
from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
|
32
31
|
from nucliadb.common.context import ApplicationContext
|
32
|
+
from nucliadb.common.nidx import start_nidx_utility
|
33
33
|
from nucliadb.export_import.tasks import get_exports_consumer, get_imports_consumer
|
34
34
|
from nucliadb.ingest import SERVICE_NAME
|
35
35
|
from nucliadb.ingest.consumer import service as consumer_service
|
36
36
|
from nucliadb.ingest.partitions import assign_partitions
|
37
|
+
from nucliadb.ingest.processing import start_processing_engine, stop_processing_engine
|
37
38
|
from nucliadb.ingest.service import start_grpc
|
38
39
|
from nucliadb.ingest.settings import settings
|
39
40
|
from nucliadb_telemetry import errors
|
@@ -46,10 +47,12 @@ from nucliadb_utils.utilities import (
|
|
46
47
|
start_audit_utility,
|
47
48
|
start_indexing_utility,
|
48
49
|
start_nats_manager,
|
50
|
+
start_partitioning_utility,
|
49
51
|
start_transaction_utility,
|
50
52
|
stop_audit_utility,
|
51
53
|
stop_indexing_utility,
|
52
54
|
stop_nats_manager,
|
55
|
+
stop_partitioning_utility,
|
53
56
|
stop_transaction_utility,
|
54
57
|
)
|
55
58
|
|
@@ -59,15 +62,17 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
|
|
59
62
|
|
60
63
|
await setup_cluster()
|
61
64
|
await start_transaction_utility(SERVICE_NAME)
|
62
|
-
if
|
63
|
-
not cluster_settings.standalone_mode
|
64
|
-
and indexing_settings.index_jetstream_servers is not None
|
65
|
-
):
|
65
|
+
if not cluster_settings.standalone_mode and indexing_settings.index_jetstream_servers is not None:
|
66
66
|
await start_indexing_utility(SERVICE_NAME)
|
67
67
|
|
68
|
+
start_partitioning_utility()
|
69
|
+
|
70
|
+
await start_nidx_utility()
|
71
|
+
|
68
72
|
await start_audit_utility(SERVICE_NAME)
|
69
73
|
|
70
74
|
finalizers = [
|
75
|
+
stop_partitioning_utility,
|
71
76
|
stop_transaction_utility,
|
72
77
|
stop_indexing_utility,
|
73
78
|
stop_audit_utility,
|
@@ -123,8 +128,7 @@ async def main_consumer(): # pragma: no cover
|
|
123
128
|
ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
|
124
129
|
|
125
130
|
await run_until_exit(
|
126
|
-
[grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown]
|
127
|
-
+ finalizers
|
131
|
+
[grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown] + finalizers
|
128
132
|
)
|
129
133
|
|
130
134
|
|
@@ -138,12 +142,13 @@ async def main_orm_grpc(): # pragma: no cover
|
|
138
142
|
async def main_ingest_processed_consumer(): # pragma: no cover
|
139
143
|
finalizers = await initialize()
|
140
144
|
|
145
|
+
await start_processing_engine()
|
141
146
|
metrics_server = await serve_metrics()
|
142
147
|
grpc_health_finalizer = await health.start_grpc_health_service(settings.grpc_port)
|
143
148
|
consumer = await consumer_service.start_ingest_processed_consumer(SERVICE_NAME)
|
144
149
|
|
145
150
|
await run_until_exit(
|
146
|
-
[grpc_health_finalizer, consumer, metrics_server.shutdown] + finalizers
|
151
|
+
[grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
|
147
152
|
)
|
148
153
|
|
149
154
|
|
@@ -181,10 +186,9 @@ async def main_subscriber_workers(): # pragma: no cover
|
|
181
186
|
|
182
187
|
def setup_configuration(): # pragma: no cover
|
183
188
|
setup_logging()
|
184
|
-
|
185
189
|
assign_partitions(settings)
|
186
190
|
|
187
|
-
errors.setup_error_handling(
|
191
|
+
errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
188
192
|
|
189
193
|
if asyncio._get_running_loop() is not None:
|
190
194
|
raise RuntimeError("cannot be called from a running event loop")
|
@@ -23,9 +23,11 @@ import logging
|
|
23
23
|
import uuid
|
24
24
|
from functools import partial
|
25
25
|
|
26
|
+
from nucliadb.common import datamanagers
|
26
27
|
from nucliadb.common.cluster.exceptions import ShardsNotFound
|
27
28
|
from nucliadb.common.cluster.manager import choose_node
|
28
29
|
from nucliadb.common.cluster.utils import get_shard_manager
|
30
|
+
from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
|
29
31
|
from nucliadb_protos import audit_pb2, nodereader_pb2, noderesources_pb2, writer_pb2
|
30
32
|
from nucliadb_utils import const
|
31
33
|
from nucliadb_utils.audit.audit import AuditStorage
|
@@ -91,16 +93,14 @@ class IndexAuditHandler:
|
|
91
93
|
metrics.total_messages.inc({"action": "ignored", "type": "audit_counter"})
|
92
94
|
return
|
93
95
|
|
94
|
-
self.task_handler.schedule(
|
95
|
-
notification.kbid, partial(self.process_kb, notification.kbid)
|
96
|
-
)
|
96
|
+
self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
|
97
97
|
metrics.total_messages.inc({"action": "scheduled", "type": "audit_counter"})
|
98
98
|
|
99
99
|
@metrics.handler_histo.wrap({"type": "audit_counter"})
|
100
100
|
async def process_kb(self, kbid: str) -> None:
|
101
101
|
try:
|
102
|
-
shard_groups: list[writer_pb2.ShardObject] = (
|
103
|
-
|
102
|
+
shard_groups: list[writer_pb2.ShardObject] = await self.shard_manager.get_shards_by_kbid(
|
103
|
+
kbid
|
104
104
|
)
|
105
105
|
except ShardsNotFound:
|
106
106
|
logger.warning(f"No shards found for kbid {kbid}, skipping")
|
@@ -112,7 +112,8 @@ class IndexAuditHandler:
|
|
112
112
|
total_paragraphs = 0
|
113
113
|
|
114
114
|
for shard_obj in shard_groups:
|
115
|
-
node,
|
115
|
+
# TODO: Uses node for auditing, don't want to suddenly change metrics
|
116
|
+
node, shard_id = choose_node(shard_obj, use_nidx=False)
|
116
117
|
shard: nodereader_pb2.Shard = await node.reader.GetShard(
|
117
118
|
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
|
118
119
|
)
|
@@ -120,12 +121,18 @@ class IndexAuditHandler:
|
|
120
121
|
total_fields += shard.fields
|
121
122
|
total_paragraphs += shard.paragraphs
|
122
123
|
|
123
|
-
|
124
|
+
async with datamanagers.with_ro_transaction() as txn:
|
125
|
+
num_vectorsets = (
|
126
|
+
len([vs async for vs in datamanagers.vectorsets.iter(txn=txn, kbid=kbid)]) or 1
|
127
|
+
)
|
128
|
+
|
129
|
+
self.audit.report_storage(
|
124
130
|
kbid=kbid,
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
131
|
+
paragraphs=total_paragraphs,
|
132
|
+
fields=total_fields,
|
133
|
+
bytes=total_paragraphs # This is an estimation of bytes stored in a KB
|
134
|
+
* AVG_PARAGRAPH_SIZE_BYTES
|
135
|
+
* num_vectorsets,
|
129
136
|
)
|
130
137
|
|
131
138
|
|
@@ -170,21 +177,16 @@ class ResourceWritesAuditHandler:
|
|
170
177
|
return
|
171
178
|
|
172
179
|
message_audit: writer_pb2.Audit = notification.message_audit
|
173
|
-
if
|
174
|
-
message_audit.message_source
|
175
|
-
== writer_pb2.BrokerMessage.MessageSource.PROCESSOR
|
176
|
-
):
|
180
|
+
if message_audit.message_source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
177
181
|
metrics.total_messages.inc({"action": "ignored", "type": "audit_fields"})
|
178
182
|
return
|
179
183
|
|
180
|
-
logger.info(
|
181
|
-
{"message": "Processing field audit for kbid", "kbid": notification.kbid}
|
182
|
-
)
|
184
|
+
logger.info({"message": "Processing field audit for kbid", "kbid": notification.kbid})
|
183
185
|
|
184
186
|
metrics.total_messages.inc({"action": "scheduled", "type": "audit_fields"})
|
185
187
|
with metrics.handler_histo({"type": "audit_fields"}):
|
186
188
|
when = message_audit.when if message_audit.HasField("when") else None
|
187
|
-
|
189
|
+
self.audit.report_and_send(
|
188
190
|
kbid=message_audit.kbid,
|
189
191
|
when=when,
|
190
192
|
user=message_audit.user,
|
@@ -25,8 +25,9 @@ from typing import Optional, Union
|
|
25
25
|
import backoff
|
26
26
|
import nats
|
27
27
|
import nats.js.api
|
28
|
+
import nats.js.errors
|
28
29
|
from nats.aio.client import Msg
|
29
|
-
from
|
30
|
+
from nats.js import JetStreamContext
|
30
31
|
|
31
32
|
from nucliadb.common.cluster.exceptions import ShardsNotFound
|
32
33
|
from nucliadb.common.maindb.driver import Driver
|
@@ -34,16 +35,18 @@ from nucliadb.common.maindb.exceptions import ConflictError
|
|
34
35
|
from nucliadb.ingest import logger
|
35
36
|
from nucliadb.ingest.orm.exceptions import DeadletteredError, SequenceOrderViolation
|
36
37
|
from nucliadb.ingest.orm.processor import Processor, sequence_manager
|
38
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
|
37
39
|
from nucliadb_telemetry import context, errors, metrics
|
38
40
|
from nucliadb_utils import const
|
39
41
|
from nucliadb_utils.cache.pubsub import PubSubDriver
|
40
42
|
from nucliadb_utils.nats import MessageProgressUpdater, NatsConnectionManager
|
41
43
|
from nucliadb_utils.settings import nats_consumer_settings
|
42
44
|
from nucliadb_utils.storages.storage import Storage
|
45
|
+
from nucliadb_utils.utilities import has_feature
|
43
46
|
|
44
47
|
consumer_observer = metrics.Observer(
|
45
48
|
"message_processor",
|
46
|
-
labels={"source": ""},
|
49
|
+
labels={"source": "", "partition": ""},
|
47
50
|
buckets=[
|
48
51
|
0.01,
|
49
52
|
0.025,
|
@@ -83,40 +86,62 @@ class IngestConsumer:
|
|
83
86
|
|
84
87
|
self.lock = lock or asyncio.Lock()
|
85
88
|
self.processor = Processor(driver, storage, pubsub, partition)
|
89
|
+
self.subscription: Optional[JetStreamContext.PullSubscription] = None
|
90
|
+
|
91
|
+
async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
|
92
|
+
context = {}
|
93
|
+
if kbid:
|
94
|
+
context["kbid"] = kbid
|
95
|
+
if has_feature(const.Features.NATS_SYNC_ACK, default=False, context=context):
|
96
|
+
await msg.ack_sync(timeout=10)
|
97
|
+
else:
|
98
|
+
await msg.ack()
|
86
99
|
|
87
100
|
async def initialize(self):
|
88
101
|
await self.setup_nats_subscription()
|
89
102
|
self.initialized = True
|
90
103
|
|
104
|
+
async def finalize(self):
|
105
|
+
if self.initialized:
|
106
|
+
await self.teardown_nats_subscription()
|
107
|
+
self.initialized = False
|
108
|
+
|
109
|
+
async def teardown_nats_subscription(self):
|
110
|
+
if self.subscription is not None:
|
111
|
+
try:
|
112
|
+
await self.nats_connection_manager.unsubscribe(self.subscription)
|
113
|
+
except nats.errors.ConnectionClosedError:
|
114
|
+
logger.warning("Connection closed while unsubscribing")
|
115
|
+
pass
|
116
|
+
self.subscription = None
|
117
|
+
|
91
118
|
async def setup_nats_subscription(self):
|
92
119
|
last_seqid = await sequence_manager.get_last_seqid(self.driver, self.partition)
|
93
120
|
if last_seqid is None:
|
94
121
|
last_seqid = 1
|
95
122
|
subject = const.Streams.INGEST.subject.format(partition=self.partition)
|
96
|
-
|
97
|
-
|
98
|
-
queue=const.Streams.INGEST.group.format(partition=self.partition),
|
123
|
+
durable_name = const.Streams.INGEST.group.format(partition=self.partition)
|
124
|
+
self.subscription = await self.nats_connection_manager.pull_subscribe(
|
99
125
|
stream=const.Streams.INGEST.name,
|
100
|
-
|
126
|
+
subject=subject,
|
127
|
+
durable=durable_name,
|
101
128
|
cb=self.subscription_worker,
|
102
129
|
subscription_lost_cb=self.setup_nats_subscription,
|
103
130
|
config=nats.js.api.ConsumerConfig(
|
131
|
+
durable_name=durable_name,
|
104
132
|
deliver_policy=nats.js.api.DeliverPolicy.BY_START_SEQUENCE,
|
105
133
|
opt_start_seq=last_seqid,
|
106
134
|
ack_policy=nats.js.api.AckPolicy.EXPLICIT,
|
107
|
-
max_ack_pending=
|
135
|
+
max_ack_pending=1,
|
108
136
|
max_deliver=nats_consumer_settings.nats_max_deliver,
|
109
137
|
ack_wait=nats_consumer_settings.nats_ack_wait,
|
110
|
-
idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
|
111
138
|
),
|
112
139
|
)
|
113
140
|
logger.info(
|
114
|
-
f"Subscribed to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
|
141
|
+
f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
|
115
142
|
)
|
116
143
|
|
117
|
-
@backoff.on_exception(
|
118
|
-
backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
|
119
|
-
)
|
144
|
+
@backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
|
120
145
|
async def _process(self, pb: BrokerMessage, seqid: int):
|
121
146
|
await self.processor.process(pb, seqid, self.partition)
|
122
147
|
|
@@ -142,19 +167,29 @@ class IngestConsumer:
|
|
142
167
|
logger.warning("Could not delete blob reference", exc_info=True)
|
143
168
|
|
144
169
|
async def subscription_worker(self, msg: Msg):
|
170
|
+
kbid: Optional[str] = None
|
145
171
|
subject = msg.subject
|
146
172
|
reply = msg.reply
|
147
173
|
seqid = int(reply.split(".")[5])
|
148
174
|
message_source = "<msg source not set>"
|
175
|
+
num_delivered = msg.metadata.num_delivered
|
176
|
+
if num_delivered > 1:
|
177
|
+
logger.warning(
|
178
|
+
"Message has been redelivered",
|
179
|
+
extra={
|
180
|
+
"seqid": seqid,
|
181
|
+
"subject": subject,
|
182
|
+
"reply": reply,
|
183
|
+
"num_delivered": num_delivered,
|
184
|
+
},
|
185
|
+
)
|
149
186
|
start = time.monotonic()
|
150
187
|
|
151
188
|
async with (
|
152
189
|
MessageProgressUpdater(msg, nats_consumer_settings.nats_ack_wait * 0.66),
|
153
190
|
self.lock,
|
154
191
|
):
|
155
|
-
logger.info(
|
156
|
-
f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}"
|
157
|
-
)
|
192
|
+
logger.info(f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}")
|
158
193
|
try:
|
159
194
|
pb = await self.get_broker_message(msg)
|
160
195
|
if pb.source == pb.MessageSource.PROCESSOR:
|
@@ -170,32 +205,27 @@ class IngestConsumer:
|
|
170
205
|
f"Received from {message_source} on {pb.kbid}/{pb.uuid} seq {seqid} partition {self.partition} at {time}" # noqa
|
171
206
|
)
|
172
207
|
context.add_context({"kbid": pb.kbid, "rid": pb.uuid})
|
173
|
-
|
208
|
+
kbid = pb.kbid
|
174
209
|
try:
|
175
|
-
|
176
|
-
|
177
|
-
"source": (
|
178
|
-
"writer"
|
179
|
-
if pb.source == pb.MessageSource.WRITER
|
180
|
-
else "processor"
|
181
|
-
)
|
182
|
-
}
|
183
|
-
):
|
210
|
+
source = "writer" if pb.source == pb.MessageSource.WRITER else "processor"
|
211
|
+
with consumer_observer({"source": source, "partition": self.partition}):
|
184
212
|
await self._process(pb, seqid)
|
185
213
|
except SequenceOrderViolation as err:
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
214
|
+
logger.log(
|
215
|
+
level=logging.ERROR if seqid < err.last_seqid else logging.WARNING,
|
216
|
+
msg="Old txn. Discarding message",
|
217
|
+
extra={
|
218
|
+
"stored_seqid": err.last_seqid,
|
219
|
+
"message_seqid": seqid,
|
220
|
+
"partition": self.partition,
|
221
|
+
"kbid": pb.kbid,
|
222
|
+
"msg_delivered_count": msg.metadata.num_delivered,
|
223
|
+
},
|
192
224
|
)
|
193
225
|
else:
|
194
226
|
message_type_name = pb.MessageType.Name(pb.type)
|
195
227
|
time_to_process = time.monotonic() - start
|
196
|
-
log_level =
|
197
|
-
logging.INFO if time_to_process < 10 else logging.WARNING
|
198
|
-
)
|
228
|
+
log_level = logging.INFO if time_to_process < 10 else logging.WARNING
|
199
229
|
logger.log(
|
200
230
|
log_level,
|
201
231
|
f"Successfully processed {message_type_name} message",
|
@@ -218,7 +248,8 @@ class IngestConsumer:
|
|
218
248
|
f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
|
219
249
|
f"Check sentry for more details: {str(e)}"
|
220
250
|
)
|
221
|
-
await
|
251
|
+
await self.ack_message(msg, kbid)
|
252
|
+
logger.info("Message acked because of deadletter", extra={"seqid": seqid})
|
222
253
|
except (ShardsNotFound,) as e:
|
223
254
|
# Any messages that for some unexpected inconsistency have failed and won't be tried again
|
224
255
|
# as we cannot do anything about it
|
@@ -229,7 +260,8 @@ class IngestConsumer:
|
|
229
260
|
f"This message has been dropped and won't be retried again"
|
230
261
|
f"Check sentry for more details: {str(e)}"
|
231
262
|
)
|
232
|
-
await
|
263
|
+
await self.ack_message(msg, kbid)
|
264
|
+
logger.info("Message acked because of drop", extra={"seqid": seqid})
|
233
265
|
except Exception as e:
|
234
266
|
# Unhandled exceptions that need to be retried after a small delay
|
235
267
|
errors.capture_exception(e)
|
@@ -239,10 +271,12 @@ class IngestConsumer:
|
|
239
271
|
f"Check sentry for more details: {str(e)}"
|
240
272
|
)
|
241
273
|
await msg.nak()
|
274
|
+
logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
|
242
275
|
raise e
|
243
276
|
else:
|
244
277
|
# Successful processing
|
245
|
-
await
|
278
|
+
await self.ack_message(msg, kbid)
|
279
|
+
logger.info("Message acked because of success", extra={"seqid": seqid})
|
246
280
|
await self.clean_broker_message(msg)
|
247
281
|
|
248
282
|
|
@@ -259,28 +293,29 @@ class IngestProcessedConsumer(IngestConsumer):
|
|
259
293
|
|
260
294
|
async def setup_nats_subscription(self):
|
261
295
|
subject = const.Streams.INGEST_PROCESSED.subject
|
262
|
-
|
263
|
-
|
264
|
-
queue=const.Streams.INGEST_PROCESSED.group,
|
296
|
+
durable_name = const.Streams.INGEST_PROCESSED.group
|
297
|
+
self.subscription = await self.nats_connection_manager.pull_subscribe(
|
265
298
|
stream=const.Streams.INGEST_PROCESSED.name,
|
266
|
-
|
299
|
+
subject=subject,
|
300
|
+
durable=durable_name,
|
267
301
|
cb=self.subscription_worker,
|
268
302
|
subscription_lost_cb=self.setup_nats_subscription,
|
269
303
|
config=nats.js.api.ConsumerConfig(
|
304
|
+
durable_name=durable_name,
|
270
305
|
ack_policy=nats.js.api.AckPolicy.EXPLICIT,
|
271
|
-
|
306
|
+
deliver_policy=nats.js.api.DeliverPolicy.ALL,
|
307
|
+
# We set it to 20 because we don't care about order here and we want to be able to HPA based
|
308
|
+
# on the number of pending messages in the queue.
|
309
|
+
max_ack_pending=20,
|
272
310
|
max_deliver=nats_consumer_settings.nats_max_deliver,
|
273
311
|
ack_wait=nats_consumer_settings.nats_ack_wait,
|
274
|
-
idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
|
275
312
|
),
|
276
313
|
)
|
277
314
|
logger.info(
|
278
|
-
f"Subscribed to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
|
315
|
+
f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
|
279
316
|
)
|
280
317
|
|
281
|
-
@backoff.on_exception(
|
282
|
-
backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
|
283
|
-
)
|
318
|
+
@backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
|
284
319
|
async def _process(self, pb: BrokerMessage, seqid: int):
|
285
320
|
"""
|
286
321
|
We are setting `transaction_check` to False here because we can not mix
|
@@ -83,27 +83,20 @@ class MaterializerHandler:
|
|
83
83
|
notification.ParseFromString(data)
|
84
84
|
|
85
85
|
if (
|
86
|
-
notification.action
|
87
|
-
!= writer_pb2.Notification.Action.COMMIT # only on commits
|
86
|
+
notification.action != writer_pb2.Notification.Action.COMMIT # only on commits
|
88
87
|
or notification.write_type
|
89
88
|
== writer_pb2.Notification.WriteType.MODIFIED # only on new resources and deletes
|
90
89
|
):
|
91
90
|
return
|
92
91
|
|
93
|
-
self.task_handler.schedule(
|
94
|
-
notification.kbid, partial(self.process, notification.kbid)
|
95
|
-
)
|
92
|
+
self.task_handler.schedule(notification.kbid, partial(self.process, notification.kbid))
|
96
93
|
|
97
94
|
async def process(self, kbid: str) -> None:
|
98
95
|
logger.info(f"Materializing knowledgebox", extra={"kbid": kbid})
|
99
|
-
async with datamanagers.
|
100
|
-
value = await datamanagers.resources.calculate_number_of_resources(
|
101
|
-
txn, kbid=kbid
|
102
|
-
)
|
96
|
+
async with datamanagers.with_ro_transaction() as txn:
|
97
|
+
value = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
|
103
98
|
async with datamanagers.with_transaction() as txn:
|
104
|
-
await datamanagers.resources.set_number_of_resources(
|
105
|
-
txn, kbid=kbid, value=value
|
106
|
-
)
|
99
|
+
await datamanagers.resources.set_number_of_resources(txn, kbid=kbid, value=value)
|
107
100
|
await txn.commit()
|
108
101
|
|
109
102
|
audit = get_audit()
|
nucliadb/ingest/consumer/pull.py
CHANGED
@@ -21,10 +21,7 @@ import asyncio
|
|
21
21
|
import base64
|
22
22
|
from typing import Optional
|
23
23
|
|
24
|
-
import nats
|
25
|
-
import nats.errors
|
26
24
|
from aiohttp.client_exceptions import ClientConnectorError
|
27
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
|
28
25
|
|
29
26
|
from nucliadb.common import datamanagers
|
30
27
|
from nucliadb.common.http_clients.processing import ProcessingHTTPClient, get_nua_api_id
|
@@ -32,11 +29,13 @@ from nucliadb.common.maindb.driver import Driver
|
|
32
29
|
from nucliadb.ingest import logger, logger_activity
|
33
30
|
from nucliadb.ingest.orm.exceptions import ReallyStopPulling
|
34
31
|
from nucliadb.ingest.orm.processor import Processor
|
32
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
|
35
33
|
from nucliadb_telemetry import errors
|
36
34
|
from nucliadb_utils import const
|
37
35
|
from nucliadb_utils.cache.pubsub import PubSubDriver
|
38
36
|
from nucliadb_utils.settings import nuclia_settings
|
39
37
|
from nucliadb_utils.storages.storage import Storage
|
38
|
+
from nucliadb_utils.transaction import MaxTransactionSizeExceededError
|
40
39
|
from nucliadb_utils.utilities import get_storage, get_transaction_utility
|
41
40
|
|
42
41
|
|
@@ -78,9 +77,7 @@ class PullWorker:
|
|
78
77
|
data = base64.b64decode(payload)
|
79
78
|
pb.ParseFromString(data)
|
80
79
|
|
81
|
-
logger.debug(
|
82
|
-
f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}"
|
83
|
-
)
|
80
|
+
logger.debug(f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}")
|
84
81
|
|
85
82
|
if not self.local_subscriber:
|
86
83
|
transaction_utility = get_transaction_utility()
|
@@ -93,11 +90,9 @@ class PullWorker:
|
|
93
90
|
# send to separate processor
|
94
91
|
target_subject=const.Streams.INGEST_PROCESSED.subject,
|
95
92
|
)
|
96
|
-
except
|
93
|
+
except MaxTransactionSizeExceededError:
|
97
94
|
storage = await get_storage()
|
98
|
-
stored_key = await storage.set_stream_message(
|
99
|
-
kbid=pb.kbid, rid=pb.uuid, data=data
|
100
|
-
)
|
95
|
+
stored_key = await storage.set_stream_message(kbid=pb.kbid, rid=pb.uuid, data=data)
|
101
96
|
referenced_pb = BrokerMessageBlobReference(
|
102
97
|
uuid=pb.uuid, kbid=pb.kbid, storage_key=stored_key
|
103
98
|
)
|
@@ -141,9 +136,7 @@ class PullWorker:
|
|
141
136
|
try:
|
142
137
|
pull_type_id = get_nua_api_id()
|
143
138
|
except Exception as exc:
|
144
|
-
logger.exception(
|
145
|
-
"Could not read NUA API Key. Can not start pull worker"
|
146
|
-
)
|
139
|
+
logger.exception("Could not read NUA API Key. Can not start pull worker")
|
147
140
|
raise ReallyStopPulling() from exc
|
148
141
|
else:
|
149
142
|
pull_type_id = "main"
|
@@ -152,7 +145,7 @@ class PullWorker:
|
|
152
145
|
logger.info(f"Collecting from NucliaDB Cloud {self.partition} partition")
|
153
146
|
while True:
|
154
147
|
try:
|
155
|
-
async with datamanagers.
|
148
|
+
async with datamanagers.with_ro_transaction() as txn:
|
156
149
|
cursor = await datamanagers.processing.get_pull_offset(
|
157
150
|
txn, pull_type_id=pull_type_id, partition=self.partition
|
158
151
|
)
|
@@ -176,9 +169,7 @@ class PullWorker:
|
|
176
169
|
await self.handle_message(payload)
|
177
170
|
except Exception as e:
|
178
171
|
errors.capture_exception(e)
|
179
|
-
logger.exception(
|
180
|
-
"Error while pulling and processing message/s"
|
181
|
-
)
|
172
|
+
logger.exception("Error while pulling and processing message/s")
|
182
173
|
raise e
|
183
174
|
async with datamanagers.with_transaction() as txn:
|
184
175
|
await datamanagers.processing.set_pull_offset(
|
@@ -189,9 +180,7 @@ class PullWorker:
|
|
189
180
|
)
|
190
181
|
await txn.commit()
|
191
182
|
elif data.status == "empty":
|
192
|
-
logger_activity.debug(
|
193
|
-
f"No messages waiting in partition #{self.partition}"
|
194
|
-
)
|
183
|
+
logger_activity.debug(f"No messages waiting in partition #{self.partition}")
|
195
184
|
await asyncio.sleep(self.pull_time_empty_backoff)
|
196
185
|
else:
|
197
186
|
logger.info(f"Proxy pull answered with error: {data}")
|
@@ -202,9 +191,7 @@ class PullWorker:
|
|
202
191
|
KeyboardInterrupt,
|
203
192
|
SystemExit,
|
204
193
|
):
|
205
|
-
logger.info(
|
206
|
-
f"Pull task for partition #{self.partition} was canceled, exiting"
|
207
|
-
)
|
194
|
+
logger.info(f"Pull task for partition #{self.partition} was canceled, exiting")
|
208
195
|
raise ReallyStopPulling()
|
209
196
|
|
210
197
|
except ClientConnectorError:
|
@@ -214,14 +201,12 @@ class PullWorker:
|
|
214
201
|
)
|
215
202
|
await asyncio.sleep(self.pull_time_error_backoff)
|
216
203
|
|
217
|
-
except
|
204
|
+
except MaxTransactionSizeExceededError as e:
|
218
205
|
if data is not None:
|
219
206
|
payload_length = 0
|
220
207
|
if data.payload:
|
221
208
|
payload_length = len(base64.b64decode(data.payload))
|
222
|
-
logger.error(
|
223
|
-
f"Message too big for transaction: {payload_length}"
|
224
|
-
)
|
209
|
+
logger.error(f"Message too big for transaction: {payload_length}")
|
225
210
|
raise e
|
226
211
|
except Exception:
|
227
212
|
logger.exception("Unhandled error pulling messages from processing")
|