nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/consumer/consumer.py
CHANGED
@@ -25,8 +25,9 @@ from typing import Optional, Union
 import backoff
 import nats
 import nats.js.api
+import nats.js.errors
 from nats.aio.client import Msg
-from
+from nats.js import JetStreamContext

 from nucliadb.common.cluster.exceptions import ShardsNotFound
 from nucliadb.common.maindb.driver import Driver
@@ -34,16 +35,18 @@ from nucliadb.common.maindb.exceptions import ConflictError
|
|
34
35
|
from nucliadb.ingest import logger
|
35
36
|
from nucliadb.ingest.orm.exceptions import DeadletteredError, SequenceOrderViolation
|
36
37
|
from nucliadb.ingest.orm.processor import Processor, sequence_manager
|
38
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
|
37
39
|
from nucliadb_telemetry import context, errors, metrics
|
38
40
|
from nucliadb_utils import const
|
39
41
|
from nucliadb_utils.cache.pubsub import PubSubDriver
|
40
42
|
from nucliadb_utils.nats import MessageProgressUpdater, NatsConnectionManager
|
41
43
|
from nucliadb_utils.settings import nats_consumer_settings
|
42
44
|
from nucliadb_utils.storages.storage import Storage
|
45
|
+
from nucliadb_utils.utilities import has_feature
|
43
46
|
|
44
47
|
consumer_observer = metrics.Observer(
|
45
48
|
"message_processor",
|
46
|
-
labels={"source": ""},
|
49
|
+
labels={"source": "", "partition": ""},
|
47
50
|
buckets=[
|
48
51
|
0.01,
|
49
52
|
0.025,
|
@@ -83,40 +86,62 @@ class IngestConsumer:

         self.lock = lock or asyncio.Lock()
         self.processor = Processor(driver, storage, pubsub, partition)
+        self.subscription: Optional[JetStreamContext.PullSubscription] = None
+
+    async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
+        context = {}
+        if kbid:
+            context["kbid"] = kbid
+        if has_feature(const.Features.NATS_SYNC_ACK, default=False, context=context):
+            await msg.ack_sync(timeout=10)
+        else:
+            await msg.ack()

     async def initialize(self):
         await self.setup_nats_subscription()
         self.initialized = True

+    async def finalize(self):
+        if self.initialized:
+            await self.teardown_nats_subscription()
+            self.initialized = False
+
+    async def teardown_nats_subscription(self):
+        if self.subscription is not None:
+            try:
+                await self.nats_connection_manager.unsubscribe(self.subscription)
+            except nats.errors.ConnectionClosedError:
+                logger.warning("Connection closed while unsubscribing")
+                pass
+            self.subscription = None
+
     async def setup_nats_subscription(self):
         last_seqid = await sequence_manager.get_last_seqid(self.driver, self.partition)
         if last_seqid is None:
            last_seqid = 1
         subject = const.Streams.INGEST.subject.format(partition=self.partition)
-
-
-            queue=const.Streams.INGEST.group.format(partition=self.partition),
+        durable_name = const.Streams.INGEST.group.format(partition=self.partition)
+        self.subscription = await self.nats_connection_manager.pull_subscribe(
             stream=const.Streams.INGEST.name,
-
+            subject=subject,
+            durable=durable_name,
             cb=self.subscription_worker,
             subscription_lost_cb=self.setup_nats_subscription,
             config=nats.js.api.ConsumerConfig(
+                durable_name=durable_name,
                 deliver_policy=nats.js.api.DeliverPolicy.BY_START_SEQUENCE,
                 opt_start_seq=last_seqid,
                 ack_policy=nats.js.api.AckPolicy.EXPLICIT,
-                max_ack_pending=
+                max_ack_pending=1,
                 max_deliver=nats_consumer_settings.nats_max_deliver,
                 ack_wait=nats_consumer_settings.nats_ack_wait,
-                idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
             ),
         )
         logger.info(
-            f"Subscribed to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
+            f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
         )

-    @backoff.on_exception(
-        backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
-    )
+    @backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
     async def _process(self, pb: BrokerMessage, seqid: int):
         await self.processor.process(pb, seqid, self.partition)

@@ -142,18 +167,29 @@ class IngestConsumer:
             logger.warning("Could not delete blob reference", exc_info=True)

     async def subscription_worker(self, msg: Msg):
+        kbid: Optional[str] = None
         subject = msg.subject
         reply = msg.reply
         seqid = int(reply.split(".")[5])
         message_source = "<msg source not set>"
+        num_delivered = msg.metadata.num_delivered
+        if num_delivered > 1:
+            logger.warning(
+                "Message has been redelivered",
+                extra={
+                    "seqid": seqid,
+                    "subject": subject,
+                    "reply": reply,
+                    "num_delivered": num_delivered,
+                },
+            )
         start = time.monotonic()

-        async with
-            msg, nats_consumer_settings.nats_ack_wait * 0.66
-
-
-
-        )
+        async with (
+            MessageProgressUpdater(msg, nats_consumer_settings.nats_ack_wait * 0.66),
+            self.lock,
+        ):
+            logger.info(f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}")
             try:
                 pb = await self.get_broker_message(msg)
                 if pb.source == pb.MessageSource.PROCESSOR:
@@ -169,36 +205,39 @@ class IngestConsumer:
                     f"Received from {message_source} on {pb.kbid}/{pb.uuid} seq {seqid} partition {self.partition} at {time}"  # noqa
                 )
                 context.add_context({"kbid": pb.kbid, "rid": pb.uuid})
-
+                kbid = pb.kbid
                 try:
-
-
-                            "source": "writer"
-                            if pb.source == pb.MessageSource.WRITER
-                            else "processor"
-                        }
-                    ):
+                    source = "writer" if pb.source == pb.MessageSource.WRITER else "processor"
+                    with consumer_observer({"source": source, "partition": self.partition}):
                         await self._process(pb, seqid)
                 except SequenceOrderViolation as err:
-
-
-
-
-
-
+                    logger.log(
+                        level=logging.ERROR if seqid < err.last_seqid else logging.WARNING,
+                        msg="Old txn. Discarding message",
+                        extra={
+                            "stored_seqid": err.last_seqid,
+                            "message_seqid": seqid,
+                            "partition": self.partition,
+                            "kbid": pb.kbid,
+                            "msg_delivered_count": msg.metadata.num_delivered,
+                        },
                     )
                 else:
                     message_type_name = pb.MessageType.Name(pb.type)
                     time_to_process = time.monotonic() - start
-                    log_level =
-                        logging.INFO if time_to_process < 10 else logging.WARNING
-                    )
+                    log_level = logging.INFO if time_to_process < 10 else logging.WARNING
                     logger.log(
                         log_level,
-                        f"Successfully processed {message_type_name} message
-
-
-
+                        f"Successfully processed {message_type_name} message",
+                        extra={
+                            "kbid": pb.kbid,
+                            "rid": pb.uuid,
+                            "message_source": message_source,
+                            "nucliadb_seqid": seqid,
+                            "partition": self.partition,
+                            "total_time": time_to_process,
+                            "audit_time": audit_time,
+                        },
                     )
             except DeadletteredError as e:
                 # Messages that have been sent to deadletter at some point
@@ -209,7 +248,8 @@ class IngestConsumer:
                     f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
                     f"Check sentry for more details: {str(e)}"
                 )
-                await
+                await self.ack_message(msg, kbid)
+                logger.info("Message acked because of deadletter", extra={"seqid": seqid})
             except (ShardsNotFound,) as e:
                 # Any messages that for some unexpected inconsistency have failed and won't be tried again
                 # as we cannot do anything about it
@@ -220,7 +260,8 @@ class IngestConsumer:
                     f"This message has been dropped and won't be retried again"
                     f"Check sentry for more details: {str(e)}"
                 )
-                await
+                await self.ack_message(msg, kbid)
+                logger.info("Message acked because of drop", extra={"seqid": seqid})
             except Exception as e:
                 # Unhandled exceptions that need to be retried after a small delay
                 errors.capture_exception(e)
@@ -230,10 +271,12 @@ class IngestConsumer:
                     f"Check sentry for more details: {str(e)}"
                 )
                 await msg.nak()
+                logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
                 raise e
             else:
                 # Successful processing
-                await
+                await self.ack_message(msg, kbid)
+                logger.info("Message acked because of success", extra={"seqid": seqid})
                 await self.clean_broker_message(msg)


@@ -250,28 +293,29 @@ class IngestProcessedConsumer(IngestConsumer):

     async def setup_nats_subscription(self):
         subject = const.Streams.INGEST_PROCESSED.subject
-
-
-            queue=const.Streams.INGEST_PROCESSED.group,
+        durable_name = const.Streams.INGEST_PROCESSED.group
+        self.subscription = await self.nats_connection_manager.pull_subscribe(
             stream=const.Streams.INGEST_PROCESSED.name,
-
+            subject=subject,
+            durable=durable_name,
             cb=self.subscription_worker,
             subscription_lost_cb=self.setup_nats_subscription,
             config=nats.js.api.ConsumerConfig(
+                durable_name=durable_name,
                 ack_policy=nats.js.api.AckPolicy.EXPLICIT,
-
+                deliver_policy=nats.js.api.DeliverPolicy.ALL,
+                # We set it to 20 because we don't care about order here and we want to be able to HPA based
+                # on the number of pending messages in the queue.
+                max_ack_pending=20,
                 max_deliver=nats_consumer_settings.nats_max_deliver,
                 ack_wait=nats_consumer_settings.nats_ack_wait,
-                idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
             ),
         )
         logger.info(
-            f"Subscribed to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
+            f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
         )

-    @backoff.on_exception(
-        backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
-    )
+    @backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
     async def _process(self, pb: BrokerMessage, seqid: int):
         """
         We are setting `transaction_check` to False here because we can not mix
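The consumer.py hunks above replace the previous push subscription with a durable JetStream pull consumer that acks every message explicitly, optionally waiting for the server's confirmation behind the NATS_SYNC_ACK feature flag. Below is a minimal sketch of that consumption pattern written against plain nats-py rather than nucliadb's NatsConnectionManager wrapper; the stream, subject, and durable names are placeholders.

```python
# Hedged sketch: a durable JetStream pull consumer with explicit acks, mirroring
# the pattern the IngestConsumer diff above moves to. Stream, subject and durable
# names are placeholders, not nucliadb's real constants.
import asyncio

import nats
import nats.errors
from nats.js import api


async def main() -> None:
    nc = await nats.connect("nats://localhost:4222")
    js = nc.jetstream()

    durable_name = "example-ingest-1"
    config = api.ConsumerConfig(
        durable_name=durable_name,
        deliver_policy=api.DeliverPolicy.BY_START_SEQUENCE,
        opt_start_seq=1,    # resume from the last processed sequence id
        ack_policy=api.AckPolicy.EXPLICIT,
        max_ack_pending=1,  # keep ordering: only one unacked message in flight
        max_deliver=5,
        ack_wait=60,
    )
    subscription = await js.pull_subscribe(
        "example.ingest.1",
        durable=durable_name,
        stream="example-ingest",
        config=config,
    )

    while True:
        try:
            msgs = await subscription.fetch(1, timeout=5)
        except nats.errors.TimeoutError:
            continue  # nothing pending right now, poll again
        for msg in msgs:
            try:
                ...  # process the broker message here
            except Exception:
                await msg.nak()  # let the server redeliver it later
            else:
                # ack_sync waits for the server's confirmation (what the
                # NATS_SYNC_ACK feature flag enables in the diff above);
                # msg.ack() would be the fire-and-forget variant.
                await msg.ack_sync(timeout=10)


if __name__ == "__main__":
    asyncio.run(main())
```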
nucliadb/ingest/consumer/materializer.py
CHANGED
@@ -29,6 +29,7 @@ from nucliadb_protos import writer_pb2
 from nucliadb_utils import const
 from nucliadb_utils.cache.pubsub import PubSubDriver
 from nucliadb_utils.storages.storage import Storage
+from nucliadb_utils.utilities import get_audit

 from .utils import DelayedTaskHandler

@@ -82,25 +83,22 @@ class MaterializerHandler:
         notification.ParseFromString(data)

         if (
-            notification.action
-            != writer_pb2.Notification.Action.COMMIT  # only on commits
+            notification.action != writer_pb2.Notification.Action.COMMIT  # only on commits
             or notification.write_type
             == writer_pb2.Notification.WriteType.MODIFIED  # only on new resources and deletes
         ):
             return

-        self.task_handler.schedule(
-            notification.kbid, partial(self.process, notification.kbid)
-        )
+        self.task_handler.schedule(notification.kbid, partial(self.process, notification.kbid))

     async def process(self, kbid: str) -> None:
         logger.info(f"Materializing knowledgebox", extra={"kbid": kbid})
-        async with datamanagers.
-            value = await datamanagers.resources.calculate_number_of_resources(
-                txn, kbid=kbid
-            )
+        async with datamanagers.with_ro_transaction() as txn:
+            value = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
         async with datamanagers.with_transaction() as txn:
-            await datamanagers.resources.set_number_of_resources(
-                txn, kbid=kbid, value=value
-            )
+            await datamanagers.resources.set_number_of_resources(txn, kbid=kbid, value=value)
             await txn.commit()
+
+        audit = get_audit()
+        if audit:
+            audit.report_resources(kbid=kbid, resources=value)
nucliadb/ingest/consumer/pull.py
CHANGED
@@ -21,10 +21,7 @@ import asyncio
 import base64
 from typing import Optional

-import nats
-import nats.errors
 from aiohttp.client_exceptions import ClientConnectorError
-from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference

 from nucliadb.common import datamanagers
 from nucliadb.common.http_clients.processing import ProcessingHTTPClient, get_nua_api_id
@@ -32,11 +29,13 @@ from nucliadb.common.maindb.driver import Driver
 from nucliadb.ingest import logger, logger_activity
 from nucliadb.ingest.orm.exceptions import ReallyStopPulling
 from nucliadb.ingest.orm.processor import Processor
+from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
 from nucliadb_telemetry import errors
 from nucliadb_utils import const
 from nucliadb_utils.cache.pubsub import PubSubDriver
 from nucliadb_utils.settings import nuclia_settings
 from nucliadb_utils.storages.storage import Storage
+from nucliadb_utils.transaction import MaxTransactionSizeExceededError
 from nucliadb_utils.utilities import get_storage, get_transaction_utility


@@ -78,9 +77,7 @@ class PullWorker:
         data = base64.b64decode(payload)
         pb.ParseFromString(data)

-        logger.debug(
-            f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}"
-        )
+        logger.debug(f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}")

         if not self.local_subscriber:
             transaction_utility = get_transaction_utility()
@@ -93,11 +90,9 @@ class PullWorker:
                     # send to separate processor
                     target_subject=const.Streams.INGEST_PROCESSED.subject,
                 )
-            except
+            except MaxTransactionSizeExceededError:
                 storage = await get_storage()
-                stored_key = await storage.set_stream_message(
-                    kbid=pb.kbid, rid=pb.uuid, data=data
-                )
+                stored_key = await storage.set_stream_message(kbid=pb.kbid, rid=pb.uuid, data=data)
                 referenced_pb = BrokerMessageBlobReference(
                     uuid=pb.uuid, kbid=pb.kbid, storage_key=stored_key
                 )
@@ -141,9 +136,7 @@ class PullWorker:
            try:
                pull_type_id = get_nua_api_id()
            except Exception as exc:
-                logger.exception(
-                    "Could not read NUA API Key. Can not start pull worker"
-                )
+                logger.exception("Could not read NUA API Key. Can not start pull worker")
                raise ReallyStopPulling() from exc
        else:
            pull_type_id = "main"
@@ -152,7 +145,7 @@ class PullWorker:
        logger.info(f"Collecting from NucliaDB Cloud {self.partition} partition")
        while True:
            try:
-                async with datamanagers.
+                async with datamanagers.with_ro_transaction() as txn:
                    cursor = await datamanagers.processing.get_pull_offset(
                        txn, pull_type_id=pull_type_id, partition=self.partition
                    )
@@ -176,9 +169,7 @@ class PullWorker:
                                await self.handle_message(payload)
                            except Exception as e:
                                errors.capture_exception(e)
-                                logger.exception(
-                                    "Error while pulling and processing message/s"
-                                )
+                                logger.exception("Error while pulling and processing message/s")
                                raise e
                        async with datamanagers.with_transaction() as txn:
                            await datamanagers.processing.set_pull_offset(
@@ -189,9 +180,7 @@ class PullWorker:
                            )
                            await txn.commit()
                    elif data.status == "empty":
-                        logger_activity.debug(
-                            f"No messages waiting in partition #{self.partition}"
-                        )
+                        logger_activity.debug(f"No messages waiting in partition #{self.partition}")
                        await asyncio.sleep(self.pull_time_empty_backoff)
                    else:
                        logger.info(f"Proxy pull answered with error: {data}")
@@ -202,9 +191,7 @@ class PullWorker:
                KeyboardInterrupt,
                SystemExit,
            ):
-                logger.info(
-                    f"Pull task for partition #{self.partition} was canceled, exiting"
-                )
+                logger.info(f"Pull task for partition #{self.partition} was canceled, exiting")
                raise ReallyStopPulling()

            except ClientConnectorError:
@@ -214,14 +201,12 @@ class PullWorker:
                )
                await asyncio.sleep(self.pull_time_error_backoff)

-            except
+            except MaxTransactionSizeExceededError as e:
                if data is not None:
                    payload_length = 0
                    if data.payload:
                        payload_length = len(base64.b64decode(data.payload))
-                    logger.error(
-                        f"Message too big for transaction: {payload_length}"
-                    )
+                    logger.error(f"Message too big for transaction: {payload_length}")
                raise e
            except Exception:
                logger.exception("Unhandled error pulling messages from processing")
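The pull.py hunks above handle broker messages that are too large to publish on the transaction stream: on MaxTransactionSizeExceededError the payload is parked in blob storage and only a BrokerMessageBlobReference is sent. A hedged sketch of that claim-check fallback follows, with the commit and storage helpers reduced to stand-in callables rather than nucliadb's real utilities.

```python
# Hedged sketch of the oversized-message fallback above: publish the serialized
# BrokerMessage if it fits; otherwise store the bytes out of band and publish a
# small reference. MaxTransactionSizeExceededError, commit() and store_blob() are
# stand-ins here, not nucliadb_utils' actual objects.
from dataclasses import dataclass
from typing import Awaitable, Callable


class MaxTransactionSizeExceededError(Exception):
    """Raised when a payload exceeds what the stream transaction accepts."""


@dataclass
class BlobReference:
    kbid: str
    uuid: str
    storage_key: str


async def publish_with_fallback(
    kbid: str,
    rid: str,
    data: bytes,
    commit: Callable[[object], Awaitable[None]],
    store_blob: Callable[..., Awaitable[str]],
) -> None:
    try:
        # Happy path: the whole message fits in one publish.
        await commit(data)
    except MaxTransactionSizeExceededError:
        # Claim check: persist the payload elsewhere and send only a pointer,
        # which the consumer later resolves back into the full message.
        storage_key = await store_blob(kbid=kbid, rid=rid, data=data)
        await commit(BlobReference(kbid=kbid, uuid=rid, storage_key=storage_key))
```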
nucliadb/ingest/consumer/service.py
CHANGED
@@ -45,9 +45,7 @@ from .shard_creator import ShardCreatorHandler
 def _handle_task_result(task: asyncio.Task) -> None:
     e = task.exception()
     if e:
-        logger.exception(
-            "Loop stopped by exception. This should not happen. Exiting.", exc_info=e
-        )
+        logger.exception("Loop stopped by exception. This should not happen. Exiting.", exc_info=e)
         sys.exit(1)


@@ -87,9 +85,7 @@ async def start_ingest_consumers(
     if transaction_settings.transaction_local:
         raise ConfigurationError("Can not start ingest consumers in local mode")

-    while len(
-        manager.get_index_nodes()
-    ) == 0 and running_settings.running_environment not in (
+    while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
         "local",
         "test",
     ):
@@ -101,9 +97,9 @@ async def start_ingest_consumers(
     storage = await get_storage(service_name=service_name or SERVICE_NAME)
     nats_connection_manager = get_nats_manager()

-    max_concurrent_processing = asyncio.Semaphore(
-
-
+    max_concurrent_processing = asyncio.Semaphore(settings.max_concurrent_ingest_processing)
+
+    consumer_finalizers = []

     for partition in settings.partitions:
         consumer = IngestConsumer(
@@ -115,8 +111,15 @@ async def start_ingest_consumers(
             lock=max_concurrent_processing,
         )
         await consumer.initialize()
+        consumer_finalizers.append(consumer.finalize)

-
+    async def _finalize():
+        # Finalize all the consumers and the nats connection manager
+        for consumer_finalize in consumer_finalizers:
+            await consumer_finalize()
+        await nats_connection_manager.finalize()
+
+    return _finalize


 async def start_ingest_processed_consumer(
@@ -132,9 +135,7 @@ async def start_ingest_processed_consumer(
     if transaction_settings.transaction_local:
         raise ConfigurationError("Can not start ingest consumers in local mode")

-    while len(
-        manager.get_index_nodes()
-    ) == 0 and running_settings.running_environment not in (
+    while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
         "local",
         "test",
     ):
@@ -159,22 +160,22 @@


 async def start_auditor() -> Callable[[], Awaitable[None]]:
-    driver = await setup_driver()
     audit = get_audit()
     assert audit is not None
+
     pubsub = await get_pubsub()
     assert pubsub is not None, "Pubsub is not configured"
     storage = await get_storage(service_name=SERVICE_NAME)
-    index_auditor = IndexAuditHandler(
-    resource_writes_auditor = ResourceWritesAuditHandler(
-        driver=driver, storage=storage, audit=audit, pubsub=pubsub
-    )
+    index_auditor = IndexAuditHandler(audit=audit, pubsub=pubsub)
+    resource_writes_auditor = ResourceWritesAuditHandler(storage=storage, audit=audit, pubsub=pubsub)

     await index_auditor.initialize()
     await resource_writes_auditor.initialize()

     return partial(
-        asyncio.gather,
+        asyncio.gather,
+        index_auditor.finalize(),
+        resource_writes_auditor.finalize(),  # type: ignore
     )


nucliadb/ingest/consumer/shard_creator.py
CHANGED
@@ -22,7 +22,7 @@ import logging
 import uuid
 from functools import partial

-from nucliadb.common import
+from nucliadb.common import locking
 from nucliadb.common.cluster.manager import choose_node
 from nucliadb.common.cluster.utils import get_shard_manager
 from nucliadb.common.maindb.driver import Driver
@@ -82,21 +82,19 @@ class ShardCreatorHandler:
             metrics.total_messages.inc({"type": "shard_creator", "action": "ignored"})
             return

-        self.task_handler.schedule(
-            notification.kbid, partial(self.process_kb, notification.kbid)
-        )
+        self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
         metrics.total_messages.inc({"type": "shard_creator", "action": "scheduled"})

     @metrics.handler_histo.wrap({"type": "shard_creator"})
     async def process_kb(self, kbid: str) -> None:
         logger.info({"message": "Processing notification for kbid", "kbid": kbid})
         async with self.driver.transaction(read_only=True) as txn:
-            kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
             current_shard = await self.shard_manager.get_current_active_shard(txn, kbid)

-        if
+        if current_shard is None:
             logger.error(
-                "Processing a notification for
+                "Processing a notification for KB with no current shard",
+                extra={"kbid": kbid},
             )
             return

@@ -105,13 +103,8 @@ class ShardCreatorHandler:
         async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
             # remember, a lock will do at least 1+ reads and 1 write.
             # with heavy writes, this adds some simple k/v pressure
-            node, shard_id = choose_node(current_shard)
+            node, shard_id = choose_node(current_shard, use_nidx=True)
             shard: nodereader_pb2.Shard = await node.reader.GetShard(
                 nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
             )
-            await self.shard_manager.maybe_create_new_shard(
-                kbid,
-                shard.paragraphs,
-                shard.fields,
-                kb_shards.release_channel,
-            )
+            await self.shard_manager.maybe_create_new_shard(kbid, shard.paragraphs)
nucliadb/ingest/consumer/utils.py
CHANGED
@@ -48,9 +48,7 @@ class DelayedTaskHandler:
         for task in list(self.outstanding_tasks.values()):
             await task

-    def schedule(
-        self, key: str, handler: Callable[[], Coroutine[None, None, None]]
-    ) -> None:
+    def schedule(self, key: str, handler: Callable[[], Coroutine[None, None, None]]) -> None:
         if key in self.to_process:
             # already waiting to process this key, ignore
             return