nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,8 +18,10 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import enum
|
21
|
+
from typing import Optional
|
21
22
|
|
22
|
-
from pydantic import
|
23
|
+
from pydantic import Field
|
24
|
+
from pydantic_settings import BaseSettings
|
23
25
|
|
24
26
|
|
25
27
|
class ClusterDiscoveryMode(str, enum.Enum):
|
@@ -28,7 +30,7 @@ class ClusterDiscoveryMode(str, enum.Enum):
|
|
28
30
|
SINGLE_NODE = "single_node"
|
29
31
|
|
30
32
|
|
31
|
-
class StandaloneNodeRole(
|
33
|
+
class StandaloneNodeRole(enum.Enum):
|
32
34
|
ALL = "all"
|
33
35
|
INDEX = "index"
|
34
36
|
WORKER = "worker"
|
@@ -55,16 +57,10 @@ class Settings(BaseSettings):
|
|
55
57
|
|
56
58
|
# Node limits
|
57
59
|
max_shard_paragraphs: int = Field(
|
58
|
-
default=
|
60
|
+
default=500_000,
|
59
61
|
title="Max shard paragraphs",
|
60
62
|
description="Maximum number of paragraphs to target per shard",
|
61
63
|
)
|
62
|
-
max_shard_fields: int = Field(
|
63
|
-
default=125_000,
|
64
|
-
title="Max shard fields",
|
65
|
-
description="Maximum number of fields to target per shard. "
|
66
|
-
"If this is reached before max_shard_paragraphs, we will create a new shard",
|
67
|
-
)
|
68
64
|
max_node_replicas: int = Field(
|
69
65
|
default=800,
|
70
66
|
title="Max node replicas",
|
@@ -76,6 +72,12 @@ class Settings(BaseSettings):
|
|
76
72
|
description="Maximum number of paragraphs allowed on a single resource",
|
77
73
|
)
|
78
74
|
|
75
|
+
drain_nodes: list[str] = Field(
|
76
|
+
default=[],
|
77
|
+
title="Drain nodes",
|
78
|
+
description="List of node IDs to ignore when creating new shards. It is used for draining nodes from a cluster. Example: ['1bf3bfe7-e164-4a19-a4d9-41372fc15aca',]", # noqa: E501
|
79
|
+
)
|
80
|
+
|
79
81
|
local_reader_threads: int = 5
|
80
82
|
local_writer_threads: int = 5
|
81
83
|
|
@@ -84,6 +86,11 @@ class Settings(BaseSettings):
|
|
84
86
|
cluster_discovery_kubernetes_selector: str = "appType=node"
|
85
87
|
cluster_discovery_manual_addresses: list[str] = []
|
86
88
|
|
89
|
+
nidx_api_address: Optional[str] = Field(default=None, description="NIDX gRPC API address")
|
90
|
+
nidx_searcher_address: Optional[str] = Field(
|
91
|
+
default=None, description="NIDX gRPC searcher API address"
|
92
|
+
)
|
93
|
+
|
87
94
|
|
88
95
|
settings = Settings()
|
89
96
|
|
@@ -30,6 +30,7 @@ from nucliadb_protos.nodereader_pb2 import (
|
|
30
30
|
DocumentItem,
|
31
31
|
EdgeList,
|
32
32
|
GetShardRequest,
|
33
|
+
IdCollection,
|
33
34
|
ParagraphItem,
|
34
35
|
ParagraphSearchRequest,
|
35
36
|
ParagraphSearchResponse,
|
@@ -46,17 +47,14 @@ from nucliadb_protos.noderesources_pb2 import (
|
|
46
47
|
EmptyResponse,
|
47
48
|
Resource,
|
48
49
|
ResourceID,
|
49
|
-
)
|
50
|
-
from nucliadb_protos.noderesources_pb2 import Shard as NodeResourcesShard
|
51
|
-
from nucliadb_protos.noderesources_pb2 import (
|
52
50
|
ShardCreated,
|
53
51
|
ShardId,
|
54
52
|
ShardIds,
|
55
|
-
ShardMetadata,
|
56
53
|
VectorSetID,
|
57
54
|
VectorSetList,
|
58
55
|
)
|
59
|
-
from nucliadb_protos.
|
56
|
+
from nucliadb_protos.noderesources_pb2 import Shard as NodeResourcesShard
|
57
|
+
from nucliadb_protos.nodewriter_pb2 import NewShardRequest, OpStatus
|
60
58
|
|
61
59
|
from ..settings import settings
|
62
60
|
|
@@ -69,8 +67,7 @@ except ImportError: # pragma: no cover
|
|
69
67
|
IndexNodeException = Exception
|
70
68
|
|
71
69
|
try:
|
72
|
-
from nucliadb_node_binding import NodeReader
|
73
|
-
from nucliadb_node_binding import NodeWriter # type: ignore
|
70
|
+
from nucliadb_node_binding import NodeReader, NodeWriter
|
74
71
|
except ImportError: # pragma: no cover
|
75
72
|
NodeReader = None
|
76
73
|
NodeWriter = None
|
@@ -81,15 +78,11 @@ class StandaloneReaderWrapper:
|
|
81
78
|
|
82
79
|
def __init__(self):
|
83
80
|
if NodeReader is None:
|
84
|
-
raise ImportError(
|
85
|
-
"NucliaDB index node bindings are not installed (reader not found)"
|
86
|
-
)
|
81
|
+
raise ImportError("NucliaDB index node bindings are not installed (reader not found)")
|
87
82
|
self.reader = NodeReader()
|
88
83
|
self.executor = ThreadPoolExecutor(settings.local_reader_threads)
|
89
84
|
|
90
|
-
async def Search(
|
91
|
-
self, request: SearchRequest, retry: bool = False
|
92
|
-
) -> SearchResponse:
|
85
|
+
async def Search(self, request: SearchRequest, retry: bool = False) -> SearchResponse:
|
93
86
|
try:
|
94
87
|
loop = asyncio.get_running_loop()
|
95
88
|
result = await loop.run_in_executor(
|
@@ -113,30 +106,6 @@ class StandaloneReaderWrapper:
|
|
113
106
|
else:
|
114
107
|
raise
|
115
108
|
|
116
|
-
async def ParagraphSearch(
|
117
|
-
self, request: ParagraphSearchRequest
|
118
|
-
) -> ParagraphSearchResponse:
|
119
|
-
loop = asyncio.get_running_loop()
|
120
|
-
result = await loop.run_in_executor(
|
121
|
-
self.executor, self.reader.paragraph_search, request.SerializeToString()
|
122
|
-
)
|
123
|
-
pb_bytes = bytes(result)
|
124
|
-
pb = ParagraphSearchResponse()
|
125
|
-
pb.ParseFromString(pb_bytes)
|
126
|
-
return pb
|
127
|
-
|
128
|
-
async def RelationSearch(
|
129
|
-
self, request: RelationSearchRequest
|
130
|
-
) -> RelationSearchResponse:
|
131
|
-
loop = asyncio.get_running_loop()
|
132
|
-
result = await loop.run_in_executor(
|
133
|
-
self.executor, self.reader.relation_search, request.SerializeToString()
|
134
|
-
)
|
135
|
-
pb_bytes = bytes(result)
|
136
|
-
pb = RelationSearchResponse()
|
137
|
-
pb.ParseFromString(pb_bytes)
|
138
|
-
return pb
|
139
|
-
|
140
109
|
async def GetShard(self, request: GetShardRequest) -> NodeResourcesShard:
|
141
110
|
loop = asyncio.get_running_loop()
|
142
111
|
result = await loop.run_in_executor(
|
@@ -201,9 +170,7 @@ class StandaloneReaderWrapper:
|
|
201
170
|
raise exception
|
202
171
|
await loop.run_in_executor(self.executor, t1.join)
|
203
172
|
|
204
|
-
async def Paragraphs(
|
205
|
-
self, stream_request: StreamRequest
|
206
|
-
) -> AsyncIterator[ParagraphItem]:
|
173
|
+
async def Paragraphs(self, stream_request: StreamRequest) -> AsyncIterator[ParagraphItem]:
|
207
174
|
loop = asyncio.get_running_loop()
|
208
175
|
q: asyncio.Queue[ParagraphItem] = asyncio.Queue(1)
|
209
176
|
exception = None
|
@@ -249,30 +216,15 @@ class StandaloneReaderWrapper:
|
|
249
216
|
edge_list.ParseFromString(pb_bytes)
|
250
217
|
return edge_list
|
251
218
|
|
252
|
-
|
253
|
-
async def Search(self, request: SearchRequest, retry: bool = False) -> SearchResponse:
|
254
|
-
try:
|
219
|
+
async def VectorIds(self, request: VectorSetID) -> IdCollection:
|
255
220
|
loop = asyncio.get_running_loop()
|
256
221
|
result = await loop.run_in_executor(
|
257
|
-
self.executor, self.reader.
|
222
|
+
self.executor, self.reader.vector_ids, request.SerializeToString()
|
258
223
|
)
|
259
224
|
pb_bytes = bytes(result)
|
260
|
-
|
261
|
-
|
262
|
-
return
|
263
|
-
except IndexNodeException as exc:
|
264
|
-
if "IO error" not in str(exc):
|
265
|
-
# ignore any other error
|
266
|
-
raise
|
267
|
-
|
268
|
-
# try some mitigations...
|
269
|
-
logger.error(f"IndexNodeException in Search: {request}", exc_info=True)
|
270
|
-
if not retry:
|
271
|
-
# reinit?
|
272
|
-
self.reader = NodeReader()
|
273
|
-
return await self.Search(request, retry=True)
|
274
|
-
else:
|
275
|
-
raise
|
225
|
+
ids = IdCollection()
|
226
|
+
ids.ParseFromString(pb_bytes)
|
227
|
+
return ids
|
276
228
|
|
277
229
|
|
278
230
|
class StandaloneWriterWrapper:
|
@@ -281,13 +233,11 @@ class StandaloneWriterWrapper:
|
|
281
233
|
def __init__(self):
|
282
234
|
os.makedirs(settings.data_path, exist_ok=True)
|
283
235
|
if NodeWriter is None:
|
284
|
-
raise ImportError(
|
285
|
-
"NucliaDB index node bindings are not installed (writer not found)"
|
286
|
-
)
|
236
|
+
raise ImportError("NucliaDB index node bindings are not installed (writer not found)")
|
287
237
|
self.writer = NodeWriter()
|
288
238
|
self.executor = ThreadPoolExecutor(settings.local_writer_threads)
|
289
239
|
|
290
|
-
async def NewShard(self, request:
|
240
|
+
async def NewShard(self, request: NewShardRequest) -> ShardCreated:
|
291
241
|
loop = asyncio.get_running_loop()
|
292
242
|
resp = await loop.run_in_executor(
|
293
243
|
self.executor, self.writer.new_shard, request.SerializeToString()
|
@@ -318,33 +268,33 @@ class StandaloneWriterWrapper:
|
|
318
268
|
shard_ids.ParseFromString(pb_bytes)
|
319
269
|
return shard_ids
|
320
270
|
|
321
|
-
async def
|
271
|
+
async def AddVectorSet(self, request: VectorSetID):
|
322
272
|
loop = asyncio.get_running_loop()
|
323
273
|
resp = await loop.run_in_executor(
|
324
|
-
self.executor, self.writer.
|
274
|
+
self.executor, self.writer.add_vectorset, request.SerializeToString()
|
325
275
|
)
|
326
276
|
pb_bytes = bytes(resp)
|
327
277
|
resp = OpStatus()
|
328
278
|
resp.ParseFromString(pb_bytes)
|
329
279
|
return resp
|
330
280
|
|
331
|
-
async def
|
281
|
+
async def ListVectorSets(self, request: ShardId):
|
332
282
|
loop = asyncio.get_running_loop()
|
333
283
|
resp = await loop.run_in_executor(
|
334
|
-
self.executor, self.writer.
|
284
|
+
self.executor, self.writer.list_vectorsets, request.SerializeToString()
|
335
285
|
)
|
336
286
|
pb_bytes = bytes(resp)
|
337
|
-
resp =
|
287
|
+
resp = VectorSetList()
|
338
288
|
resp.ParseFromString(pb_bytes)
|
339
289
|
return resp
|
340
290
|
|
341
|
-
async def
|
291
|
+
async def RemoveVectorSet(self, request: VectorSetID):
|
342
292
|
loop = asyncio.get_running_loop()
|
343
293
|
resp = await loop.run_in_executor(
|
344
|
-
self.executor, self.writer.
|
294
|
+
self.executor, self.writer.remove_vectorset, request.SerializeToString()
|
345
295
|
)
|
346
296
|
pb_bytes = bytes(resp)
|
347
|
-
resp =
|
297
|
+
resp = OpStatus()
|
348
298
|
resp.ParseFromString(pb_bytes)
|
349
299
|
return resp
|
350
300
|
|
@@ -370,9 +320,7 @@ class StandaloneWriterWrapper:
|
|
370
320
|
|
371
321
|
async def GC(self, request: ShardId) -> EmptyResponse:
|
372
322
|
loop = asyncio.get_running_loop()
|
373
|
-
resp = await loop.run_in_executor(
|
374
|
-
self.executor, self.writer.gc, request.SerializeToString()
|
375
|
-
)
|
323
|
+
resp = await loop.run_in_executor(self.executor, self.writer.gc, request.SerializeToString())
|
376
324
|
pb_bytes = bytes(resp)
|
377
325
|
op_status = EmptyResponse()
|
378
326
|
op_status.ParseFromString(pb_bytes)
|
@@ -389,7 +337,7 @@ READER_METHODS = {
|
|
389
337
|
"RelationEdges": (ShardId, EdgeList),
|
390
338
|
}
|
391
339
|
WRITER_METHODS = {
|
392
|
-
"NewShard": (
|
340
|
+
"NewShard": (NewShardRequest, ShardCreated),
|
393
341
|
"DeleteShard": (ShardId, ShardId),
|
394
342
|
"ListShards": (EmptyQuery, ShardIds),
|
395
343
|
"RemoveVectorSet": (VectorSetID, OpStatus),
|
@@ -20,10 +20,7 @@
|
|
20
20
|
from typing import Any, Optional
|
21
21
|
|
22
22
|
from nucliadb.common.cluster.base import AbstractIndexNode
|
23
|
-
from nucliadb.common.cluster.grpc_node_dummy import
|
24
|
-
DummyReaderStub,
|
25
|
-
DummyWriterStub,
|
26
|
-
)
|
23
|
+
from nucliadb.common.cluster.grpc_node_dummy import DummyReaderStub, DummyWriterStub
|
27
24
|
from nucliadb.common.cluster.settings import settings as cluster_settings
|
28
25
|
from nucliadb.common.cluster.standalone import grpc_node_binding
|
29
26
|
from nucliadb_protos import standalone_pb2, standalone_pb2_grpc
|
@@ -79,7 +76,7 @@ class ProxyCallerWrapper:
|
|
79
76
|
else:
|
80
77
|
grpc_address = address
|
81
78
|
self._channel = get_traced_grpc_channel(grpc_address, "standalone_proxy")
|
82
|
-
self._stub = standalone_pb2_grpc.StandaloneClusterServiceStub(self._channel)
|
79
|
+
self._stub = standalone_pb2_grpc.StandaloneClusterServiceStub(self._channel)
|
83
80
|
|
84
81
|
def __getattr__(self, name):
|
85
82
|
async def call(request):
|
@@ -95,9 +92,7 @@ class ProxyCallerWrapper:
|
|
95
92
|
else:
|
96
93
|
raise NotImplementedError(f"Unknown type {self._type}")
|
97
94
|
except KeyError:
|
98
|
-
raise NotImplementedError(
|
99
|
-
f"Unknown method for type {self._type}: {name}"
|
100
|
-
)
|
95
|
+
raise NotImplementedError(f"Unknown method for type {self._type}: {name}")
|
101
96
|
return_value = return_type()
|
102
97
|
return_value.ParseFromString(resp.payload)
|
103
98
|
return return_value
|
@@ -116,9 +111,7 @@ class ProxyStandaloneIndexNode(StandaloneIndexNode):
|
|
116
111
|
available_disk: int,
|
117
112
|
dummy: bool = False,
|
118
113
|
):
|
119
|
-
super().__init__(
|
120
|
-
id, address, shard_count, available_disk=available_disk, dummy=dummy
|
121
|
-
)
|
114
|
+
super().__init__(id, address, shard_count, available_disk=available_disk, dummy=dummy)
|
122
115
|
if dummy:
|
123
116
|
return
|
124
117
|
|
@@ -32,9 +32,7 @@ from nucliadb_protos import standalone_pb2, standalone_pb2_grpc
|
|
32
32
|
from nucliadb_utils.grpc import get_traced_grpc_server
|
33
33
|
|
34
34
|
|
35
|
-
class StandaloneClusterServiceServicer(
|
36
|
-
standalone_pb2_grpc.StandaloneClusterServiceServicer
|
37
|
-
):
|
35
|
+
class StandaloneClusterServiceServicer(standalone_pb2_grpc.StandaloneClusterServiceServicer):
|
38
36
|
@backoff.on_exception(backoff.expo, (AioRpcError,), max_time=60)
|
39
37
|
async def NodeAction( # type: ignore
|
40
38
|
self, request: standalone_pb2.NodeActionRequest, context
|
@@ -61,9 +59,7 @@ class StandaloneClusterServiceServicer(
|
|
61
59
|
self, request: standalone_pb2.NodeInfoRequest, context
|
62
60
|
) -> standalone_pb2.NodeInfoResponse:
|
63
61
|
index_node = get_self()
|
64
|
-
index_node.shard_count = len(
|
65
|
-
os.listdir(os.path.join(cluster_settings.data_path, "shards"))
|
66
|
-
)
|
62
|
+
index_node.shard_count = len(os.listdir(os.path.join(cluster_settings.data_path, "shards")))
|
67
63
|
total_disk, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
|
68
64
|
return standalone_pb2.NodeInfoResponse(
|
69
65
|
id=index_node.id,
|
@@ -56,9 +56,7 @@ def get_self() -> StandaloneIndexNode:
|
|
56
56
|
make another grpc request since this node can service it directly.
|
57
57
|
"""
|
58
58
|
if not is_index_node():
|
59
|
-
raise Exception(
|
60
|
-
"This node is not an Index Node. You should not reach this code path."
|
61
|
-
)
|
59
|
+
raise Exception("This node is not an Index Node. You should not reach this code path.")
|
62
60
|
global _SELF_INDEX_NODE
|
63
61
|
node_id = get_standalone_node_id()
|
64
62
|
if _SELF_INDEX_NODE is None or node_id != _SELF_INDEX_NODE.id:
|
@@ -68,9 +66,7 @@ def get_self() -> StandaloneIndexNode:
|
|
68
66
|
host = f"{hn}.{ns}"
|
69
67
|
else:
|
70
68
|
host = gethostname()
|
71
|
-
_SELF_INDEX_NODE = StandaloneIndexNode(
|
72
|
-
id=node_id, address=host, shard_count=0, available_disk=0
|
73
|
-
)
|
69
|
+
_SELF_INDEX_NODE = StandaloneIndexNode(id=node_id, address=host, shard_count=0, available_disk=0)
|
74
70
|
try:
|
75
71
|
_, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
|
76
72
|
_SELF_INDEX_NODE.available_disk = available_disk
|
@@ -95,3 +91,10 @@ def is_index_node() -> bool:
|
|
95
91
|
StandaloneNodeRole.ALL,
|
96
92
|
StandaloneNodeRole.INDEX,
|
97
93
|
)
|
94
|
+
|
95
|
+
|
96
|
+
def is_worker_node() -> bool:
|
97
|
+
return cluster_settings.standalone_node_role in (
|
98
|
+
StandaloneNodeRole.ALL,
|
99
|
+
StandaloneNodeRole.WORKER,
|
100
|
+
)
|
nucliadb/common/cluster/utils.py
CHANGED
@@ -27,14 +27,20 @@ from nucliadb.common.cluster.discovery.utils import (
|
|
27
27
|
setup_cluster_discovery,
|
28
28
|
teardown_cluster_discovery,
|
29
29
|
)
|
30
|
-
from nucliadb.common.cluster.manager import
|
30
|
+
from nucliadb.common.cluster.manager import (
|
31
|
+
KBShardManager,
|
32
|
+
StandaloneKBShardManager,
|
33
|
+
clear_index_nodes,
|
34
|
+
)
|
31
35
|
from nucliadb.common.cluster.settings import settings
|
32
36
|
from nucliadb.common.cluster.standalone.service import (
|
33
37
|
start_grpc as start_standalone_grpc,
|
34
38
|
)
|
35
39
|
from nucliadb.common.cluster.standalone.utils import is_index_node
|
36
|
-
from
|
40
|
+
from nucliadb.ingest.orm.resource import Resource
|
41
|
+
from nucliadb_protos import nodereader_pb2, writer_pb2
|
37
42
|
from nucliadb_utils import const
|
43
|
+
from nucliadb_utils.settings import is_onprem_nucliadb
|
38
44
|
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
39
45
|
|
40
46
|
if TYPE_CHECKING: # pragma: no cover
|
@@ -79,12 +85,18 @@ async def teardown_cluster():
|
|
79
85
|
await std_server.stop(None)
|
80
86
|
clean_utility(_STANDALONE_SERVER)
|
81
87
|
|
88
|
+
clear_index_nodes()
|
89
|
+
|
82
90
|
|
83
91
|
def get_shard_manager() -> KBShardManager:
|
84
92
|
return get_utility(Utility.SHARD_MANAGER) # type: ignore
|
85
93
|
|
86
94
|
|
87
95
|
async def wait_for_node(app_context: ApplicationContext, node_id: str) -> None:
|
96
|
+
if is_onprem_nucliadb():
|
97
|
+
# On onprem deployments indexing is synchronous right now, so we don't need to wait
|
98
|
+
return
|
99
|
+
|
88
100
|
logged = False
|
89
101
|
while True:
|
90
102
|
# get raw js client
|
@@ -108,40 +120,44 @@ async def wait_for_node(app_context: ApplicationContext, node_id: str) -> None:
|
|
108
120
|
await asyncio.sleep(sleep)
|
109
121
|
|
110
122
|
|
111
|
-
|
112
|
-
|
113
|
-
)
|
123
|
+
async def get_resource(kbid: str, resource_id: str) -> Optional[Resource]:
|
124
|
+
async with datamanagers.with_ro_transaction() as txn:
|
125
|
+
return await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
|
126
|
+
|
127
|
+
|
128
|
+
@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
|
129
|
+
async def get_resource_index_message(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
|
130
|
+
async with datamanagers.with_ro_transaction() as txn:
|
131
|
+
resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
|
132
|
+
if resource is None:
|
133
|
+
logger.warning(
|
134
|
+
"Resource not found while indexing, skipping",
|
135
|
+
extra={"kbid": kbid, "resource_id": resource_id},
|
136
|
+
)
|
137
|
+
return None
|
138
|
+
resource_index_message = (await resource.generate_index_message(reindex=False)).brain
|
139
|
+
return resource_index_message
|
140
|
+
|
141
|
+
|
142
|
+
@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
|
114
143
|
async def index_resource_to_shard(
|
115
144
|
app_context: ApplicationContext,
|
116
145
|
kbid: str,
|
117
146
|
resource_id: str,
|
118
147
|
shard: writer_pb2.ShardObject,
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
)
|
123
|
-
|
148
|
+
resource_index_message: Optional[nodereader_pb2.Resource] = None,
|
149
|
+
) -> None:
|
150
|
+
logger.info("Indexing resource", extra={"kbid": kbid, "resource_id": resource_id})
|
124
151
|
sm = app_context.shard_manager
|
125
152
|
partitioning = app_context.partitioning
|
126
153
|
|
127
|
-
async with datamanagers.with_transaction() as txn:
|
128
|
-
resource_index_message = (
|
129
|
-
await datamanagers.resources.get_resource_index_message(
|
130
|
-
txn, kbid=kbid, rid=resource_id
|
131
|
-
)
|
132
|
-
)
|
133
|
-
|
134
154
|
if resource_index_message is None:
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
return None
|
155
|
+
resource_index_message = await get_resource_index_message(kbid, resource_id)
|
156
|
+
if resource_index_message is None:
|
157
|
+
return
|
158
|
+
|
140
159
|
partition = partitioning.generate_partition(kbid, resource_id)
|
141
|
-
await sm.add_resource(
|
142
|
-
shard, resource_index_message, txid=-1, partition=str(partition), kb=kbid
|
143
|
-
)
|
144
|
-
return resource_index_message
|
160
|
+
await sm.add_resource(shard, resource_index_message, txid=-1, partition=str(partition), kb=kbid)
|
145
161
|
|
146
162
|
|
147
163
|
async def delete_resource_from_shard(
|
@@ -150,9 +166,7 @@ async def delete_resource_from_shard(
|
|
150
166
|
resource_id: str,
|
151
167
|
shard: writer_pb2.ShardObject,
|
152
168
|
) -> None:
|
153
|
-
logger.
|
154
|
-
"Deleting resource", extra={"kbid": kbid, "resource_id": resource_id}
|
155
|
-
)
|
169
|
+
logger.info("Deleting resource", extra={"kbid": kbid, "resource_id": resource_id})
|
156
170
|
|
157
171
|
sm = app_context.shard_manager
|
158
172
|
partitioning = app_context.partitioning
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
AVG_PARAGRAPH_SIZE_BYTES = 10_000
|
@@ -24,14 +24,13 @@ from nucliadb.common.cluster.settings import in_standalone_mode
|
|
24
24
|
from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
|
25
25
|
from nucliadb.common.maindb.driver import Driver
|
26
26
|
from nucliadb.common.maindb.utils import setup_driver, teardown_driver
|
27
|
+
from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
|
27
28
|
from nucliadb_utils.indexing import IndexingUtility
|
28
29
|
from nucliadb_utils.nats import NatsConnectionManager
|
29
30
|
from nucliadb_utils.partition import PartitionUtility
|
30
31
|
from nucliadb_utils.settings import indexing_settings
|
31
32
|
from nucliadb_utils.storages.storage import Storage
|
32
33
|
from nucliadb_utils.utilities import (
|
33
|
-
Utility,
|
34
|
-
clean_utility,
|
35
34
|
get_storage,
|
36
35
|
start_indexing_utility,
|
37
36
|
start_nats_manager,
|
@@ -41,6 +40,7 @@ from nucliadb_utils.utilities import (
|
|
41
40
|
stop_nats_manager,
|
42
41
|
stop_partitioning_utility,
|
43
42
|
stop_transaction_utility,
|
43
|
+
teardown_storage,
|
44
44
|
)
|
45
45
|
|
46
46
|
|
@@ -79,18 +79,20 @@ class ApplicationContext:
|
|
79
79
|
)
|
80
80
|
self.indexing = await start_indexing_utility()
|
81
81
|
self.transaction = await start_transaction_utility(self.service_name)
|
82
|
+
self.nidx = await start_nidx_utility()
|
82
83
|
|
83
84
|
async def finalize(self) -> None:
|
84
85
|
if not self._initialized:
|
85
86
|
return
|
86
87
|
|
88
|
+
await stop_nidx_utility()
|
87
89
|
await stop_transaction_utility()
|
88
90
|
if not in_standalone_mode():
|
89
91
|
await stop_indexing_utility()
|
90
92
|
await stop_nats_manager()
|
93
|
+
|
91
94
|
stop_partitioning_utility()
|
92
95
|
await teardown_cluster()
|
93
96
|
await teardown_driver()
|
94
|
-
await
|
95
|
-
clean_utility(Utility.STORAGE)
|
97
|
+
await teardown_storage()
|
96
98
|
self._initialized = False
|
@@ -18,25 +18,28 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
+
from contextlib import asynccontextmanager
|
22
|
+
|
21
23
|
from fastapi import FastAPI
|
22
24
|
from starlette.routing import Mount
|
23
25
|
|
24
26
|
from nucliadb.common.context import ApplicationContext
|
25
27
|
|
26
28
|
|
27
|
-
|
29
|
+
@asynccontextmanager
|
30
|
+
async def inject_app_context(app: FastAPI):
|
28
31
|
context = ApplicationContext()
|
29
32
|
|
30
33
|
app.state.context = context
|
31
|
-
app.add_event_handler("startup", context.initialize)
|
32
|
-
app.add_event_handler("shutdown", context.finalize)
|
33
34
|
|
34
35
|
# Need to add app context in all sub-applications
|
35
36
|
for route in app.router.routes:
|
36
37
|
if isinstance(route, Mount) and isinstance(route.app, FastAPI):
|
37
38
|
route.app.state.context = context
|
38
|
-
|
39
|
-
|
39
|
+
|
40
|
+
await context.initialize()
|
41
|
+
yield context
|
42
|
+
await context.finalize()
|
40
43
|
|
41
44
|
|
42
45
|
def get_app_context(application: FastAPI) -> ApplicationContext:
|
@@ -18,5 +18,11 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
-
from
|
22
|
-
|
21
|
+
from dataclasses import dataclass
|
22
|
+
|
23
|
+
|
24
|
+
@dataclass
|
25
|
+
class IndexCounts:
|
26
|
+
fields: int
|
27
|
+
paragraphs: int
|
28
|
+
sentences: int
|
@@ -28,17 +28,36 @@
|
|
28
28
|
# - First argument is always a transaction, all other arguments are keyword arguments and must be explicit
|
29
29
|
# (better for readability and code editors)
|
30
30
|
# ==============================================================================
|
31
|
-
from . import
|
32
|
-
|
31
|
+
from . import (
|
32
|
+
atomic,
|
33
|
+
cluster,
|
34
|
+
entities,
|
35
|
+
exceptions,
|
36
|
+
fields,
|
37
|
+
kb,
|
38
|
+
labels,
|
39
|
+
processing,
|
40
|
+
resources,
|
41
|
+
rollover,
|
42
|
+
synonyms,
|
43
|
+
vectorsets,
|
44
|
+
)
|
45
|
+
from .utils import with_ro_transaction, with_rw_transaction, with_transaction
|
33
46
|
|
34
47
|
__all__ = (
|
48
|
+
"atomic",
|
35
49
|
"cluster",
|
36
|
-
"kb",
|
37
50
|
"entities",
|
51
|
+
"exceptions",
|
52
|
+
"fields",
|
53
|
+
"kb",
|
38
54
|
"labels",
|
55
|
+
"processing",
|
39
56
|
"resources",
|
40
57
|
"rollover",
|
41
|
-
"
|
42
|
-
"
|
58
|
+
"synonyms",
|
59
|
+
"vectorsets",
|
43
60
|
"with_transaction",
|
61
|
+
"with_rw_transaction",
|
62
|
+
"with_ro_transaction",
|
44
63
|
)
|