nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/ingest/settings.py
CHANGED
@@ -25,10 +25,8 @@ from pydantic_settings import BaseSettings
|
|
25
25
|
|
26
26
|
|
27
27
|
class DriverConfig(Enum):
|
28
|
-
REDIS = "redis"
|
29
|
-
TIKV = "tikv"
|
30
28
|
PG = "pg"
|
31
|
-
LOCAL = "local"
|
29
|
+
LOCAL = "local" # Not recommended for production
|
32
30
|
NOT_SET = "notset" # setting not provided
|
33
31
|
|
34
32
|
@classmethod
|
@@ -42,19 +40,7 @@ class DriverConfig(Enum):
|
|
42
40
|
|
43
41
|
|
44
42
|
class DriverSettings(BaseSettings):
|
45
|
-
driver: DriverConfig = Field(
|
46
|
-
default=DriverConfig.NOT_SET, description="K/V storage driver"
|
47
|
-
)
|
48
|
-
driver_redis_url: Optional[str] = Field(
|
49
|
-
default=None, description="Redis URL. Example: redis://localhost:6379"
|
50
|
-
)
|
51
|
-
driver_tikv_url: Optional[list[str]] = Field(
|
52
|
-
default=None,
|
53
|
-
description=(
|
54
|
-
"TiKV PD (Placement Driver) URLs. The URL to the cluster manager of"
|
55
|
-
"TiKV. Example: '[\"tikv-pd.svc:2379\"]'"
|
56
|
-
),
|
57
|
-
)
|
43
|
+
driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
|
58
44
|
driver_local_url: Optional[str] = Field(
|
59
45
|
default=None,
|
60
46
|
description="Local path to store data on file system. Example: /nucliadb/data/main",
|
@@ -63,13 +49,17 @@ class DriverSettings(BaseSettings):
|
|
63
49
|
default=None,
|
64
50
|
description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.", # noqa
|
65
51
|
)
|
52
|
+
driver_pg_connection_pool_min_size: int = Field(
|
53
|
+
default=10,
|
54
|
+
description="PostgreSQL min pool size. The minimum number of connections to the PostgreSQL server.",
|
55
|
+
)
|
66
56
|
driver_pg_connection_pool_max_size: int = Field(
|
67
57
|
default=20,
|
68
58
|
description="PostgreSQL max pool size. The maximum number of connections to the PostgreSQL server.",
|
69
59
|
)
|
70
|
-
|
71
|
-
default=
|
72
|
-
description="
|
60
|
+
driver_pg_connection_pool_acquire_timeout_ms: int = Field(
|
61
|
+
default=1000,
|
62
|
+
description="PostgreSQL pool acquire timeout in ms. The maximum time to wait until a connection becomes available.",
|
73
63
|
)
|
74
64
|
|
75
65
|
|
@@ -87,7 +77,7 @@ class Settings(DriverSettings):
|
|
87
77
|
total_replicas: int = 1 # number of ingest processor replicas in the cluster
|
88
78
|
nuclia_partitions: int = 50
|
89
79
|
|
90
|
-
max_receive_message_length: int =
|
80
|
+
max_receive_message_length: int = 500 # In MB
|
91
81
|
|
92
82
|
# Search query timeouts
|
93
83
|
relation_search_timeout: float = 10.0
|
nucliadb/ingest/utils.py
CHANGED
@@ -19,9 +19,8 @@
|
|
19
19
|
#
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
23
|
-
|
24
22
|
from nucliadb.common.maindb.utils import setup_driver
|
23
|
+
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
25
24
|
from nucliadb_utils.grpc import get_traced_grpc_channel
|
26
25
|
from nucliadb_utils.settings import nucliadb_settings
|
27
26
|
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
@@ -37,11 +36,9 @@ async def start_ingest(service_name: Optional[str] = None):
|
|
37
36
|
if nucliadb_settings.nucliadb_ingest is not None:
|
38
37
|
# Its distributed lets create a GRPC client
|
39
38
|
# We want Jaeger telemetry enabled
|
40
|
-
channel = get_traced_grpc_channel(
|
41
|
-
nucliadb_settings.nucliadb_ingest, service_name or "ingest"
|
42
|
-
)
|
39
|
+
channel = get_traced_grpc_channel(nucliadb_settings.nucliadb_ingest, service_name or "ingest")
|
43
40
|
set_utility(Utility.CHANNEL, channel)
|
44
|
-
ingest = WriterStub(channel)
|
41
|
+
ingest = WriterStub(channel)
|
45
42
|
set_utility(Utility.INGEST, ingest)
|
46
43
|
else:
|
47
44
|
# Its not distributed create a ingest
|
nucliadb/learning_proxy.py
CHANGED
@@ -20,16 +20,21 @@
|
|
20
20
|
import contextlib
|
21
21
|
import json
|
22
22
|
import logging
|
23
|
+
import os
|
24
|
+
from abc import ABC, abstractmethod
|
23
25
|
from collections.abc import AsyncIterator
|
24
|
-
from enum import Enum
|
26
|
+
from enum import Enum, IntEnum
|
25
27
|
from typing import Any, Optional, Union
|
26
28
|
|
27
29
|
import backoff
|
28
30
|
import httpx
|
29
31
|
from fastapi import Request, Response
|
30
32
|
from fastapi.responses import StreamingResponse
|
33
|
+
from lru import LRU
|
31
34
|
from pydantic import BaseModel, Field, model_validator
|
35
|
+
from typing_extensions import Self
|
32
36
|
|
37
|
+
from nucliadb_protos import knowledgebox_pb2, utils_pb2
|
33
38
|
from nucliadb_telemetry import errors
|
34
39
|
from nucliadb_utils.settings import is_onprem_nucliadb, nuclia_settings
|
35
40
|
|
@@ -50,7 +55,32 @@ WHITELISTED_HEADERS = {
|
|
50
55
|
|
51
56
|
class LearningService(Enum):
|
52
57
|
CONFIG = "config"
|
53
|
-
|
58
|
+
|
59
|
+
|
60
|
+
class SimilarityFunction(IntEnum):
|
61
|
+
# Keep this in sync with learning config repo
|
62
|
+
# It's an IntEnum to match the protobuf definition
|
63
|
+
DOT = 0
|
64
|
+
COSINE = 1
|
65
|
+
|
66
|
+
|
67
|
+
class SemanticConfig(BaseModel):
|
68
|
+
# Keep this in sync with learning config repo
|
69
|
+
similarity: SimilarityFunction
|
70
|
+
size: int
|
71
|
+
threshold: float
|
72
|
+
matryoshka_dims: list[int] = []
|
73
|
+
|
74
|
+
def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
|
75
|
+
semantic_model = knowledgebox_pb2.SemanticModelMetadata()
|
76
|
+
LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
|
77
|
+
SimilarityFunction.COSINE: utils_pb2.VectorSimilarity.COSINE,
|
78
|
+
SimilarityFunction.DOT: utils_pb2.VectorSimilarity.DOT,
|
79
|
+
}
|
80
|
+
semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[self.similarity]
|
81
|
+
semantic_model.vector_dimension = self.size
|
82
|
+
semantic_model.matryoshka_dimensions.extend(self.matryoshka_dims)
|
83
|
+
return semantic_model
|
54
84
|
|
55
85
|
|
56
86
|
# Subset of learning configuration of nucliadb's interest. Look at
|
@@ -69,52 +99,104 @@ class LearningConfiguration(BaseModel):
|
|
69
99
|
default=None, alias="semantic_matryoshka_dims"
|
70
100
|
)
|
71
101
|
|
72
|
-
|
102
|
+
semantic_models: list[str] = Field(default_factory=list)
|
103
|
+
|
104
|
+
# This is where the config for each semantic model (aka vectorsets) is returned
|
105
|
+
semantic_model_configs: dict[str, SemanticConfig] = Field(default_factory=dict)
|
106
|
+
|
107
|
+
@model_validator(mode="before")
|
73
108
|
@classmethod
|
74
|
-
def
|
75
|
-
|
76
|
-
|
109
|
+
def maintain_bw_compatibility_with_single_model_configs(cls, data: Any) -> Any:
|
110
|
+
if isinstance(data, dict):
|
111
|
+
if not data.get("semantic_model", None) and len(data.get("semantic_models", [])) > 0:
|
112
|
+
data["semantic_model"] = data["semantic_models"][0]
|
113
|
+
return data
|
114
|
+
|
115
|
+
@model_validator(mode="after")
|
116
|
+
def validate_matryoshka_and_vector_dimension_consistency(self) -> Self:
|
117
|
+
vector_size = self.semantic_vector_size
|
118
|
+
matryoshka_dimensions = self.semantic_matryoshka_dimensions or []
|
77
119
|
if (
|
78
120
|
len(matryoshka_dimensions) > 0
|
79
121
|
and vector_size is not None
|
80
122
|
and vector_size not in matryoshka_dimensions
|
81
123
|
):
|
82
|
-
raise ValueError(
|
83
|
-
|
84
|
-
|
85
|
-
|
124
|
+
raise ValueError("Semantic vector size is inconsistent with matryoshka dimensions")
|
125
|
+
return self
|
126
|
+
|
127
|
+
def into_semantic_models_metadata(
|
128
|
+
self,
|
129
|
+
) -> dict[str, knowledgebox_pb2.SemanticModelMetadata]:
|
130
|
+
result = {}
|
131
|
+
for model_name, config in self.semantic_model_configs.items():
|
132
|
+
result[model_name] = config.into_semantic_model_metadata()
|
133
|
+
return result
|
134
|
+
|
135
|
+
def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
|
136
|
+
semantic_model = knowledgebox_pb2.SemanticModelMetadata()
|
137
|
+
|
138
|
+
LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
|
139
|
+
"cosine": utils_pb2.VectorSimilarity.COSINE,
|
140
|
+
"dot": utils_pb2.VectorSimilarity.DOT,
|
141
|
+
}
|
142
|
+
semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[
|
143
|
+
self.semantic_vector_similarity.lower()
|
144
|
+
]
|
145
|
+
|
146
|
+
if self.semantic_vector_size is not None:
|
147
|
+
semantic_model.vector_dimension = self.semantic_vector_size
|
148
|
+
else:
|
149
|
+
logger.warning("Vector dimension not set!")
|
150
|
+
|
151
|
+
if self.semantic_matryoshka_dimensions is not None:
|
152
|
+
semantic_model.matryoshka_dimensions.extend(self.semantic_matryoshka_dimensions)
|
153
|
+
|
154
|
+
return semantic_model
|
155
|
+
|
156
|
+
|
157
|
+
class ProxiedLearningConfigError(Exception):
|
158
|
+
def __init__(self, status_code: int, content: bytes, content_type: str):
|
159
|
+
self.status_code = status_code
|
160
|
+
self.content = content
|
161
|
+
self.content_type = content_type
|
162
|
+
|
163
|
+
|
164
|
+
def raise_for_status(response: httpx.Response) -> None:
|
165
|
+
try:
|
166
|
+
response.raise_for_status()
|
167
|
+
except httpx.HTTPStatusError as err:
|
168
|
+
content_type = err.response.headers.get("Content-Type", "application/json")
|
169
|
+
raise ProxiedLearningConfigError(
|
170
|
+
status_code=err.response.status_code,
|
171
|
+
content=err.response.content,
|
172
|
+
content_type=content_type,
|
173
|
+
)
|
86
174
|
|
87
175
|
|
88
176
|
async def get_configuration(
|
89
177
|
kbid: str,
|
90
178
|
) -> Optional[LearningConfiguration]:
|
91
|
-
|
92
|
-
resp = await client.get(f"config/{kbid}")
|
93
|
-
try:
|
94
|
-
resp.raise_for_status()
|
95
|
-
except httpx.HTTPStatusError as err:
|
96
|
-
if err.response.status_code == 404:
|
97
|
-
return None
|
98
|
-
raise
|
99
|
-
return LearningConfiguration.parse_obj(resp.json())
|
179
|
+
return await learning_config_service().get_configuration(kbid)
|
100
180
|
|
101
181
|
|
102
182
|
async def set_configuration(
|
103
183
|
kbid: str,
|
104
184
|
config: dict[str, Any],
|
105
185
|
) -> LearningConfiguration:
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
186
|
+
return await learning_config_service().set_configuration(kbid, config)
|
187
|
+
|
188
|
+
|
189
|
+
async def update_configuration(
|
190
|
+
kbid: str,
|
191
|
+
config: dict[str, Any],
|
192
|
+
) -> None:
|
193
|
+
return await learning_config_service().update_configuration(kbid, config)
|
110
194
|
|
111
195
|
|
112
196
|
async def delete_configuration(
|
113
197
|
kbid: str,
|
114
198
|
) -> None:
|
115
|
-
|
116
|
-
resp = await client.delete(f"config/{kbid}")
|
117
|
-
resp.raise_for_status()
|
199
|
+
return await learning_config_service().delete_configuration(kbid)
|
118
200
|
|
119
201
|
|
120
202
|
async def learning_config_proxy(
|
@@ -132,21 +214,6 @@ async def learning_config_proxy(
|
|
132
214
|
)
|
133
215
|
|
134
216
|
|
135
|
-
async def learning_collector_proxy(
|
136
|
-
request: Request,
|
137
|
-
method: str,
|
138
|
-
url: str,
|
139
|
-
extra_headers: Optional[dict[str, str]] = None,
|
140
|
-
) -> Union[Response, StreamingResponse]:
|
141
|
-
return await proxy(
|
142
|
-
service=LearningService.COLLECTOR,
|
143
|
-
request=request,
|
144
|
-
method=method,
|
145
|
-
url=url,
|
146
|
-
extra_headers=extra_headers,
|
147
|
-
)
|
148
|
-
|
149
|
-
|
150
217
|
def is_white_listed_header(header: str) -> bool:
|
151
218
|
return header.lower() in WHITELISTED_HEADERS
|
152
219
|
|
@@ -238,13 +305,9 @@ async def proxy(
|
|
238
305
|
|
239
306
|
def get_base_url(service: LearningService) -> str:
|
240
307
|
if is_onprem_nucliadb():
|
241
|
-
nuclia_public_url = nuclia_settings.nuclia_public_url.format(
|
242
|
-
zone=nuclia_settings.nuclia_zone
|
243
|
-
)
|
308
|
+
nuclia_public_url = nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone)
|
244
309
|
return f"{nuclia_public_url}/api/v1"
|
245
|
-
learning_svc_base_url = nuclia_settings.learning_internal_svc_base_url.format(
|
246
|
-
service=service.value
|
247
|
-
)
|
310
|
+
learning_svc_base_url = nuclia_settings.learning_internal_svc_base_url.format(service=service.value)
|
248
311
|
return f"{learning_svc_base_url}/api/v1/internal"
|
249
312
|
|
250
313
|
|
@@ -273,9 +336,7 @@ async def service_client(
|
|
273
336
|
# This is a workaround to be able to run integration tests that start nucliadb with docker.
|
274
337
|
# The learning APIs are not available in the docker setup, so we use a dummy client.
|
275
338
|
client = DummyClient(base_url=base_url, headers=headers)
|
276
|
-
logger.warning(
|
277
|
-
"Using dummy client. If you see this in production, something is wrong."
|
278
|
-
)
|
339
|
+
logger.warning("Using dummy client. If you see this in production, something is wrong.")
|
279
340
|
else:
|
280
341
|
client = httpx.AsyncClient(base_url=base_url, headers=headers) # type: ignore
|
281
342
|
try:
|
@@ -324,14 +385,31 @@ class DummyClient(httpx.AsyncClient):
|
|
324
385
|
return self._handle_request("DELETE", *args, **kwargs)
|
325
386
|
|
326
387
|
def get_config(self, *args: Any, **kwargs: Any):
|
388
|
+
size = 768 if os.environ.get("TEST_SENTENCE_ENCODER") == "multilingual-2023-02-21" else 512
|
327
389
|
lconfig = LearningConfiguration(
|
328
390
|
semantic_model="multilingual",
|
329
391
|
semantic_vector_similarity="cosine",
|
330
|
-
semantic_vector_size=
|
392
|
+
semantic_vector_size=size,
|
331
393
|
semantic_threshold=None,
|
332
394
|
semantic_matryoshka_dims=[],
|
395
|
+
semantic_model_configs={
|
396
|
+
"multilingual": SemanticConfig(
|
397
|
+
similarity=SimilarityFunction.COSINE,
|
398
|
+
size=size,
|
399
|
+
threshold=0,
|
400
|
+
matryoshka_dims=[],
|
401
|
+
)
|
402
|
+
},
|
333
403
|
)
|
334
|
-
return self._response(content=lconfig.
|
404
|
+
return self._response(content=lconfig.model_dump())
|
405
|
+
|
406
|
+
def post_config(self, *args: Any, **kwargs: Any):
|
407
|
+
# simulate post that returns the created config
|
408
|
+
return self.get_config(*args, **kwargs)
|
409
|
+
|
410
|
+
def patch_config(self, *args: Any, **kwargs: Any):
|
411
|
+
# simulate patch that returns the updated config
|
412
|
+
return self.get_config(*args, **kwargs)
|
335
413
|
|
336
414
|
async def request( # type: ignore
|
337
415
|
self,
|
@@ -341,9 +419,7 @@ class DummyClient(httpx.AsyncClient):
|
|
341
419
|
content=None,
|
342
420
|
headers=None,
|
343
421
|
) -> httpx.Response:
|
344
|
-
return self._handle_request(
|
345
|
-
method, url, params=params, content=content, headers=headers
|
346
|
-
)
|
422
|
+
return self._handle_request(method, url, params=params, content=content, headers=headers)
|
347
423
|
|
348
424
|
def _handle_request(self, *args: Any, **kwargs: Any) -> httpx.Response:
|
349
425
|
"""
|
@@ -357,3 +433,114 @@ class DummyClient(httpx.AsyncClient):
|
|
357
433
|
return getattr(self, method)(*args, **kwargs)
|
358
434
|
else:
|
359
435
|
return self._response()
|
436
|
+
|
437
|
+
|
438
|
+
class LearningConfigService(ABC):
|
439
|
+
@abstractmethod
|
440
|
+
async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]: ...
|
441
|
+
|
442
|
+
@abstractmethod
|
443
|
+
async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration: ...
|
444
|
+
|
445
|
+
@abstractmethod
|
446
|
+
async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None: ...
|
447
|
+
|
448
|
+
@abstractmethod
|
449
|
+
async def delete_configuration(self, kbid: str) -> None: ...
|
450
|
+
|
451
|
+
|
452
|
+
class ProxiedLearningConfig(LearningConfigService):
|
453
|
+
async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
|
454
|
+
async with self._client() as client:
|
455
|
+
resp = await client.get(f"config/{kbid}")
|
456
|
+
try:
|
457
|
+
raise_for_status(resp)
|
458
|
+
except ProxiedLearningConfigError as err:
|
459
|
+
if err.status_code == 404:
|
460
|
+
return None
|
461
|
+
raise
|
462
|
+
return LearningConfiguration.model_validate(resp.json())
|
463
|
+
|
464
|
+
async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
|
465
|
+
async with self._client() as client:
|
466
|
+
resp = await client.post(f"config/{kbid}", json=config)
|
467
|
+
raise_for_status(resp)
|
468
|
+
return LearningConfiguration.model_validate(resp.json())
|
469
|
+
|
470
|
+
async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
|
471
|
+
async with self._client() as client:
|
472
|
+
resp = await client.patch(f"config/{kbid}", json=config)
|
473
|
+
raise_for_status(resp)
|
474
|
+
return
|
475
|
+
|
476
|
+
async def delete_configuration(self, kbid: str) -> None:
|
477
|
+
async with self._client() as client:
|
478
|
+
resp = await client.delete(f"config/{kbid}")
|
479
|
+
raise_for_status(resp)
|
480
|
+
|
481
|
+
@contextlib.asynccontextmanager
|
482
|
+
async def _client(self) -> AsyncIterator[httpx.AsyncClient]:
|
483
|
+
async with httpx.AsyncClient(
|
484
|
+
base_url=get_base_url(LearningService.CONFIG),
|
485
|
+
headers=get_auth_headers(),
|
486
|
+
) as client:
|
487
|
+
yield client
|
488
|
+
|
489
|
+
|
490
|
+
_IN_MEMORY_CONFIGS: dict[str, LearningConfiguration]
|
491
|
+
_IN_MEMORY_CONFIGS = LRU(50) # type: ignore
|
492
|
+
|
493
|
+
|
494
|
+
class InMemoryLearningConfig(LearningConfigService):
|
495
|
+
def __init__(self):
|
496
|
+
self.in_memory_configs = {}
|
497
|
+
|
498
|
+
async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
|
499
|
+
return _IN_MEMORY_CONFIGS.get(kbid, None)
|
500
|
+
|
501
|
+
async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
|
502
|
+
if not config:
|
503
|
+
# generate a default config
|
504
|
+
default_model = os.environ.get("TEST_SENTENCE_ENCODER", "multilingual")
|
505
|
+
size = 768 if default_model == "multilingual-2023-02-21" else 512
|
506
|
+
# XXX for some reason, we override the model name and set this one
|
507
|
+
# default_model = "multilingual"
|
508
|
+
learning_config = LearningConfiguration(
|
509
|
+
semantic_model=default_model,
|
510
|
+
semantic_vector_similarity="cosine",
|
511
|
+
semantic_vector_size=size,
|
512
|
+
semantic_threshold=None,
|
513
|
+
semantic_matryoshka_dims=[],
|
514
|
+
semantic_models=[default_model],
|
515
|
+
semantic_model_configs={
|
516
|
+
default_model: SemanticConfig(
|
517
|
+
similarity=SimilarityFunction.COSINE,
|
518
|
+
size=size,
|
519
|
+
threshold=0,
|
520
|
+
matryoshka_dims=[],
|
521
|
+
)
|
522
|
+
},
|
523
|
+
)
|
524
|
+
|
525
|
+
else:
|
526
|
+
learning_config = LearningConfiguration.model_validate(config)
|
527
|
+
|
528
|
+
_IN_MEMORY_CONFIGS[kbid] = learning_config
|
529
|
+
return learning_config
|
530
|
+
|
531
|
+
async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
|
532
|
+
if kbid not in _IN_MEMORY_CONFIGS:
|
533
|
+
raise ValueError(f"Configuration for kbid {kbid} not found")
|
534
|
+
learning_config = _IN_MEMORY_CONFIGS[kbid]
|
535
|
+
learning_config = learning_config.model_copy(update=config)
|
536
|
+
_IN_MEMORY_CONFIGS[kbid] = learning_config
|
537
|
+
|
538
|
+
async def delete_configuration(self, kbid: str) -> None:
|
539
|
+
_IN_MEMORY_CONFIGS.pop(kbid, None)
|
540
|
+
|
541
|
+
|
542
|
+
def learning_config_service() -> LearningConfigService:
|
543
|
+
if nuclia_settings.dummy_learning_services:
|
544
|
+
return InMemoryLearningConfig()
|
545
|
+
else:
|
546
|
+
return ProxiedLearningConfig()
|
nucliadb/metrics_exporter.py
CHANGED
@@ -20,12 +20,14 @@
|
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
22
|
import asyncio
|
23
|
-
from typing import AsyncGenerator, Callable
|
23
|
+
from typing import AsyncGenerator, Callable, Tuple, cast
|
24
24
|
|
25
25
|
from nucliadb import logger
|
26
26
|
from nucliadb.common import datamanagers
|
27
27
|
from nucliadb.common.cluster import manager as cluster_manager
|
28
28
|
from nucliadb.common.context import ApplicationContext
|
29
|
+
from nucliadb.common.maindb.pg import PGDriver
|
30
|
+
from nucliadb.common.maindb.utils import get_driver
|
29
31
|
from nucliadb.migrator.datamanager import MigrationsDataManager
|
30
32
|
from nucliadb_telemetry import metrics
|
31
33
|
from nucliadb_telemetry.logs import setup_logging
|
@@ -34,9 +36,9 @@ from nucliadb_utils.fastapi.run import serve_metrics
|
|
34
36
|
|
35
37
|
SHARD_COUNT = metrics.Gauge("nucliadb_node_shard_count", labels={"node": ""})
|
36
38
|
|
37
|
-
MIGRATION_COUNT = metrics.Gauge(
|
38
|
-
|
39
|
-
)
|
39
|
+
MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "version": ""})
|
40
|
+
|
41
|
+
PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
|
40
42
|
|
41
43
|
|
42
44
|
async def update_node_metrics(context: ApplicationContext):
|
@@ -57,7 +59,7 @@ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
|
|
57
59
|
"""
|
58
60
|
Return a list of all KB ids.
|
59
61
|
"""
|
60
|
-
async with context.kv_driver.transaction() as txn:
|
62
|
+
async with context.kv_driver.transaction(read_only=True) as txn:
|
61
63
|
async for kbid, _ in datamanagers.kb.get_kbs(txn):
|
62
64
|
yield kbid
|
63
65
|
|
@@ -72,9 +74,7 @@ async def update_migration_metrics(context: ApplicationContext):
|
|
72
74
|
mdm = MigrationsDataManager(context.kv_driver)
|
73
75
|
global_info = await mdm.get_global_info()
|
74
76
|
if global_info is not None:
|
75
|
-
MIGRATION_COUNT.set(
|
76
|
-
1, labels=dict(type="global", version=str(global_info.current_version))
|
77
|
-
)
|
77
|
+
MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
|
78
78
|
|
79
79
|
version_count: dict[str, int] = {}
|
80
80
|
async for kbid in iter_kbids(context):
|
@@ -88,9 +88,25 @@ async def update_migration_metrics(context: ApplicationContext):
|
|
88
88
|
MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
|
89
89
|
|
90
90
|
|
91
|
-
async def
|
92
|
-
|
93
|
-
|
91
|
+
async def update_resource_metrics(context: ApplicationContext):
|
92
|
+
"""
|
93
|
+
Report the number of pending resources older than some estimated processing time
|
94
|
+
"""
|
95
|
+
driver = get_driver()
|
96
|
+
if not isinstance(driver, PGDriver):
|
97
|
+
return
|
98
|
+
|
99
|
+
async with driver._get_connection() as conn, conn.cursor() as cur:
|
100
|
+
await cur.execute(
|
101
|
+
"SELECT COUNT(*) FROM catalog "
|
102
|
+
"WHERE labels @> '{/n/s/PENDING}' "
|
103
|
+
"AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
|
104
|
+
)
|
105
|
+
count = cast(Tuple[int], await cur.fetchone())[0]
|
106
|
+
PENDING_RESOURCE_COUNT.set(count)
|
107
|
+
|
108
|
+
|
109
|
+
async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
|
94
110
|
"""
|
95
111
|
Run coroutine infinitely, catching exceptions and logging them.
|
96
112
|
It will wait for the interval before running again.
|
@@ -100,9 +116,7 @@ async def run_exporter_task(
|
|
100
116
|
try:
|
101
117
|
await exporter_task(context)
|
102
118
|
except Exception:
|
103
|
-
logger.error(
|
104
|
-
f"Error on exporter task {exporter_task.__name__}", exc_info=True
|
105
|
-
)
|
119
|
+
logger.error(f"Error on exporter task {exporter_task.__name__}", exc_info=True)
|
106
120
|
await asyncio.sleep(interval)
|
107
121
|
except asyncio.CancelledError:
|
108
122
|
pass
|
@@ -114,12 +128,9 @@ async def run_exporter(context: ApplicationContext):
|
|
114
128
|
for export_task, interval in [
|
115
129
|
(update_node_metrics, 10),
|
116
130
|
(update_migration_metrics, 60 * 3),
|
131
|
+
(update_resource_metrics, 60 * 5),
|
117
132
|
]:
|
118
|
-
tasks.append(
|
119
|
-
asyncio.create_task(
|
120
|
-
run_exporter_task(context, export_task, interval=interval)
|
121
|
-
)
|
122
|
-
)
|
133
|
+
tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
|
123
134
|
try:
|
124
135
|
while True:
|
125
136
|
await asyncio.sleep(10)
|
nucliadb/middleware/__init__.py
CHANGED
@@ -39,9 +39,7 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
|
|
39
39
|
exposed_headers.append(PROCESS_TIME_HEADER)
|
40
40
|
response.headers[ACCESS_CONTROL_EXPOSE_HEADER] = ",".join(exposed_headers)
|
41
41
|
|
42
|
-
async def dispatch(
|
43
|
-
self, request: Request, call_next: RequestResponseEndpoint
|
44
|
-
) -> Response:
|
42
|
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
45
43
|
response = None
|
46
44
|
start = time.perf_counter()
|
47
45
|
try:
|
nucliadb/migrator/command.py
CHANGED
@@ -53,9 +53,7 @@ def validate():
|
|
53
53
|
versions = set()
|
54
54
|
for migration in migrations:
|
55
55
|
if migration.version in versions:
|
56
|
-
raise MigrationValidationError(
|
57
|
-
f"Migration {migration.version} is duplicated"
|
58
|
-
)
|
56
|
+
raise MigrationValidationError(f"Migration {migration.version} is duplicated")
|
59
57
|
versions.add(migration.version)
|
60
58
|
|
61
59
|
|
nucliadb/migrator/datamanager.py
CHANGED
@@ -47,17 +47,19 @@ class MigrationsDataManager:
|
|
47
47
|
self.driver = driver
|
48
48
|
|
49
49
|
async def schedule_all_kbs(self, target_version: int) -> None:
|
50
|
+
# Get all kb ids
|
51
|
+
async with self.driver.transaction(read_only=True) as txn:
|
52
|
+
kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
|
53
|
+
# Schedule the migrations
|
50
54
|
async with self.driver.transaction() as txn:
|
51
|
-
|
52
|
-
await txn.set(
|
53
|
-
MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode()
|
54
|
-
)
|
55
|
+
for kbid in kbids:
|
56
|
+
await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
|
55
57
|
await txn.commit()
|
56
58
|
|
57
|
-
async def get_kb_migrations(self
|
59
|
+
async def get_kb_migrations(self) -> list[str]:
|
58
60
|
keys = []
|
59
61
|
async with self.driver.transaction() as txn:
|
60
|
-
async for key in txn.keys(MIGRATIONS_CONTAINER_KEY
|
62
|
+
async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
|
61
63
|
keys.append(key.split("/")[-1])
|
62
64
|
|
63
65
|
return keys
|
@@ -68,7 +70,7 @@ class MigrationsDataManager:
|
|
68
70
|
await txn.commit()
|
69
71
|
|
70
72
|
async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
|
71
|
-
async with self.driver.transaction() as txn:
|
73
|
+
async with self.driver.transaction(read_only=True) as txn:
|
72
74
|
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
73
75
|
if kb_config is None:
|
74
76
|
return None
|
@@ -76,7 +78,7 @@ class MigrationsDataManager:
|
|
76
78
|
|
77
79
|
async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
|
78
80
|
async with self.driver.transaction() as txn:
|
79
|
-
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
|
81
|
+
kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
|
80
82
|
if kb_config is None:
|
81
83
|
raise Exception(f"KB {kbid} does not exist")
|
82
84
|
kb_config.migration_version = current_version
|
@@ -84,15 +86,13 @@ class MigrationsDataManager:
|
|
84
86
|
await txn.commit()
|
85
87
|
|
86
88
|
async def get_global_info(self) -> GlobalInfo:
|
87
|
-
async with self.driver.transaction() as txn:
|
89
|
+
async with self.driver.transaction(read_only=True) as txn:
|
88
90
|
raw_pb = await txn.get(MIGRATION_INFO_KEY)
|
89
91
|
if raw_pb is None:
|
90
92
|
return GlobalInfo(current_version=0, target_version=None)
|
91
93
|
pb = migrations_pb2.MigrationInfo()
|
92
94
|
pb.ParseFromString(raw_pb)
|
93
|
-
return GlobalInfo(
|
94
|
-
current_version=pb.current_version, target_version=pb.target_version
|
95
|
-
)
|
95
|
+
return GlobalInfo(current_version=pb.current_version, target_version=pb.target_version)
|
96
96
|
|
97
97
|
async def update_global_info(
|
98
98
|
self,
|
@@ -101,7 +101,7 @@ class MigrationsDataManager:
|
|
101
101
|
target_version: Union[int, None, _Unset] = _UNSET,
|
102
102
|
) -> None:
|
103
103
|
async with self.driver.transaction() as txn:
|
104
|
-
raw_pb = await txn.get(MIGRATION_INFO_KEY)
|
104
|
+
raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
|
105
105
|
pb = migrations_pb2.MigrationInfo()
|
106
106
|
if raw_pb is not None:
|
107
107
|
pb.ParseFromString(raw_pb)
|