nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/settings.py
CHANGED
@@ -20,14 +20,13 @@
|
|
20
20
|
from enum import Enum
|
21
21
|
from typing import Optional
|
22
22
|
|
23
|
-
from pydantic import
|
23
|
+
from pydantic import Field
|
24
|
+
from pydantic_settings import BaseSettings
|
24
25
|
|
25
26
|
|
26
|
-
class DriverConfig(
|
27
|
-
REDIS = "redis"
|
28
|
-
TIKV = "tikv"
|
27
|
+
class DriverConfig(Enum):
|
29
28
|
PG = "pg"
|
30
|
-
LOCAL = "local"
|
29
|
+
LOCAL = "local" # Not recommended for production
|
31
30
|
NOT_SET = "notset" # setting not provided
|
32
31
|
|
33
32
|
@classmethod
|
@@ -41,19 +40,7 @@ class DriverConfig(str, Enum):
|
|
41
40
|
|
42
41
|
|
43
42
|
class DriverSettings(BaseSettings):
|
44
|
-
driver: DriverConfig = Field(
|
45
|
-
default=DriverConfig.NOT_SET, description="K/V storage driver"
|
46
|
-
)
|
47
|
-
driver_redis_url: Optional[str] = Field(
|
48
|
-
default=None, description="Redis URL. Example: redis://localhost:6379"
|
49
|
-
)
|
50
|
-
driver_tikv_url: Optional[list[str]] = Field(
|
51
|
-
default=None,
|
52
|
-
description=(
|
53
|
-
"TiKV PD (Placement Driver) URLs. The URL to the cluster manager of"
|
54
|
-
"TiKV. Example: '[\"tikv-pd.svc:2379\"]'"
|
55
|
-
),
|
56
|
-
)
|
43
|
+
driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
|
57
44
|
driver_local_url: Optional[str] = Field(
|
58
45
|
default=None,
|
59
46
|
description="Local path to store data on file system. Example: /nucliadb/data/main",
|
@@ -62,13 +49,17 @@ class DriverSettings(BaseSettings):
|
|
62
49
|
default=None,
|
63
50
|
description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.", # noqa
|
64
51
|
)
|
52
|
+
driver_pg_connection_pool_min_size: int = Field(
|
53
|
+
default=10,
|
54
|
+
description="PostgreSQL min pool size. The minimum number of connections to the PostgreSQL server.",
|
55
|
+
)
|
65
56
|
driver_pg_connection_pool_max_size: int = Field(
|
66
57
|
default=20,
|
67
58
|
description="PostgreSQL max pool size. The maximum number of connections to the PostgreSQL server.",
|
68
59
|
)
|
69
|
-
|
70
|
-
default=
|
71
|
-
description="
|
60
|
+
driver_pg_connection_pool_acquire_timeout_ms: int = Field(
|
61
|
+
default=1000,
|
62
|
+
description="PostgreSQL pool acquire timeout in ms. The maximum time to wait until a connection becomes available.",
|
72
63
|
)
|
73
64
|
|
74
65
|
|
@@ -86,7 +77,7 @@ class Settings(DriverSettings):
|
|
86
77
|
total_replicas: int = 1 # number of ingest processor replicas in the cluster
|
87
78
|
nuclia_partitions: int = 50
|
88
79
|
|
89
|
-
max_receive_message_length: int =
|
80
|
+
max_receive_message_length: int = 500 # In MB
|
90
81
|
|
91
82
|
# Search query timeouts
|
92
83
|
relation_search_timeout: float = 10.0
|
nucliadb/ingest/utils.py
CHANGED
@@ -19,9 +19,8 @@
|
|
19
19
|
#
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
23
|
-
|
24
22
|
from nucliadb.common.maindb.utils import setup_driver
|
23
|
+
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
25
24
|
from nucliadb_utils.grpc import get_traced_grpc_channel
|
26
25
|
from nucliadb_utils.settings import nucliadb_settings
|
27
26
|
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
@@ -37,11 +36,9 @@ async def start_ingest(service_name: Optional[str] = None):
|
|
37
36
|
if nucliadb_settings.nucliadb_ingest is not None:
|
38
37
|
# Its distributed lets create a GRPC client
|
39
38
|
# We want Jaeger telemetry enabled
|
40
|
-
channel = get_traced_grpc_channel(
|
41
|
-
nucliadb_settings.nucliadb_ingest, service_name or "ingest"
|
42
|
-
)
|
39
|
+
channel = get_traced_grpc_channel(nucliadb_settings.nucliadb_ingest, service_name or "ingest")
|
43
40
|
set_utility(Utility.CHANNEL, channel)
|
44
|
-
ingest = WriterStub(channel)
|
41
|
+
ingest = WriterStub(channel)
|
45
42
|
set_utility(Utility.INGEST, ingest)
|
46
43
|
else:
|
47
44
|
# Its not distributed create a ingest
|
nucliadb/learning_proxy.py
CHANGED
@@ -20,16 +20,21 @@
|
|
20
20
|
import contextlib
|
21
21
|
import json
|
22
22
|
import logging
|
23
|
+
import os
|
24
|
+
from abc import ABC, abstractmethod
|
23
25
|
from collections.abc import AsyncIterator
|
24
|
-
from enum import Enum
|
26
|
+
from enum import Enum, IntEnum
|
25
27
|
from typing import Any, Optional, Union
|
26
28
|
|
27
29
|
import backoff
|
28
30
|
import httpx
|
29
31
|
from fastapi import Request, Response
|
30
32
|
from fastapi.responses import StreamingResponse
|
31
|
-
from
|
33
|
+
from lru import LRU
|
34
|
+
from pydantic import BaseModel, Field, model_validator
|
35
|
+
from typing_extensions import Self
|
32
36
|
|
37
|
+
from nucliadb_protos import knowledgebox_pb2, utils_pb2
|
33
38
|
from nucliadb_telemetry import errors
|
34
39
|
from nucliadb_utils.settings import is_onprem_nucliadb, nuclia_settings
|
35
40
|
|
@@ -48,48 +53,150 @@ WHITELISTED_HEADERS = {
|
|
48
53
|
}
|
49
54
|
|
50
55
|
|
51
|
-
class LearningService(
|
56
|
+
class LearningService(Enum):
|
52
57
|
CONFIG = "config"
|
53
|
-
COLLECTOR = "collector-api"
|
54
58
|
|
55
59
|
|
60
|
+
class SimilarityFunction(IntEnum):
|
61
|
+
# Keep this in sync with learning config repo
|
62
|
+
# It's an IntEnum to match the protobuf definition
|
63
|
+
DOT = 0
|
64
|
+
COSINE = 1
|
65
|
+
|
66
|
+
|
67
|
+
class SemanticConfig(BaseModel):
|
68
|
+
# Keep this in sync with learning config repo
|
69
|
+
similarity: SimilarityFunction
|
70
|
+
size: int
|
71
|
+
threshold: float
|
72
|
+
matryoshka_dims: list[int] = []
|
73
|
+
|
74
|
+
def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
|
75
|
+
semantic_model = knowledgebox_pb2.SemanticModelMetadata()
|
76
|
+
LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
|
77
|
+
SimilarityFunction.COSINE: utils_pb2.VectorSimilarity.COSINE,
|
78
|
+
SimilarityFunction.DOT: utils_pb2.VectorSimilarity.DOT,
|
79
|
+
}
|
80
|
+
semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[self.similarity]
|
81
|
+
semantic_model.vector_dimension = self.size
|
82
|
+
semantic_model.matryoshka_dimensions.extend(self.matryoshka_dims)
|
83
|
+
return semantic_model
|
84
|
+
|
85
|
+
|
86
|
+
# Subset of learning configuration of nucliadb's interest. Look at
|
87
|
+
# learning_config models for more fields
|
56
88
|
class LearningConfiguration(BaseModel):
|
57
89
|
semantic_model: str
|
90
|
+
# aka similarity function
|
58
91
|
semantic_vector_similarity: str
|
59
|
-
|
60
|
-
|
92
|
+
# aka vector_dimension
|
93
|
+
semantic_vector_size: Optional[int] = None
|
94
|
+
# aka min_score
|
95
|
+
semantic_threshold: Optional[float] = None
|
96
|
+
# List of possible subdivisions of the matryoshka embeddings (if the model
|
97
|
+
# supports it)
|
98
|
+
semantic_matryoshka_dimensions: Optional[list[int]] = Field(
|
99
|
+
default=None, alias="semantic_matryoshka_dims"
|
100
|
+
)
|
101
|
+
|
102
|
+
semantic_models: list[str] = Field(default_factory=list)
|
103
|
+
|
104
|
+
# This is where the config for each semantic model (aka vectorsets) is returned
|
105
|
+
semantic_model_configs: dict[str, SemanticConfig] = Field(default_factory=dict)
|
106
|
+
|
107
|
+
@model_validator(mode="before")
|
108
|
+
@classmethod
|
109
|
+
def maintain_bw_compatibility_with_single_model_configs(cls, data: Any) -> Any:
|
110
|
+
if isinstance(data, dict):
|
111
|
+
if not data.get("semantic_model", None) and len(data.get("semantic_models", [])) > 0:
|
112
|
+
data["semantic_model"] = data["semantic_models"][0]
|
113
|
+
return data
|
114
|
+
|
115
|
+
@model_validator(mode="after")
|
116
|
+
def validate_matryoshka_and_vector_dimension_consistency(self) -> Self:
|
117
|
+
vector_size = self.semantic_vector_size
|
118
|
+
matryoshka_dimensions = self.semantic_matryoshka_dimensions or []
|
119
|
+
if (
|
120
|
+
len(matryoshka_dimensions) > 0
|
121
|
+
and vector_size is not None
|
122
|
+
and vector_size not in matryoshka_dimensions
|
123
|
+
):
|
124
|
+
raise ValueError("Semantic vector size is inconsistent with matryoshka dimensions")
|
125
|
+
return self
|
126
|
+
|
127
|
+
def into_semantic_models_metadata(
|
128
|
+
self,
|
129
|
+
) -> dict[str, knowledgebox_pb2.SemanticModelMetadata]:
|
130
|
+
result = {}
|
131
|
+
for model_name, config in self.semantic_model_configs.items():
|
132
|
+
result[model_name] = config.into_semantic_model_metadata()
|
133
|
+
return result
|
134
|
+
|
135
|
+
def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
|
136
|
+
semantic_model = knowledgebox_pb2.SemanticModelMetadata()
|
137
|
+
|
138
|
+
LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
|
139
|
+
"cosine": utils_pb2.VectorSimilarity.COSINE,
|
140
|
+
"dot": utils_pb2.VectorSimilarity.DOT,
|
141
|
+
}
|
142
|
+
semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[
|
143
|
+
self.semantic_vector_similarity.lower()
|
144
|
+
]
|
145
|
+
|
146
|
+
if self.semantic_vector_size is not None:
|
147
|
+
semantic_model.vector_dimension = self.semantic_vector_size
|
148
|
+
else:
|
149
|
+
logger.warning("Vector dimension not set!")
|
150
|
+
|
151
|
+
if self.semantic_matryoshka_dimensions is not None:
|
152
|
+
semantic_model.matryoshka_dimensions.extend(self.semantic_matryoshka_dimensions)
|
153
|
+
|
154
|
+
return semantic_model
|
155
|
+
|
156
|
+
|
157
|
+
class ProxiedLearningConfigError(Exception):
|
158
|
+
def __init__(self, status_code: int, content: bytes, content_type: str):
|
159
|
+
self.status_code = status_code
|
160
|
+
self.content = content
|
161
|
+
self.content_type = content_type
|
162
|
+
|
163
|
+
|
164
|
+
def raise_for_status(response: httpx.Response) -> None:
|
165
|
+
try:
|
166
|
+
response.raise_for_status()
|
167
|
+
except httpx.HTTPStatusError as err:
|
168
|
+
content_type = err.response.headers.get("Content-Type", "application/json")
|
169
|
+
raise ProxiedLearningConfigError(
|
170
|
+
status_code=err.response.status_code,
|
171
|
+
content=err.response.content,
|
172
|
+
content_type=content_type,
|
173
|
+
)
|
61
174
|
|
62
175
|
|
63
176
|
async def get_configuration(
    kbid: str,
) -> Optional[LearningConfiguration]:
    """Fetch the learning configuration for *kbid* from the active config service.

    The service implementation is selected by ``learning_config_service()``.
    """
    service = learning_config_service()
    return await service.get_configuration(kbid)
|
75
180
|
|
76
181
|
|
77
182
|
async def set_configuration(
    kbid: str,
    config: dict[str, Any],
) -> LearningConfiguration:
    """Store *config* for *kbid* through the active config service.

    Returns the ``LearningConfiguration`` produced by the service.
    """
    service = learning_config_service()
    return await service.set_configuration(kbid, config)
|
187
|
+
|
188
|
+
|
189
|
+
async def update_configuration(
    kbid: str,
    config: dict[str, Any],
) -> None:
    """Apply the *config* changes to the stored configuration of *kbid*.

    Delegates to the service returned by ``learning_config_service()``.
    """
    service = learning_config_service()
    return await service.update_configuration(kbid, config)
|
85
194
|
|
86
195
|
|
87
196
|
async def delete_configuration(
    kbid: str,
) -> None:
    """Delete the learning configuration of *kbid* through the active config service."""
    service = learning_config_service()
    return await service.delete_configuration(kbid)
|
93
200
|
|
94
201
|
|
95
202
|
async def learning_config_proxy(
|
@@ -107,21 +214,6 @@ async def learning_config_proxy(
|
|
107
214
|
)
|
108
215
|
|
109
216
|
|
110
|
-
async def learning_collector_proxy(
|
111
|
-
request: Request,
|
112
|
-
method: str,
|
113
|
-
url: str,
|
114
|
-
extra_headers: Optional[dict[str, str]] = None,
|
115
|
-
) -> Union[Response, StreamingResponse]:
|
116
|
-
return await proxy(
|
117
|
-
service=LearningService.COLLECTOR,
|
118
|
-
request=request,
|
119
|
-
method=method,
|
120
|
-
url=url,
|
121
|
-
extra_headers=extra_headers,
|
122
|
-
)
|
123
|
-
|
124
|
-
|
125
217
|
def is_white_listed_header(header: str) -> bool:
|
126
218
|
return header.lower() in WHITELISTED_HEADERS
|
127
219
|
|
@@ -213,13 +305,9 @@ async def proxy(
|
|
213
305
|
|
214
306
|
def get_base_url(service: LearningService) -> str:
    """Return the base URL used to reach the given learning *service*.

    When ``is_onprem_nucliadb()`` is true the public URL template is used
    (formatted with the configured zone) and suffixed with ``/api/v1``.
    Otherwise the internal service URL template is formatted with the service
    value and suffixed with ``/api/v1/internal``.
    """
    if not is_onprem_nucliadb():
        internal_root = nuclia_settings.learning_internal_svc_base_url.format(service=service.value)
        return f"{internal_root}/api/v1/internal"

    public_root = nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone)
    return f"{public_root}/api/v1"
|
224
312
|
|
225
313
|
|
@@ -248,9 +336,7 @@ async def service_client(
|
|
248
336
|
# This is a workaround to be able to run integration tests that start nucliadb with docker.
|
249
337
|
# The learning APIs are not available in the docker setup, so we use a dummy client.
|
250
338
|
client = DummyClient(base_url=base_url, headers=headers)
|
251
|
-
logger.warning(
|
252
|
-
"Using dummy client. If you see this in production, something is wrong."
|
253
|
-
)
|
339
|
+
logger.warning("Using dummy client. If you see this in production, something is wrong.")
|
254
340
|
else:
|
255
341
|
client = httpx.AsyncClient(base_url=base_url, headers=headers) # type: ignore
|
256
342
|
try:
|
@@ -299,13 +385,31 @@ class DummyClient(httpx.AsyncClient):
|
|
299
385
|
return self._handle_request("DELETE", *args, **kwargs)
|
300
386
|
|
301
387
|
def get_config(self, *args: Any, **kwargs: Any):
    """Return a canned learning configuration response.

    The vector size is 768 when the ``TEST_SENTENCE_ENCODER`` environment
    variable equals ``"multilingual-2023-02-21"`` and 512 otherwise. All
    positional and keyword arguments are ignored.
    """
    if os.environ.get("TEST_SENTENCE_ENCODER") == "multilingual-2023-02-21":
        size = 768
    else:
        size = 512

    multilingual_config = SemanticConfig(
        similarity=SimilarityFunction.COSINE,
        size=size,
        threshold=0,
        matryoshka_dims=[],
    )
    lconfig = LearningConfiguration(
        semantic_model="multilingual",
        semantic_vector_similarity="cosine",
        semantic_vector_size=size,
        semantic_threshold=None,
        semantic_matryoshka_dims=[],
        semantic_model_configs={"multilingual": multilingual_config},
    )
    payload = lconfig.model_dump()
    return self._response(content=payload)
|
405
|
+
|
406
|
+
def post_config(self, *args: Any, **kwargs: Any):
    """Simulate a POST that answers with the created configuration.

    Forwards every argument to ``get_config`` and returns its result.
    """
    created = self.get_config(*args, **kwargs)
    return created
|
409
|
+
|
410
|
+
def patch_config(self, *args: Any, **kwargs: Any):
    """Simulate a PATCH that answers with the updated configuration.

    Forwards every argument to ``get_config`` and returns its result.
    """
    updated = self.get_config(*args, **kwargs)
    return updated
|
309
413
|
|
310
414
|
async def request( # type: ignore
|
311
415
|
self,
|
@@ -315,9 +419,7 @@ class DummyClient(httpx.AsyncClient):
|
|
315
419
|
content=None,
|
316
420
|
headers=None,
|
317
421
|
) -> httpx.Response:
|
318
|
-
return self._handle_request(
|
319
|
-
method, url, params=params, content=content, headers=headers
|
320
|
-
)
|
422
|
+
return self._handle_request(method, url, params=params, content=content, headers=headers)
|
321
423
|
|
322
424
|
def _handle_request(self, *args: Any, **kwargs: Any) -> httpx.Response:
|
323
425
|
"""
|
@@ -331,3 +433,114 @@ class DummyClient(httpx.AsyncClient):
|
|
331
433
|
return getattr(self, method)(*args, **kwargs)
|
332
434
|
else:
|
333
435
|
return self._response()
|
436
|
+
|
437
|
+
|
438
|
+
class LearningConfigService(ABC):
    """Abstract interface for reading and writing per-KB learning configuration."""

    @abstractmethod
    async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
        """Return the configuration stored for *kbid*; the return type allows ``None``."""

    @abstractmethod
    async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
        """Store *config* for *kbid* and return the resulting configuration."""

    @abstractmethod
    async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
        """Apply the changes in *config* to the configuration of *kbid*."""

    @abstractmethod
    async def delete_configuration(self, kbid: str) -> None:
        """Delete the configuration of *kbid*."""
|
450
|
+
|
451
|
+
|
452
|
+
class ProxiedLearningConfig(LearningConfigService):
    """Learning config service that forwards every operation over HTTP.

    Each call opens a short-lived ``httpx.AsyncClient`` (see ``_client``) and
    converts error statuses into ``ProxiedLearningConfigError`` through
    ``raise_for_status``.
    """

    @contextlib.asynccontextmanager
    async def _client(self) -> AsyncIterator[httpx.AsyncClient]:
        """Yield an HTTP client set up with the config service base URL and auth headers."""
        http_client = httpx.AsyncClient(
            base_url=get_base_url(LearningService.CONFIG),
            headers=get_auth_headers(),
        )
        async with http_client as client:
            yield client

    async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
        """Fetch the configuration of *kbid*; a 404 response is reported as ``None``."""
        async with self._client() as http:
            response = await http.get(f"config/{kbid}")
            try:
                raise_for_status(response)
            except ProxiedLearningConfigError as error:
                # Any status other than "not found" is propagated to the caller.
                if error.status_code != 404:
                    raise
                return None
            return LearningConfiguration.model_validate(response.json())

    async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
        """POST *config* for *kbid* and return the configuration parsed from the response."""
        async with self._client() as http:
            response = await http.post(f"config/{kbid}", json=config)
            raise_for_status(response)
            return LearningConfiguration.model_validate(response.json())

    async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
        """PATCH the configuration of *kbid* with *config*."""
        async with self._client() as http:
            response = await http.patch(f"config/{kbid}", json=config)
            raise_for_status(response)

    async def delete_configuration(self, kbid: str) -> None:
        """DELETE the configuration of *kbid*."""
        async with self._client() as http:
            response = await http.delete(f"config/{kbid}")
            raise_for_status(response)
|
488
|
+
|
489
|
+
|
490
|
+
# Module-level store shared by every InMemoryLearningConfig instance, keyed by kbid.
# NOTE(review): LRU(50) presumably caps the store at 50 entries — confirm against the LRU implementation.
_IN_MEMORY_CONFIGS: dict[str, LearningConfiguration] = LRU(50)  # type: ignore
|
492
|
+
|
493
|
+
|
494
|
+
class InMemoryLearningConfig(LearningConfigService):
    """Learning config service that keeps configurations in process memory.

    All instances share the module-level ``_IN_MEMORY_CONFIGS`` store.
    """

    def __init__(self):
        # NOTE(review): no method of this class reads this attribute; the
        # module-level _IN_MEMORY_CONFIGS store is used instead.
        self.in_memory_configs = {}

    @staticmethod
    def _default_configuration() -> LearningConfiguration:
        """Build the configuration used when ``set_configuration`` gets an empty config."""
        model_name = os.environ.get("TEST_SENTENCE_ENCODER", "multilingual")
        if model_name == "multilingual-2023-02-21":
            vector_size = 768
        else:
            vector_size = 512
        model_config = SemanticConfig(
            similarity=SimilarityFunction.COSINE,
            size=vector_size,
            threshold=0,
            matryoshka_dims=[],
        )
        return LearningConfiguration(
            semantic_model=model_name,
            semantic_vector_similarity="cosine",
            semantic_vector_size=vector_size,
            semantic_threshold=None,
            semantic_matryoshka_dims=[],
            semantic_models=[model_name],
            semantic_model_configs={model_name: model_config},
        )

    async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
        """Return the stored configuration of *kbid*, or ``None`` when there is none."""
        stored = _IN_MEMORY_CONFIGS.get(kbid, None)
        return stored

    async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
        """Store a configuration for *kbid* and return it.

        A non-empty *config* is validated into a ``LearningConfiguration``; an
        empty one is replaced by a generated default.
        """
        if config:
            learning_config = LearningConfiguration.model_validate(config)
        else:
            learning_config = self._default_configuration()

        _IN_MEMORY_CONFIGS[kbid] = learning_config
        return learning_config

    async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
        """Replace the stored configuration of *kbid* with a copy updated from *config*.

        Raises:
            ValueError: when no configuration is stored for *kbid*.
        """
        if kbid not in _IN_MEMORY_CONFIGS:
            raise ValueError(f"Configuration for kbid {kbid} not found")
        _IN_MEMORY_CONFIGS[kbid] = _IN_MEMORY_CONFIGS[kbid].model_copy(update=config)

    async def delete_configuration(self, kbid: str) -> None:
        """Drop the stored configuration of *kbid*; a missing entry is ignored."""
        _IN_MEMORY_CONFIGS.pop(kbid, None)
|
540
|
+
|
541
|
+
|
542
|
+
def learning_config_service() -> LearningConfigService:
    """Return a new config service instance chosen from the settings.

    ``InMemoryLearningConfig`` is used when ``nuclia_settings.dummy_learning_services``
    is truthy, ``ProxiedLearningConfig`` otherwise.
    """
    use_dummy = nuclia_settings.dummy_learning_services
    service_cls = InMemoryLearningConfig if use_dummy else ProxiedLearningConfig
    return service_cls()
|
nucliadb/metrics_exporter.py
CHANGED
@@ -20,12 +20,14 @@
|
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
22
|
import asyncio
|
23
|
-
from typing import AsyncGenerator, Callable
|
23
|
+
from typing import AsyncGenerator, Callable, Tuple, cast
|
24
24
|
|
25
25
|
from nucliadb import logger
|
26
26
|
from nucliadb.common import datamanagers
|
27
27
|
from nucliadb.common.cluster import manager as cluster_manager
|
28
28
|
from nucliadb.common.context import ApplicationContext
|
29
|
+
from nucliadb.common.maindb.pg import PGDriver
|
30
|
+
from nucliadb.common.maindb.utils import get_driver
|
29
31
|
from nucliadb.migrator.datamanager import MigrationsDataManager
|
30
32
|
from nucliadb_telemetry import metrics
|
31
33
|
from nucliadb_telemetry.logs import setup_logging
|
@@ -34,9 +36,9 @@ from nucliadb_utils.fastapi.run import serve_metrics
|
|
34
36
|
|
35
37
|
SHARD_COUNT = metrics.Gauge("nucliadb_node_shard_count", labels={"node": ""})
|
36
38
|
|
37
|
-
MIGRATION_COUNT = metrics.Gauge(
|
38
|
-
|
39
|
-
)
|
39
|
+
MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "version": ""})
|
40
|
+
|
41
|
+
PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
|
40
42
|
|
41
43
|
|
42
44
|
async def update_node_metrics(context: ApplicationContext):
|
@@ -57,7 +59,7 @@ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
|
|
57
59
|
"""
|
58
60
|
Return a list of all KB ids.
|
59
61
|
"""
|
60
|
-
async with context.kv_driver.transaction() as txn:
|
62
|
+
async with context.kv_driver.transaction(read_only=True) as txn:
|
61
63
|
async for kbid, _ in datamanagers.kb.get_kbs(txn):
|
62
64
|
yield kbid
|
63
65
|
|
@@ -72,9 +74,7 @@ async def update_migration_metrics(context: ApplicationContext):
|
|
72
74
|
mdm = MigrationsDataManager(context.kv_driver)
|
73
75
|
global_info = await mdm.get_global_info()
|
74
76
|
if global_info is not None:
|
75
|
-
MIGRATION_COUNT.set(
|
76
|
-
1, labels=dict(type="global", version=str(global_info.current_version))
|
77
|
-
)
|
77
|
+
MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
|
78
78
|
|
79
79
|
version_count: dict[str, int] = {}
|
80
80
|
async for kbid in iter_kbids(context):
|
@@ -88,9 +88,25 @@ async def update_migration_metrics(context: ApplicationContext):
|
|
88
88
|
MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
|
89
89
|
|
90
90
|
|
91
|
-
async def
|
92
|
-
|
93
|
-
|
91
|
+
async def update_resource_metrics(context: ApplicationContext):
    """
    Report the number of pending resources older than some estimated processing time

    Counts catalog rows labelled ``/n/s/PENDING`` whose modification (or
    creation) time falls between one month and six hours ago, and publishes the
    count on ``PENDING_RESOURCE_COUNT``. Does nothing unless the driver is a
    ``PGDriver``.
    """
    driver = get_driver()
    if isinstance(driver, PGDriver):
        pending_query = (
            "SELECT COUNT(*) FROM catalog "
            "WHERE labels @> '{/n/s/PENDING}' "
            "AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
        )
        async with driver._get_connection() as conn:
            async with conn.cursor() as cur:
                await cur.execute(pending_query)
                row = await cur.fetchone()
                # The row is cast to a one-element tuple holding the count.
                pending = cast(Tuple[int], row)[0]
                PENDING_RESOURCE_COUNT.set(pending)
|
107
|
+
|
108
|
+
|
109
|
+
async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
|
94
110
|
"""
|
95
111
|
Run coroutine infinitely, catching exceptions and logging them.
|
96
112
|
It will wait for the interval before running again.
|
@@ -100,9 +116,7 @@ async def run_exporter_task(
|
|
100
116
|
try:
|
101
117
|
await exporter_task(context)
|
102
118
|
except Exception:
|
103
|
-
logger.error(
|
104
|
-
f"Error on exporter task {exporter_task.__name__}", exc_info=True
|
105
|
-
)
|
119
|
+
logger.error(f"Error on exporter task {exporter_task.__name__}", exc_info=True)
|
106
120
|
await asyncio.sleep(interval)
|
107
121
|
except asyncio.CancelledError:
|
108
122
|
pass
|
@@ -114,12 +128,9 @@ async def run_exporter(context: ApplicationContext):
|
|
114
128
|
for export_task, interval in [
|
115
129
|
(update_node_metrics, 10),
|
116
130
|
(update_migration_metrics, 60 * 3),
|
131
|
+
(update_resource_metrics, 60 * 5),
|
117
132
|
]:
|
118
|
-
tasks.append(
|
119
|
-
asyncio.create_task(
|
120
|
-
run_exporter_task(context, export_task, interval=interval)
|
121
|
-
)
|
122
|
-
)
|
133
|
+
tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
|
123
134
|
try:
|
124
135
|
while True:
|
125
136
|
await asyncio.sleep(10)
|
nucliadb/middleware/__init__.py
CHANGED
@@ -39,9 +39,7 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
|
|
39
39
|
exposed_headers.append(PROCESS_TIME_HEADER)
|
40
40
|
response.headers[ACCESS_CONTROL_EXPOSE_HEADER] = ",".join(exposed_headers)
|
41
41
|
|
42
|
-
async def dispatch(
|
43
|
-
self, request: Request, call_next: RequestResponseEndpoint
|
44
|
-
) -> Response:
|
42
|
+
async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
|
45
43
|
response = None
|
46
44
|
start = time.perf_counter()
|
47
45
|
try:
|
nucliadb/migrator/command.py
CHANGED
@@ -53,9 +53,7 @@ def validate():
|
|
53
53
|
versions = set()
|
54
54
|
for migration in migrations:
|
55
55
|
if migration.version in versions:
|
56
|
-
raise MigrationValidationError(
|
57
|
-
f"Migration {migration.version} is duplicated"
|
58
|
-
)
|
56
|
+
raise MigrationValidationError(f"Migration {migration.version} is duplicated")
|
59
57
|
versions.add(migration.version)
|
60
58
|
|
61
59
|
|