nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,110 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from typing import AsyncIterator, Optional
|
21
|
+
|
22
|
+
from nucliadb.common.datamanagers.utils import get_kv_pb
|
23
|
+
from nucliadb.common.maindb.driver import Transaction
|
24
|
+
from nucliadb_protos import knowledgebox_pb2
|
25
|
+
|
26
|
+
KB_VECTORSETS = "/kbs/{kbid}/vectorsets"
|
27
|
+
|
28
|
+
|
29
|
+
class BrokenInvariant(Exception):
|
30
|
+
pass
|
31
|
+
|
32
|
+
|
33
|
+
async def initialize(txn: Transaction, *, kbid: str):
|
34
|
+
key = KB_VECTORSETS.format(kbid=kbid)
|
35
|
+
await txn.set(key, knowledgebox_pb2.KnowledgeBoxVectorSetsConfig().SerializeToString())
|
36
|
+
|
37
|
+
|
38
|
+
async def get(
|
39
|
+
txn: Transaction, *, kbid: str, vectorset_id: str
|
40
|
+
) -> Optional[knowledgebox_pb2.VectorSetConfig]:
|
41
|
+
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
|
42
|
+
index = _find_vectorset(kb_vectorsets, vectorset_id)
|
43
|
+
if index is None:
|
44
|
+
return None
|
45
|
+
return kb_vectorsets.vectorsets[index]
|
46
|
+
|
47
|
+
|
48
|
+
async def exists(txn, *, kbid: str, vectorset_id: str) -> bool:
|
49
|
+
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
|
50
|
+
return _find_vectorset(kb_vectorsets, vectorset_id) is not None
|
51
|
+
|
52
|
+
|
53
|
+
async def iter(
|
54
|
+
txn: Transaction, *, kbid: str
|
55
|
+
) -> AsyncIterator[tuple[str, knowledgebox_pb2.VectorSetConfig]]:
|
56
|
+
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
|
57
|
+
for config in kb_vectorsets.vectorsets:
|
58
|
+
yield config.vectorset_id, config
|
59
|
+
|
60
|
+
|
61
|
+
async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
|
62
|
+
"""Create or update a vectorset configuration"""
|
63
|
+
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
|
64
|
+
index = _find_vectorset(kb_vectorsets, config.vectorset_id)
|
65
|
+
if index is None:
|
66
|
+
# adding a new vectorset
|
67
|
+
kb_vectorsets.vectorsets.append(config)
|
68
|
+
else:
|
69
|
+
# updating a vectorset
|
70
|
+
kb_vectorsets.vectorsets[index].CopyFrom(config)
|
71
|
+
|
72
|
+
key = KB_VECTORSETS.format(kbid=kbid)
|
73
|
+
await txn.set(key, kb_vectorsets.SerializeToString())
|
74
|
+
|
75
|
+
|
76
|
+
async def delete(txn: Transaction, *, kbid: str, vectorset_id: str):
|
77
|
+
kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
|
78
|
+
index = _find_vectorset(kb_vectorsets, vectorset_id)
|
79
|
+
if index is None:
|
80
|
+
# already deleted
|
81
|
+
return
|
82
|
+
|
83
|
+
del kb_vectorsets.vectorsets[index]
|
84
|
+
key = KB_VECTORSETS.format(kbid=kbid)
|
85
|
+
await txn.set(key, kb_vectorsets.SerializeToString())
|
86
|
+
|
87
|
+
|
88
|
+
# XXX At some point in the vectorset epic, we should make this key mandatory and
|
89
|
+
# fail instead of providing a default
|
90
|
+
async def _get_or_default(
|
91
|
+
txn: Transaction,
|
92
|
+
*,
|
93
|
+
kbid: str,
|
94
|
+
for_update: bool = True,
|
95
|
+
) -> knowledgebox_pb2.KnowledgeBoxVectorSetsConfig:
|
96
|
+
key = KB_VECTORSETS.format(kbid=kbid)
|
97
|
+
stored = await get_kv_pb(
|
98
|
+
txn, key, knowledgebox_pb2.KnowledgeBoxVectorSetsConfig, for_update=for_update
|
99
|
+
)
|
100
|
+
return stored or knowledgebox_pb2.KnowledgeBoxVectorSetsConfig()
|
101
|
+
|
102
|
+
|
103
|
+
def _find_vectorset(
|
104
|
+
kb_vectorsets: knowledgebox_pb2.KnowledgeBoxVectorSetsConfig, vectorset_id: str
|
105
|
+
) -> Optional[int]:
|
106
|
+
"""Return the position of the vectorset in `vectorsets` or `None` if not found."""
|
107
|
+
for idx, vectorset in enumerate(kb_vectorsets.vectorsets):
|
108
|
+
if vectorset.vectorset_id == vectorset_id:
|
109
|
+
return idx
|
110
|
+
return None
|
@@ -0,0 +1,257 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
import abc
|
21
|
+
import logging
|
22
|
+
from dataclasses import dataclass
|
23
|
+
from typing import Any, Iterator, Optional
|
24
|
+
|
25
|
+
from pydantic import BaseModel
|
26
|
+
|
27
|
+
from nucliadb.common.counters import IndexCounts
|
28
|
+
from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
|
29
|
+
from nucliadb.common.ids import ParagraphId
|
30
|
+
from nucliadb_models.external_index_providers import ExternalIndexProviderType
|
31
|
+
from nucliadb_models.search import SCORE_TYPE, TextPosition
|
32
|
+
from nucliadb_protos.knowledgebox_pb2 import (
|
33
|
+
CreateExternalIndexProviderMetadata,
|
34
|
+
StoredExternalIndexProviderMetadata,
|
35
|
+
)
|
36
|
+
from nucliadb_protos.nodereader_pb2 import SearchRequest
|
37
|
+
from nucliadb_protos.noderesources_pb2 import Resource
|
38
|
+
from nucliadb_protos.utils_pb2 import VectorSimilarity
|
39
|
+
from nucliadb_telemetry.metrics import Observer
|
40
|
+
|
41
|
+
logger = logging.getLogger(__name__)
|
42
|
+
|
43
|
+
manager_observer = Observer("external_index_manager", labels={"operation": "", "provider": ""})
|
44
|
+
|
45
|
+
|
46
|
+
@dataclass
|
47
|
+
class VectorsetExternalIndex:
|
48
|
+
"""
|
49
|
+
Used to indicate to external index managers the required metadata
|
50
|
+
in order to create an external index for each vectorset
|
51
|
+
"""
|
52
|
+
|
53
|
+
vectorset_id: str
|
54
|
+
dimension: int
|
55
|
+
similarity: VectorSimilarity.ValueType
|
56
|
+
|
57
|
+
|
58
|
+
class TextBlockMatch(BaseModel):
|
59
|
+
"""
|
60
|
+
Model a text block/paragraph retrieved from an external index with all the information
|
61
|
+
needed in order to later hydrate retrieval results.
|
62
|
+
"""
|
63
|
+
|
64
|
+
paragraph_id: ParagraphId
|
65
|
+
position: TextPosition
|
66
|
+
score: float
|
67
|
+
score_type: SCORE_TYPE
|
68
|
+
order: int
|
69
|
+
page_with_visual: bool = False
|
70
|
+
fuzzy_search: bool
|
71
|
+
is_a_table: bool = False
|
72
|
+
representation_file: Optional[str] = None
|
73
|
+
paragraph_labels: list[str] = []
|
74
|
+
field_labels: list[str] = []
|
75
|
+
text: Optional[str] = None
|
76
|
+
|
77
|
+
|
78
|
+
class QueryResults(BaseModel):
|
79
|
+
"""
|
80
|
+
Model for the results of a query to an external index provider.
|
81
|
+
Must be subclassed by the specific external index provider.
|
82
|
+
"""
|
83
|
+
|
84
|
+
type: ExternalIndexProviderType
|
85
|
+
results: Any
|
86
|
+
|
87
|
+
def iter_matching_text_blocks(self) -> Iterator[TextBlockMatch]:
|
88
|
+
"""
|
89
|
+
Iterates over the paragraphs in the results, by decreasing score.
|
90
|
+
This should be implemented by the specific external index provider.
|
91
|
+
"""
|
92
|
+
raise NotImplementedError()
|
93
|
+
|
94
|
+
|
95
|
+
class ExternalIndexManager(abc.ABC):
    """
    Base class for the external index providers. Must be subclassed by the specific external index provider.

    The public coroutines (`delete_resource`, `index_resource`, `get_index_counts` and `query`)
    take care of logging and of wrapping the call in `manager_observer`, and then delegate
    to the underscore-prefixed abstract hooks that every provider has to implement.
    """

    # Provider identifier; its `.value` is used as the "provider" label in logs and observer labels.
    type: ExternalIndexProviderType
    # Providers that can index into rollover indexes must set this to True.
    supports_rollover: bool = False

    def __init__(self, kbid: str):
        self.kbid = kbid

    @classmethod
    @abc.abstractmethod
    async def create_indexes(
        cls,
        kbid: str,
        create_request: CreateExternalIndexProviderMetadata,
        indexes: list[VectorsetExternalIndex],
    ) -> StoredExternalIndexProviderMetadata:
        """
        Provider-specific creation of the indexes of a knowledge box.
        Returns the external index provider metadata to be stored.
        """
        ...

    @classmethod
    @abc.abstractmethod
    async def delete_indexes(
        cls,
        kbid: str,
        stored: StoredExternalIndexProviderMetadata,
    ) -> None:
        """
        Provider-specific deletion of the indexes described by the stored metadata.
        """
        ...

    @abc.abstractmethod
    async def rollover_create_indexes(
        self, stored: StoredExternalIndexProviderMetadata
    ) -> StoredExternalIndexProviderMetadata:  # pragma: no cover
        """
        Creates the indexes for the rollover process.
        In the event of an error, it should rollback any left over indexes.
        Returns a modified version of the stored external index provider metadata with the new indexes for the rollover.
        """
        ...

    @abc.abstractmethod
    async def rollover_cutover_indexes(self) -> None:  # pragma: no cover
        """
        Cutover the indexes for the rollover process.
        After this operation, the new indexes should be used for queries and the old ones should be deleted.
        """
        ...

    @classmethod
    def get_index_name(cls) -> str:  # pragma: no cover
        """
        Returns the name of the index in the external index provider.

        Not abstract: the base implementation raises NotImplementedError, so
        providers only need to override it if they use it.
        """
        raise NotImplementedError()

    async def delete_resource(self, resource_uuid: str) -> None:
        """
        Deletes a resource from the external index provider.

        Logs the operation and delegates to `_delete_resource`. Errors raised by
        the provider propagate unchanged to the caller.
        """
        logger.info(
            "Deleting resource from external index",
            extra={
                "kbid": self.kbid,
                "rid": resource_uuid,
                "provider": self.type.value,
            },
        )
        with manager_observer({"operation": "delete_resource", "provider": self.type.value}):
            await self._delete_resource(resource_uuid)

    async def index_resource(
        self, resource_uuid: str, resource_data: Resource, to_rollover_indexes: bool = False
    ) -> None:
        """
        Indexes a resource to the external index provider.

        Params:
        - resource_uuid: the resource's UUID
        - resource_data: the resource index data
        - to_rollover_indexes: whether to index to the rollover indexes or the main indexes

        Raises:
        - ExternalIndexingError: if the provider fails to index the resource. The
          original exception is chained as the cause.
        """
        if not self.supports_rollover and to_rollover_indexes:
            # Nothing to do: this provider has no rollover indexes to write to.
            logger.info(
                "Indexing to rollover indexes not supported",
                extra={
                    "kbid": self.kbid,
                    "rid": resource_uuid,
                    "provider": self.type.value,
                },
            )
            return
        logger.info(
            "Indexing resource to external index",
            extra={
                "kbid": self.kbid,
                "rid": resource_uuid,
                "provider": self.type.value,
                "rollover": to_rollover_indexes,
            },
        )
        with manager_observer({"operation": "index_resource", "provider": self.type.value}):
            try:
                await self._index_resource(
                    resource_uuid, resource_data, to_rollover_indexes=to_rollover_indexes
                )
            except Exception as ex:
                # Callers get a single exception type regardless of the provider's own errors.
                raise ExternalIndexingError() from ex

    async def get_index_counts(self) -> IndexCounts:
        """
        Returns the index counts for the external index provider.
        """
        logger.debug(
            "Getting index counts from external index",
            extra={
                "kbid": self.kbid,
                "provider": self.type.value,
            },
        )
        with manager_observer({"operation": "get_index_counts", "provider": self.type.value}):
            return await self._get_index_counts()

    async def query(self, request: SearchRequest) -> QueryResults:
        """
        Queries the external index provider and returns the results.
        """
        logger.info(
            "Querying external index",
            extra={
                "kbid": self.kbid,
                "provider": self.type.value,
            },
        )
        with manager_observer({"operation": "query", "provider": self.type.value}):
            return await self._query(request)

    @abc.abstractmethod
    async def _delete_resource(self, resource_uuid: str) -> None:  # pragma: no cover
        """
        Makes sure that all vectors associated with the resource are deleted from the external index provider.
        """
        ...

    @abc.abstractmethod
    async def _index_resource(
        self, resource_uuid: str, resource_data: Resource, to_rollover_indexes: bool = False
    ) -> None:  # pragma: no cover
        """
        Adapts the Resource (aka brain) to the external index provider's index format and indexes it.
        Params:
        - resource_uuid: the resource's UUID
        - resource_data: the resource index data
        - to_rollover_indexes: whether to index to the rollover indexes or the main indexes
        """
        ...

    @abc.abstractmethod
    async def _query(self, request: SearchRequest) -> QueryResults:  # pragma: no cover
        """
        Adapts the Nucliadb's search request to the external index provider's query format and returns the results.
        """
        ...

    @abc.abstractmethod
    async def _get_index_counts(self) -> IndexCounts:  # pragma: no cover
        """
        Returns the index counts for the external index provider.
        """
        ...
|
@@ -17,15 +17,16 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from nucliadb.common.cluster.index_node import READ_CONNECTIONS, WRITE_CONNECTIONS
|
21
|
-
from nucliadb.ingest.cache import clear_ingest_cache
|
22
20
|
|
23
21
|
|
24
|
-
|
25
|
-
|
26
|
-
|
22
|
+
class ExternalIndexCreationError(Exception):
    """
    Index creation error reported for an external index provider.

    The provider name and the raw message are kept as attributes, while the
    exception text combines both.
    """

    def __init__(self, provider: str, message: str):
        super().__init__(f"{provider} index creation error: {message}")
        self.provider = provider
        self.message = message
|
27
27
|
|
28
|
-
clear_ingest_cache()
|
29
28
|
|
30
|
-
|
31
|
-
|
29
|
+
class ExternalIndexingError(Exception):
    """
    Signals that indexing a resource in an external index failed.
    """
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from typing import Optional
|
21
|
+
|
22
|
+
import async_lru
|
23
|
+
|
24
|
+
from nucliadb.common import datamanagers
|
25
|
+
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
26
|
+
from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
|
27
|
+
from nucliadb.common.external_index_providers.settings import settings
|
28
|
+
from nucliadb_protos.knowledgebox_pb2 import (
|
29
|
+
ExternalIndexProviderType,
|
30
|
+
StoredExternalIndexProviderMetadata,
|
31
|
+
)
|
32
|
+
from nucliadb_utils.utilities import get_endecryptor
|
33
|
+
|
34
|
+
|
35
|
+
async def get_external_index_manager(
    kbid: str, for_rollover: bool = False
) -> Optional[ExternalIndexManager]:
    """
    Build the ExternalIndexManager of a knowledge box.

    Returns None when the kb has no external index provider metadata or when
    the provider is not Pinecone. If for_rollover is True, the returned manager
    also carries the rollover indexes (if any are stored for the kb).
    """
    stored = await get_external_index_metadata(kbid)
    if stored is None:
        return None
    if stored.type != ExternalIndexProviderType.PINECONE:
        # Pinecone is the only provider supported for now
        return None

    endecryptor = get_endecryptor()
    api_key = endecryptor.decrypt(stored.pinecone_config.encrypted_api_key)
    default_vectorset = await get_default_vectorset_id(kbid)

    rollover_indexes = None
    if for_rollover:
        rollover_stored = await get_rollover_external_index_metadata(kbid)
        if rollover_stored is not None:
            rollover_indexes = dict(rollover_stored.pinecone_config.indexes)

    manager = PineconeIndexManager(
        kbid=kbid,
        api_key=api_key,
        indexes=dict(stored.pinecone_config.indexes),
        upsert_parallelism=settings.pinecone_upsert_parallelism,
        delete_parallelism=settings.pinecone_delete_parallelism,
        upsert_timeout=settings.pinecone_upsert_timeout,
        delete_timeout=settings.pinecone_delete_timeout,
        default_vectorset=default_vectorset,
        rollover_indexes=rollover_indexes,
    )
    return manager
|
67
|
+
|
68
|
+
|
69
|
+
@async_lru.alru_cache(maxsize=None)
async def get_external_index_metadata(kbid: str) -> Optional[StoredExternalIndexProviderMetadata]:
    """
    Fetch the stored external index provider metadata of a knowledge box.

    Results are cached per kbid by the `alru_cache` decorator, with no size limit.
    """
    stored = await datamanagers.atomic.kb.get_external_index_provider_metadata(kbid=kbid)
    return stored
|
72
|
+
|
73
|
+
|
74
|
+
@async_lru.alru_cache(maxsize=None)
async def get_default_vectorset_id(kbid: str) -> Optional[str]:
    """
    Resolve the id of the default vectorset of a knowledge box, if there is one.

    While we are transitioning to the new vectorset system, KBs that have only one
    semantic model will have the `vectorset_id` field on BrokerMessage.field_vectors
    set to empty string -- that is the `default` vectorset concept.

    Returns "__default__" when the kb has no vectorsets stored, the vectorset id
    when it has exactly one, and None when it has several.
    """
    async with datamanagers.with_ro_transaction() as txn:
        vectorset_ids = [
            vs_id async for vs_id, _ in datamanagers.vectorsets.iter(txn, kbid=kbid)
        ]
        if not vectorset_ids:
            # Nothing under the vectorsets key on maindb: fall back to the "__default__" id.
            return "__default__"
        if len(vectorset_ids) > 1:
            # With multiple vectorsets there is no default: index messages
            # are assumed to be explicit about the vectorset.
            return None
        # A single vectorset is, by definition, the default one
        return vectorset_ids[0]
|
95
|
+
|
96
|
+
|
97
|
+
async def get_rollover_external_index_metadata(
    kbid: str,
) -> Optional[StoredExternalIndexProviderMetadata]:
    """
    Read, within a read-only transaction, the external index provider metadata
    stored for the rollover of the given knowledge box.
    """
    async with datamanagers.with_ro_transaction() as txn:
        rollover_stored = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
            txn, kbid=kbid
        )
        return rollover_stored
|