nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -17,22 +17,27 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from unittest.mock import Mock
|
21
20
|
|
22
|
-
|
21
|
+
"""Migration #22
|
23
22
|
|
24
|
-
|
25
|
-
|
23
|
+
There was a bug while ingesting/indexing that made paragraphs not being properly
|
24
|
+
removed in some cases. This rollover migration ensures data is consistently
|
25
|
+
indexed.
|
26
26
|
|
27
|
+
"""
|
27
28
|
|
28
|
-
|
29
|
-
return Mount(path=path, app=Mock())
|
29
|
+
import logging
|
30
30
|
|
31
|
+
from nucliadb.migrator.context import ExecutionContext
|
31
32
|
|
32
|
-
|
33
|
-
assert is_versioned_route(get_route(path="/api/v1/search"))
|
34
|
-
assert not is_versioned_route(get_route(path="/metrics"))
|
33
|
+
logger = logging.getLogger(__name__)
|
35
34
|
|
36
35
|
|
37
|
-
def
|
38
|
-
|
36
|
+
async def migrate(context: ExecutionContext) -> None: ...
|
37
|
+
|
38
|
+
|
39
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
40
|
+
"""
|
41
|
+
We only need 1 rollover migration defined at a time; otherwise, we will
|
42
|
+
possibly run many for a kb when we only ever need to run one
|
43
|
+
"""
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
"""Migration #23
|
22
|
+
|
23
|
+
Backfill the data into the PG catalog
|
24
|
+
|
25
|
+
"""
|
26
|
+
|
27
|
+
import logging
|
28
|
+
from typing import cast
|
29
|
+
|
30
|
+
from nucliadb.common import datamanagers
|
31
|
+
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
32
|
+
from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
|
33
|
+
from nucliadb.migrator.context import ExecutionContext
|
34
|
+
|
35
|
+
logger = logging.getLogger(__name__)
|
36
|
+
|
37
|
+
|
38
|
+
async def migrate(context: ExecutionContext) -> None: ...
|
39
|
+
|
40
|
+
|
41
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
42
|
+
if not isinstance(context.kv_driver, PGDriver):
|
43
|
+
return
|
44
|
+
|
45
|
+
BATCH_SIZE = 100
|
46
|
+
async with context.kv_driver.transaction() as txn:
|
47
|
+
txn = cast(PGTransaction, txn)
|
48
|
+
continue_sql = ""
|
49
|
+
while True:
|
50
|
+
async with txn.connection.cursor() as cur:
|
51
|
+
# Get list of resources except those already in the catalog
|
52
|
+
await cur.execute(
|
53
|
+
f"""
|
54
|
+
SELECT SPLIT_PART(key, '/', 5)::UUID FROM resources
|
55
|
+
LEFT JOIN catalog ON kbid = %s AND SPLIT_PART(key, '/', 5)::UUID = rid
|
56
|
+
WHERE key SIMILAR TO %s
|
57
|
+
AND rid IS NULL
|
58
|
+
{continue_sql}
|
59
|
+
ORDER BY key
|
60
|
+
LIMIT %s
|
61
|
+
""",
|
62
|
+
(kbid, f"/kbs/{kbid}/r/[a-f0-9]*", BATCH_SIZE),
|
63
|
+
)
|
64
|
+
resources_to_index = [r[0] for r in await cur.fetchall()]
|
65
|
+
if len(resources_to_index) == 0:
|
66
|
+
return
|
67
|
+
|
68
|
+
# Index each resource
|
69
|
+
for rid in resources_to_index:
|
70
|
+
rid = str(rid).replace("-", "")
|
71
|
+
resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
|
72
|
+
if resource is None:
|
73
|
+
logger.warning(f"Could not load resource {rid} for kbid {kbid}")
|
74
|
+
continue
|
75
|
+
|
76
|
+
await resource.compute_global_tags(resource.indexer)
|
77
|
+
await pgcatalog_update(txn, kbid, resource)
|
78
|
+
|
79
|
+
await txn.commit()
|
80
|
+
continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
|
@@ -0,0 +1,113 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
"""Migration #25 (Fixed migration 24)
|
22
|
+
|
23
|
+
Vectorsets are coming and we need to be ready at nucliadb. Vector index config
|
24
|
+
shouldn't be stored anymore in the `Shards` protobuffer, we need to migrate to
|
25
|
+
the new vectorsets config.
|
26
|
+
|
27
|
+
This migration asks learning_config for each KB configuration and saves the
|
28
|
+
model name as the vectorset_id. Creates a vectorset configuration for each model
|
29
|
+
and deprecates the vectors index config from the `Shards` protobuffer.
|
30
|
+
|
31
|
+
This migration should work for onprem and hosted deployments, as
|
32
|
+
learning_proxy handles which API is used (internal or external)
|
33
|
+
|
34
|
+
"""
|
35
|
+
|
36
|
+
import logging
|
37
|
+
|
38
|
+
from nucliadb import learning_proxy
|
39
|
+
from nucliadb.common import datamanagers
|
40
|
+
from nucliadb.migrator.context import ExecutionContext
|
41
|
+
from nucliadb_protos import (
|
42
|
+
knowledgebox_pb2,
|
43
|
+
nodewriter_pb2,
|
44
|
+
)
|
45
|
+
|
46
|
+
logger = logging.getLogger(__name__)
|
47
|
+
|
48
|
+
|
49
|
+
async def migrate(context: ExecutionContext) -> None: ...
|
50
|
+
|
51
|
+
|
52
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
53
|
+
async with context.kv_driver.transaction(read_only=True) as txn:
|
54
|
+
vectorsets_count = len([vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)])
|
55
|
+
if vectorsets_count > 0:
|
56
|
+
logger.info("Skipping KB with vectorsets already populated", extra={"kbid": kbid})
|
57
|
+
return
|
58
|
+
|
59
|
+
learning_config = await learning_proxy.get_configuration(kbid)
|
60
|
+
if learning_config is None:
|
61
|
+
logger.warning(f"KB has no learning config", extra={"kbid": kbid})
|
62
|
+
return None
|
63
|
+
|
64
|
+
vectorset_id = learning_config.semantic_model
|
65
|
+
learning_model_metadata = learning_config.into_semantic_model_metadata()
|
66
|
+
learning_similarity = learning_model_metadata.similarity_function
|
67
|
+
learning_vector_dimension = learning_model_metadata.vector_dimension
|
68
|
+
learning_matryoshka_dimensions = learning_model_metadata.matryoshka_dimensions
|
69
|
+
learning_normalize_vectors = len(learning_matryoshka_dimensions) > 0
|
70
|
+
|
71
|
+
async with context.kv_driver.transaction(read_only=True) as txn:
|
72
|
+
semantic_model = await datamanagers.kb.get_model_metadata(txn, kbid=kbid)
|
73
|
+
|
74
|
+
maindb_similarity = semantic_model.similarity_function
|
75
|
+
|
76
|
+
maindb_vector_dimension = None
|
77
|
+
if semantic_model.vector_dimension:
|
78
|
+
maindb_vector_dimension = semantic_model.vector_dimension
|
79
|
+
|
80
|
+
maindb_matryoshka_dimensions: list[int] = []
|
81
|
+
if len(semantic_model.matryoshka_dimensions) > 0:
|
82
|
+
maindb_matryoshka_dimensions.extend(semantic_model.matryoshka_dimensions)
|
83
|
+
|
84
|
+
maindb_normalize_vectors = len(maindb_matryoshka_dimensions) > 0
|
85
|
+
|
86
|
+
if (
|
87
|
+
maindb_similarity != learning_similarity
|
88
|
+
or (maindb_vector_dimension is not None and maindb_vector_dimension != learning_vector_dimension)
|
89
|
+
or set(maindb_matryoshka_dimensions) != set(learning_matryoshka_dimensions)
|
90
|
+
or maindb_normalize_vectors != learning_normalize_vectors
|
91
|
+
):
|
92
|
+
logger.error(
|
93
|
+
"KB has mismatched data between nucliadb and learning_config! Please, review manually",
|
94
|
+
extra={"kbid": kbid},
|
95
|
+
)
|
96
|
+
return None
|
97
|
+
|
98
|
+
default_vectorset = knowledgebox_pb2.VectorSetConfig(
|
99
|
+
vectorset_id=vectorset_id,
|
100
|
+
vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
|
101
|
+
vector_dimension=maindb_vector_dimension,
|
102
|
+
similarity=maindb_similarity,
|
103
|
+
vector_type=nodewriter_pb2.VectorType.DENSE_F32, # we only support this for now
|
104
|
+
normalize_vectors=maindb_normalize_vectors,
|
105
|
+
),
|
106
|
+
matryoshka_dimensions=maindb_matryoshka_dimensions,
|
107
|
+
)
|
108
|
+
|
109
|
+
async with context.kv_driver.transaction() as txn:
|
110
|
+
# Populate KB vectorsets with data from learning. We are skipping KBs
|
111
|
+
# with this key already set, so we can set here safely
|
112
|
+
await datamanagers.vectorsets.set(txn, kbid=kbid, config=default_vectorset)
|
113
|
+
await txn.commit()
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
"""Migration #26
|
22
|
+
|
23
|
+
Previously, there was no validation on content types added by users on upload. As a result, in some KBs,
|
24
|
+
there were content types that included random uuids, which caused high cardinality in the content type field.
|
25
|
+
|
26
|
+
This migration will fix those invalid content types.
|
27
|
+
"""
|
28
|
+
|
29
|
+
import logging
|
30
|
+
|
31
|
+
from nucliadb.common import datamanagers
|
32
|
+
from nucliadb.migrator.context import ExecutionContext
|
33
|
+
|
34
|
+
logger = logging.getLogger(__name__)
|
35
|
+
|
36
|
+
|
37
|
+
AFFECTED_KBS = [
|
38
|
+
"78d289e0-dd4d-448c-84b5-8ef0b01a5aba",
|
39
|
+
]
|
40
|
+
|
41
|
+
|
42
|
+
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally does nothing.

    This migration only acts per knowledge box, in ``migrate_kb``.
    """
|
43
|
+
|
44
|
+
|
45
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Normalize high-cardinality multipart content types for an affected KB.

    KBs not listed in ``AFFECTED_KBS`` are left untouched. For every resource
    of an affected KB, the ``icon`` of its basic metadata is reset to
    ``"multipart/form-data"`` when it contains both ``"multipart/form-data"``
    and ``"boundary="``.

    Args:
        context: migration execution context (not used by this function).
        kbid: id of the knowledge box to process.
    """
    if kbid not in AFFECTED_KBS:
        return

    async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
        # One read-write transaction per resource; it is committed only when
        # the resource is actually rewritten.
        async with datamanagers.with_rw_transaction() as txn:
            basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
            if not basic or not basic.icon:
                continue

            # We're aiming to fix content types like "multipart/form-data; boundary={uuid}"
            has_random_boundary = "multipart/form-data" in basic.icon and "boundary=" in basic.icon
            if not has_random_boundary:
                continue

            logger.info("Fixing content type for resource", extra={"kbid": kbid, "rid": rid})
            basic.icon = "multipart/form-data"
            await datamanagers.resources.set_basic(txn, kbid=kbid, rid=rid, basic=basic)
            await txn.commit()
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
"""Migration #27
|
22
|
+
|
23
|
+
Rollover for nucliadb_texts3
|
24
|
+
"""
|
25
|
+
|
26
|
+
import logging
|
27
|
+
|
28
|
+
from nucliadb import learning_proxy
|
29
|
+
from nucliadb.common import datamanagers
|
30
|
+
from nucliadb.common.cluster.rollover import rollover_kb_index
|
31
|
+
from nucliadb.migrator.context import ExecutionContext
|
32
|
+
|
33
|
+
logger = logging.getLogger(__name__)
|
34
|
+
|
35
|
+
|
36
|
+
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally does nothing.

    This migration only acts per knowledge box, in ``migrate_kb``.
    """
|
37
|
+
|
38
|
+
|
39
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Run the per-KB steps of this migration, one after the other.

    The vectorset dimension check/fix is awaited to completion before the
    index rollover of the KB is started.

    Args:
        context: migration execution context, forwarded to every step.
        kbid: id of the knowledge box to migrate.
    """
    ordered_steps = (maybe_fix_vector_dimensions, rollover_kb_index)
    for step in ordered_steps:
        await step(context, kbid)
|
42
|
+
|
43
|
+
|
44
|
+
async def maybe_fix_vector_dimensions(context: ExecutionContext, kbid: str) -> None:
    """Backfill a missing vector dimension on the KB's only vectorset.

    The function does nothing when:
      * the KB has no learning configuration,
      * the KB does not have exactly one stored vectorset, or
      * the stored vectorset already has a non-zero ``vector_dimension``.

    Otherwise the dimension is copied from the semantic model metadata derived
    from the learning configuration, and the vectorset config is written back
    and committed.

    Args:
        context: migration execution context; its ``kv_driver`` provides the
            transaction used to read and write the vectorset.
        kbid: id of the knowledge box to check.
    """
    learning_config = await learning_proxy.get_configuration(kbid)
    if learning_config is None:
        logger.warning("KB has no learning config", extra={"kbid": kbid})
        return

    async with context.kv_driver.transaction() as txn:
        vectorsets = [vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)]
        if len(vectorsets) != 1:
            # If multiple vectorsets, they are new shards created correctly, we can safely skip it
            logger.warning(f"KB has {len(vectorsets)} vectorsets, skipping...", extra={"kbid": kbid})
            return
        # Items are indexed as pairs; the config is taken from position 1
        # (presumably (vectorset_id, config) -- confirm against vectorsets.iter).
        vectorset = vectorsets[0][1]

        # Correct value, skip
        if vectorset.vectorset_index_config.vector_dimension != 0:
            return

        learning_model_metadata = learning_config.into_semantic_model_metadata()
        logger.info(
            "Fixing KB vectorset dimension",
            extra={
                "kbid": kbid,
                "from": vectorset.vectorset_index_config.vector_dimension,
                "to": learning_model_metadata.vector_dimension,
            },
        )
        vectorset.vectorset_index_config.vector_dimension = learning_model_metadata.vector_dimension

        await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset)
        # The write must be committed explicitly, as the other migrations do
        # after writing; without it the fix is not persisted.
        await txn.commit()
|
@@ -17,18 +17,16 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from nucliadb_protos.resources_pb2 import FieldDatetime
|
21
20
|
|
22
|
-
from nucliadb.
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
23
22
|
|
24
23
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
return await self.db_get_value()
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Create the ``resources`` table (text ``key`` primary key, ``BYTEA`` value).

    Args:
        txn: transaction whose connection is used to run the statement.
    """
    # IF NOT EXISTS just for compatibility with older install predating the migration system
    create_resources_table = """
        CREATE TABLE IF NOT EXISTS resources (
            key TEXT PRIMARY KEY,
            value BYTEA
        );
    """
    async with txn.connection.cursor() as cursor:
        await cursor.execute(create_resources_table)
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
22
|
+
|
23
|
+
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Create the ``catalog`` table, the extensions it relies on and its indexes.

    All statements are sent in a single ``execute`` call:
      * the ``uuid-ossp`` and ``btree_gin`` extensions are created if missing,
      * ``catalog`` is created with ``(kbid, rid)`` as primary key,
      * GIN indexes are added on ``(kbid, labels)`` and on ``kbid`` plus the
        lower-cased title split on non-word characters,
      * plain indexes are added on ``(kbid, created_at)`` and ``(kbid, modified_at)``.

    Args:
        txn: transaction whose connection is used to run the statements.
    """
    # Raw string so that the regular expression '\W' reaches the server untouched.
    catalog_ddl = r"""
        CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
        CREATE EXTENSION IF NOT EXISTS btree_gin;
        CREATE TABLE catalog (
            kbid UUID,
            rid UUID,
            title TEXT,
            created_at TIMESTAMP,
            modified_at TIMESTAMP,
            labels TEXT[],
            PRIMARY KEY(kbid, rid)
        );
        CREATE INDEX ON catalog USING GIN(kbid, labels);
        CREATE INDEX ON catalog USING GIN(kbid, regexp_split_to_array(lower(title), '\W'::text));
        CREATE INDEX ON catalog(kbid, created_at);
        CREATE INDEX ON catalog(kbid, modified_at);
    """
    async with txn.connection.cursor() as cursor:
        await cursor.execute(catalog_ddl)
|
@@ -17,8 +17,10 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from nucliadb.ingest.settings import DriverConfig
|
21
20
|
|
21
|
+
from nucliadb.common.maindb.pg import PGTransaction
|
22
22
|
|
23
|
-
|
24
|
-
|
23
|
+
|
24
|
+
async def migrate(txn: PGTransaction) -> None:
    """Add an index on the ``kbid`` column of the ``catalog`` table.

    Args:
        txn: transaction whose connection is used to run the statement.
    """
    create_kbid_index = "CREATE INDEX ON catalog(kbid);"
    async with txn.connection.cursor() as cursor:
        await cursor.execute(create_kbid_index)
|
nucliadb/common/cluster/base.py
CHANGED
@@ -20,16 +20,16 @@
|
|
20
20
|
from abc import ABCMeta, abstractmethod
|
21
21
|
from typing import AsyncIterator, Optional
|
22
22
|
|
23
|
+
from nucliadb_protos import nodereader_pb2, noderesources_pb2, utils_pb2
|
23
24
|
from nucliadb_protos.nodereader_pb2_grpc import NodeReaderStub
|
24
25
|
from nucliadb_protos.nodewriter_pb2 import (
|
25
26
|
NewShardRequest,
|
26
27
|
NewVectorSetRequest,
|
27
28
|
OpStatus,
|
29
|
+
VectorIndexConfig,
|
28
30
|
)
|
29
31
|
from nucliadb_protos.nodewriter_pb2_grpc import NodeWriterStub
|
30
32
|
|
31
|
-
from nucliadb_protos import nodereader_pb2, noderesources_pb2, utils_pb2
|
32
|
-
|
33
33
|
|
34
34
|
class AbstractIndexNode(metaclass=ABCMeta):
|
35
35
|
label: str = "index-node"
|
@@ -85,23 +85,37 @@ class AbstractIndexNode(metaclass=ABCMeta):
|
|
85
85
|
async for idandfacets in self.reader.Paragraphs(stream_request): # type: ignore
|
86
86
|
yield idandfacets
|
87
87
|
|
88
|
-
async def get_shard(
|
89
|
-
self, shard_id: str, vectorset: Optional[str] = None
|
90
|
-
) -> noderesources_pb2.Shard:
|
88
|
+
async def get_shard(self, shard_id: str) -> noderesources_pb2.Shard:
|
91
89
|
req = nodereader_pb2.GetShardRequest()
|
92
90
|
req.shard_id.id = shard_id
|
93
|
-
if vectorset is not None:
|
94
|
-
req.vectorset = vectorset
|
95
91
|
return await self.reader.GetShard(req) # type: ignore
|
96
92
|
|
97
93
|
async def new_shard(
|
98
94
|
self,
|
99
95
|
kbid: str,
|
100
|
-
|
101
|
-
release_channel: utils_pb2.ReleaseChannel.ValueType,
|
96
|
+
vector_index_config: VectorIndexConfig,
|
102
97
|
) -> noderesources_pb2.ShardCreated:
|
103
98
|
req = NewShardRequest(
|
104
|
-
kbid=kbid,
|
99
|
+
kbid=kbid,
|
100
|
+
release_channel=utils_pb2.ReleaseChannel.STABLE,
|
101
|
+
config=vector_index_config,
|
102
|
+
# Deprecated fields, only for backwards compatibility with older nodes
|
103
|
+
similarity=vector_index_config.similarity,
|
104
|
+
normalize_vectors=vector_index_config.normalize_vectors,
|
105
|
+
)
|
106
|
+
|
107
|
+
resp = await self.writer.NewShard(req) # type: ignore
|
108
|
+
return resp
|
109
|
+
|
110
|
+
async def new_shard_with_vectorsets(
|
111
|
+
self,
|
112
|
+
kbid: str,
|
113
|
+
vectorsets_configs: dict[str, VectorIndexConfig],
|
114
|
+
) -> noderesources_pb2.ShardCreated:
|
115
|
+
req = NewShardRequest(
|
116
|
+
kbid=kbid,
|
117
|
+
release_channel=utils_pb2.ReleaseChannel.STABLE,
|
118
|
+
vectorsets_configs=vectorsets_configs,
|
105
119
|
)
|
106
120
|
|
107
121
|
resp = await self.writer.NewShard(req) # type: ignore
|
@@ -116,28 +130,31 @@ class AbstractIndexNode(metaclass=ABCMeta):
|
|
116
130
|
resp: noderesources_pb2.ShardId = await self.writer.DeleteShard(req) # type: ignore
|
117
131
|
return resp.id
|
118
132
|
|
119
|
-
async def
|
120
|
-
req = noderesources_pb2.VectorSetID()
|
121
|
-
req.shard.id = shard_id
|
122
|
-
req.vectorset = vectorset
|
123
|
-
resp = await self.writer.RemoveVectorSet(req) # type: ignore
|
124
|
-
return resp
|
125
|
-
|
126
|
-
async def set_vectorset(
|
133
|
+
async def add_vectorset(
|
127
134
|
self,
|
128
135
|
shard_id: str,
|
129
136
|
vectorset: str,
|
130
|
-
|
137
|
+
config: VectorIndexConfig,
|
131
138
|
) -> OpStatus:
|
132
|
-
req = NewVectorSetRequest(
|
133
|
-
|
134
|
-
|
135
|
-
|
139
|
+
req = NewVectorSetRequest(
|
140
|
+
id=noderesources_pb2.VectorSetID(
|
141
|
+
shard=noderesources_pb2.ShardId(id=shard_id), vectorset=vectorset
|
142
|
+
),
|
143
|
+
config=config,
|
144
|
+
)
|
145
|
+
|
136
146
|
resp = await self.writer.AddVectorSet(req) # type: ignore
|
137
147
|
return resp
|
138
148
|
|
139
|
-
async def
|
149
|
+
async def list_vectorsets(self, shard_id: str) -> list[str]:
    """Return the vectorsets the writer reports for a shard.

    Args:
        shard_id: id of the shard to query.

    Returns:
        A new list with the entries of the response's ``vectorsets`` field.
    """
    request = noderesources_pb2.ShardId(id=shard_id)
    response = await self.writer.ListVectorSets(request)  # type: ignore
    return list(response.vectorsets)
|
154
|
+
|
155
|
+
async def remove_vectorset(self, shard_id: str, vectorset: str) -> OpStatus:
    """Ask the writer to remove a vectorset from a shard.

    Args:
        shard_id: id of the shard that holds the vectorset.
        vectorset: id of the vectorset to remove.

    Returns:
        The writer's response to the ``RemoveVectorSet`` call, unchanged.
    """
    request = noderesources_pb2.VectorSetID(
        shard=noderesources_pb2.ShardId(id=shard_id),
        vectorset=vectorset,
    )
    return await self.writer.RemoveVectorSet(request)  # type: ignore
|
@@ -113,7 +113,7 @@ async def _get_index_node_metadata(
|
|
113
113
|
channel = get_traced_grpc_channel(grpc_address, "discovery", variant="_writer")
|
114
114
|
if read_replica:
|
115
115
|
# on a read replica, we need to use the replication service
|
116
|
-
stub = replication_pb2_grpc.ReplicationServiceStub(channel)
|
116
|
+
stub = replication_pb2_grpc.ReplicationServiceStub(channel)
|
117
117
|
else:
|
118
118
|
stub = nodewriter_pb2_grpc.NodeWriterStub(channel) # type: ignore
|
119
119
|
try:
|
@@ -127,9 +127,7 @@ async def _get_index_node_metadata(
|
|
127
127
|
or None
|
128
128
|
)
|
129
129
|
if read_replica and primary_id is None:
|
130
|
-
raise Exception(
|
131
|
-
"Primary node id not found when it is expected to be a read replica"
|
132
|
-
)
|
130
|
+
raise Exception("Primary node id not found when it is expected to be a read replica")
|
133
131
|
|
134
132
|
return IndexNodeMetadata(
|
135
133
|
node_id=metadata.node_id,
|
@@ -141,18 +139,14 @@ async def _get_index_node_metadata(
|
|
141
139
|
)
|
142
140
|
|
143
141
|
|
144
|
-
@backoff.on_exception(
|
145
|
-
|
146
|
-
)
|
147
|
-
async def _get_standalone_index_node_metadata(
|
148
|
-
settings: Settings, address: str
|
149
|
-
) -> IndexNodeMetadata:
|
142
|
+
@backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=4)
|
143
|
+
async def _get_standalone_index_node_metadata(settings: Settings, address: str) -> IndexNodeMetadata:
|
150
144
|
if ":" not in address:
|
151
145
|
grpc_address = f"{address}:{settings.standalone_node_port}"
|
152
146
|
else:
|
153
147
|
grpc_address = address
|
154
148
|
channel = get_traced_grpc_channel(grpc_address, "standalone_proxy")
|
155
|
-
stub = standalone_pb2_grpc.StandaloneClusterServiceStub(channel)
|
149
|
+
stub = standalone_pb2_grpc.StandaloneClusterServiceStub(channel)
|
156
150
|
resp: standalone_pb2.NodeInfoResponse = await stub.NodeInfo(standalone_pb2.NodeInfoRequest()) # type: ignore
|
157
151
|
return IndexNodeMetadata(
|
158
152
|
node_id=resp.id,
|
@@ -177,9 +171,7 @@ class AbstractClusterDiscovery(abc.ABC):
|
|
177
171
|
async def finalize(self) -> None:
|
178
172
|
""" """
|
179
173
|
|
180
|
-
async def _query_node_metadata(
|
181
|
-
self, address: str, read_replica: bool = False
|
182
|
-
) -> IndexNodeMetadata:
|
174
|
+
async def _query_node_metadata(self, address: str, read_replica: bool = False) -> IndexNodeMetadata:
|
183
175
|
if self.settings.standalone_mode:
|
184
176
|
return await _get_standalone_index_node_metadata(self.settings, address)
|
185
177
|
else:
|