nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -0,0 +1,257 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
import abc
|
21
|
+
import logging
|
22
|
+
from dataclasses import dataclass
|
23
|
+
from typing import Any, Iterator, Optional
|
24
|
+
|
25
|
+
from pydantic import BaseModel
|
26
|
+
|
27
|
+
from nucliadb.common.counters import IndexCounts
|
28
|
+
from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
|
29
|
+
from nucliadb.common.ids import ParagraphId
|
30
|
+
from nucliadb_models.external_index_providers import ExternalIndexProviderType
|
31
|
+
from nucliadb_models.search import SCORE_TYPE, TextPosition
|
32
|
+
from nucliadb_protos.knowledgebox_pb2 import (
|
33
|
+
CreateExternalIndexProviderMetadata,
|
34
|
+
StoredExternalIndexProviderMetadata,
|
35
|
+
)
|
36
|
+
from nucliadb_protos.nodereader_pb2 import SearchRequest
|
37
|
+
from nucliadb_protos.noderesources_pb2 import Resource
|
38
|
+
from nucliadb_protos.utils_pb2 import VectorSimilarity
|
39
|
+
from nucliadb_telemetry.metrics import Observer
|
40
|
+
|
41
|
+
logger = logging.getLogger(__name__)
|
42
|
+
|
43
|
+
manager_observer = Observer("external_index_manager", labels={"operation": "", "provider": ""})
|
44
|
+
|
45
|
+
|
46
|
+
@dataclass
class VectorsetExternalIndex:
    """
    Description of one vectorset for which an external index has to be created.

    External index managers receive a list of these so they know the metadata
    required to create one external index per vectorset.
    """

    # Identifier of the vectorset the external index belongs to.
    vectorset_id: str
    # Vector dimension of the vectorset.
    dimension: int
    # Similarity function, expressed as a VectorSimilarity protobuf enum value.
    similarity: VectorSimilarity.ValueType
|
56
|
+
|
57
|
+
|
58
|
+
class TextBlockMatch(BaseModel):
    """
    A single text block (paragraph) returned by an external index.

    It carries everything needed later on to hydrate the retrieval results.
    """

    # Where the match comes from and how it scored.
    paragraph_id: ParagraphId
    position: TextPosition
    score: float
    score_type: SCORE_TYPE
    order: int

    # Flags describing the matched block. `fuzzy_search` has no default and
    # must always be provided.
    page_with_visual: bool = False
    fuzzy_search: bool
    is_a_table: bool = False

    # Optional extras; all of them have defaults.
    representation_file: Optional[str] = None
    paragraph_labels: list[str] = []
    field_labels: list[str] = []
    text: Optional[str] = None
|
76
|
+
|
77
|
+
|
78
|
+
class QueryResults(BaseModel):
    """
    Container for what a query against an external index provider returned.

    This is a base model: each external index provider is expected to subclass it.
    """

    # Which external index provider produced these results.
    type: ExternalIndexProviderType
    # Provider-specific payload; intentionally untyped at this level.
    results: Any

    def iter_matching_text_blocks(self) -> Iterator[TextBlockMatch]:
        """
        Yield the paragraphs contained in the results, highest score first.

        The base implementation always raises NotImplementedError; the specific
        external index provider has to override it.
        """
        raise NotImplementedError()
|
93
|
+
|
94
|
+
|
95
|
+
class ExternalIndexManager(abc.ABC):
    """
    Abstract base for external index provider integrations.

    Concrete providers subclass this and implement the abstract hooks. The public
    coroutines (`delete_resource`, `index_resource`, `get_index_counts`, `query`)
    log the operation, run the matching private hook inside `manager_observer`
    labelled with the operation name and the provider, and return its result.
    """

    # Provider identifier; read as `self.type.value` for logs and metric labels.
    type: ExternalIndexProviderType
    # Subclasses set this to True when they can index into rollover indexes.
    supports_rollover: bool = False

    def __init__(self, kbid: str):
        self.kbid = kbid

    @classmethod
    @abc.abstractmethod
    async def create_indexes(
        cls,
        kbid: str,
        create_request: CreateExternalIndexProviderMetadata,
        indexes: list[VectorsetExternalIndex],
    ) -> StoredExternalIndexProviderMetadata: ...

    @classmethod
    @abc.abstractmethod
    async def delete_indexes(
        cls,
        kbid: str,
        stored: StoredExternalIndexProviderMetadata,
    ) -> None: ...

    @abc.abstractmethod
    async def rollover_create_indexes(
        self, stored: StoredExternalIndexProviderMetadata
    ) -> StoredExternalIndexProviderMetadata:  # pragma: no cover
        """
        Create the indexes used during a rollover.

        Implementations should roll back any left over indexes if an error happens.
        The return value is a modified copy of the stored external index provider
        metadata that includes the new rollover indexes.
        """
        ...

    @abc.abstractmethod
    async def rollover_cutover_indexes(self) -> None:  # pragma: no cover
        """
        Switch over to the rollover indexes.

        Once this has run, queries should go to the new indexes and the old ones
        should be deleted.
        """
        ...

    @classmethod
    def get_index_name(cls) -> str:  # pragma: no cover
        """
        Name of the index in the external index provider.

        Not implemented at this level: the base version always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    async def delete_resource(self, resource_uuid: str) -> None:
        """
        Remove a resource from the external index provider.

        Logs the deletion and delegates to `_delete_resource` inside the observer.
        """
        provider = self.type.value
        log_context = {
            "kbid": self.kbid,
            "rid": resource_uuid,
            "provider": provider,
        }
        logger.info("Deleting resource to external index", extra=log_context)
        with manager_observer({"operation": "delete_resource", "provider": provider}):
            await self._delete_resource(resource_uuid)

    async def index_resource(
        self, resource_uuid: str, resource_data: Resource, to_rollover_indexes: bool = False
    ) -> None:
        """
        Index a resource into the external index provider.

        When rollover indexing is requested but the provider does not support it,
        the request is logged and skipped. Any exception raised by the provider
        hook is re-raised as ExternalIndexingError, chained to the original.
        """
        provider = self.type.value
        log_context = {
            "kbid": self.kbid,
            "rid": resource_uuid,
            "provider": provider,
        }
        if to_rollover_indexes and not self.supports_rollover:
            # Nothing to do: this provider has no rollover indexes to write to.
            logger.info("Indexing to rollover indexes not supported", extra=log_context)
            return

        logger.info(
            "Indexing resource to external index",
            extra={**log_context, "rollover": to_rollover_indexes},
        )
        with manager_observer({"operation": "index_resource", "provider": provider}):
            try:
                await self._index_resource(
                    resource_uuid, resource_data, to_rollover_indexes=to_rollover_indexes
                )
            except Exception as ex:
                raise ExternalIndexingError() from ex

    async def get_index_counts(self) -> IndexCounts:
        """
        Fetch the index counts reported by the external index provider.
        """
        provider = self.type.value
        logger.debug(
            "Getting index counts from external index",
            extra={"kbid": self.kbid, "provider": provider},
        )
        with manager_observer({"operation": "get_index_counts", "provider": provider}):
            counts = await self._get_index_counts()
            return counts

    async def query(self, request: SearchRequest) -> QueryResults:
        """
        Run a search request against the external index provider.
        """
        provider = self.type.value
        logger.info(
            "Querying external index",
            extra={"kbid": self.kbid, "provider": provider},
        )
        with manager_observer({"operation": "query", "provider": provider}):
            results = await self._query(request)
            return results

    @abc.abstractmethod
    async def _delete_resource(self, resource_uuid: str) -> None:  # pragma: no cover
        """
        Provider hook: ensure every vector associated with the resource is removed
        from the external index provider.
        """
        ...

    @abc.abstractmethod
    async def _index_resource(
        self, resource_uuid: str, resource_data: Resource, to_rollover_indexes: bool = False
    ) -> None:  # pragma: no cover
        """
        Provider hook: convert the Resource (aka brain) to the provider's index
        format and index it.

        Params:
        - resource_uuid: the resource's UUID
        - resource_data: the resource index data
        - to_rollover_indexes: True to write to the rollover indexes, False to
          write to the main indexes
        """
        ...

    @abc.abstractmethod
    async def _query(self, request: SearchRequest) -> QueryResults:  # pragma: no cover
        """
        Provider hook: translate the Nucliadb search request into the provider's
        query format, run it and return the results.
        """
        ...

    @abc.abstractmethod
    async def _get_index_counts(self) -> IndexCounts:  # pragma: no cover
        """
        Provider hook: return the index counts for the external index provider.
        """
        ...
|
nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py}
RENAMED
@@ -18,14 +18,15 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
-
import pytest
|
22
21
|
|
23
|
-
|
22
|
+
class ExternalIndexCreationError(Exception):
    """
    Error raised when an index could not be created in an external provider.

    The provider name and the error message are kept as attributes, and the
    exception text combines both.
    """

    def __init__(self, provider: str, message: str):
        description = f"{provider} index creation error: {message}"
        super().__init__(description)
        self.provider = provider
        self.message = message
|
24
27
|
|
25
28
|
|
26
|
-
|
27
|
-
"
|
28
|
-
|
29
|
-
|
30
|
-
def test_choose_matryoshka_dimensions(dimensions, expected):
|
31
|
-
assert choose_matryoshka_dimension(dimensions) == expected
|
29
|
+
class ExternalIndexingError(Exception):
    """
    Signals that indexing a resource in an external index failed.
    """
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from typing import Optional
|
21
|
+
|
22
|
+
import async_lru
|
23
|
+
|
24
|
+
from nucliadb.common import datamanagers
|
25
|
+
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
26
|
+
from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
|
27
|
+
from nucliadb.common.external_index_providers.settings import settings
|
28
|
+
from nucliadb_protos.knowledgebox_pb2 import (
|
29
|
+
ExternalIndexProviderType,
|
30
|
+
StoredExternalIndexProviderMetadata,
|
31
|
+
)
|
32
|
+
from nucliadb_utils.utilities import get_endecryptor
|
33
|
+
|
34
|
+
|
35
|
+
async def get_external_index_manager(
    kbid: str, for_rollover: bool = False
) -> Optional[ExternalIndexManager]:
    """
    Build the ExternalIndexManager for the given kbid.

    Returns None when the KB has no stored external index provider metadata or
    when the stored provider type is not Pinecone (the only supported one for now).

    If for_rollover is True, the returned manager also includes the rollover
    indexes (if any rollover metadata exists for the KB).
    """
    stored = await get_external_index_metadata(kbid)
    if stored is None:
        return None
    if stored.type != ExternalIndexProviderType.PINECONE:
        # Only Pinecone is supported for now
        return None

    pinecone_config = stored.pinecone_config
    # The API key is stored encrypted; decrypt it before handing it to the manager.
    decrypted_api_key = get_endecryptor().decrypt(pinecone_config.encrypted_api_key)
    default_vectorset_id = await get_default_vectorset_id(kbid)

    # Rollover metadata is only looked up when the caller asks for it.
    rollover_stored = await get_rollover_external_index_metadata(kbid) if for_rollover else None
    rollover_index_map = (
        dict(rollover_stored.pinecone_config.indexes) if rollover_stored is not None else None
    )

    return PineconeIndexManager(
        kbid=kbid,
        api_key=decrypted_api_key,
        indexes=dict(pinecone_config.indexes),
        upsert_parallelism=settings.pinecone_upsert_parallelism,
        delete_parallelism=settings.pinecone_delete_parallelism,
        upsert_timeout=settings.pinecone_upsert_timeout,
        delete_timeout=settings.pinecone_delete_timeout,
        default_vectorset=default_vectorset_id,
        rollover_indexes=rollover_index_map,
    )
|
67
|
+
|
68
|
+
|
69
|
+
@async_lru.alru_cache(maxsize=None)
async def get_external_index_metadata(kbid: str) -> Optional[StoredExternalIndexProviderMetadata]:
    """
    Fetch the stored external index provider metadata of a KB.

    The result is memoized per kbid with no size limit.
    NOTE(review): a None result appears to be cached as well, so a provider
    configured after the first lookup may not be seen -- confirm this is intended.
    """
    stored_metadata = await datamanagers.atomic.kb.get_external_index_provider_metadata(kbid=kbid)
    return stored_metadata
|
72
|
+
|
73
|
+
|
74
|
+
@async_lru.alru_cache(maxsize=None)
async def get_default_vectorset_id(kbid: str) -> Optional[str]:
    """
    Resolve the id of the `default` vectorset of a KB, if it has one.

    While we are transitioning to the new vectorset system, KBs that have only one
    semantic model will have the `vectorset_id` field on BrokerMessage.field_vectors
    set to empty string -- that is the `default` vectorset concept.

    Returns:
    - "__default__" when no vectorsets are stored for the KB,
    - the id of the vectorset when exactly one is stored,
    - None when several are stored (index messages are then assumed to be
      explicit about the vectorset).
    """
    async with datamanagers.with_ro_transaction() as txn:
        # Only the ids matter for the decision below; the configs are discarded.
        vectorset_ids = [
            vs_id async for vs_id, _vs_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
        ]

    if not vectorset_ids:
        # If there is nothing in the vectorsets key on maindb, we use the "__default__" vectorset as id.
        return "__default__"
    if len(vectorset_ids) == 1:
        # A single vectorset is, by definition, the default one
        return vectorset_ids[0]
    # With multiple vectorsets there is no default: index messages are
    # assumed to be explicit about the vectorset
    return None
|
95
|
+
|
96
|
+
|
97
|
+
async def get_rollover_external_index_metadata(
    kbid: str,
) -> Optional[StoredExternalIndexProviderMetadata]:
    """
    Read the rollover external index provider metadata of a KB.

    The lookup runs inside a read-only transaction and its result is returned
    as is.
    """
    async with datamanagers.with_ro_transaction() as txn:
        rollover_metadata = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
            txn, kbid=kbid
        )
        return rollover_metadata
|