nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/search/search/query.py
CHANGED
@@ -19,19 +19,19 @@
|
|
19
19
|
#
|
20
20
|
import asyncio
|
21
21
|
import json
|
22
|
+
import string
|
22
23
|
from datetime import datetime
|
23
24
|
from typing import Any, Awaitable, Optional, Union
|
24
25
|
|
25
26
|
from async_lru import alru_cache
|
26
|
-
from nucliadb_protos.noderesources_pb2 import Resource
|
27
27
|
|
28
28
|
from nucliadb.common import datamanagers
|
29
|
-
from nucliadb.
|
29
|
+
from nucliadb.common.maindb.utils import get_driver
|
30
30
|
from nucliadb.search import logger
|
31
31
|
from nucliadb.search.predict import SendToPredictError, convert_relations
|
32
32
|
from nucliadb.search.search.filters import (
|
33
33
|
convert_to_node_filters,
|
34
|
-
|
34
|
+
flatten_filter_literals,
|
35
35
|
has_classification_label_filters,
|
36
36
|
split_labels_by_type,
|
37
37
|
translate_label,
|
@@ -41,17 +41,22 @@ from nucliadb.search.search.metrics import (
|
|
41
41
|
node_features,
|
42
42
|
query_parse_dependency_observer,
|
43
43
|
)
|
44
|
+
from nucliadb.search.search.rank_fusion import (
|
45
|
+
RankFusionAlgorithm,
|
46
|
+
)
|
47
|
+
from nucliadb.search.search.rerankers import (
|
48
|
+
Reranker,
|
49
|
+
)
|
44
50
|
from nucliadb.search.utilities import get_predict
|
45
|
-
from nucliadb_models.
|
51
|
+
from nucliadb_models.internal.predict import QueryInfo
|
52
|
+
from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
|
46
53
|
from nucliadb_models.metadata import ResourceProcessingStatus
|
47
54
|
from nucliadb_models.search import (
|
48
55
|
Filter,
|
49
56
|
MaxTokens,
|
50
57
|
MinScore,
|
51
|
-
QueryInfo,
|
52
58
|
SearchOptions,
|
53
59
|
SortField,
|
54
|
-
SortFieldMap,
|
55
60
|
SortOptions,
|
56
61
|
SortOrder,
|
57
62
|
SortOrderMap,
|
@@ -59,6 +64,7 @@ from nucliadb_models.search import (
|
|
59
64
|
)
|
60
65
|
from nucliadb_models.security import RequestSecurity
|
61
66
|
from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2
|
67
|
+
from nucliadb_protos.noderesources_pb2 import Resource
|
62
68
|
|
63
69
|
from .exceptions import InvalidQueryError
|
64
70
|
|
@@ -67,7 +73,6 @@ INDEX_SORTABLE_FIELDS = [
|
|
67
73
|
SortField.MODIFIED,
|
68
74
|
]
|
69
75
|
|
70
|
-
MAX_VECTOR_RESULTS_ALLOWED = 2000
|
71
76
|
DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
|
72
77
|
|
73
78
|
|
@@ -82,6 +87,7 @@ class QueryParser:
|
|
82
87
|
"""
|
83
88
|
|
84
89
|
_query_information_task: Optional[asyncio.Task] = None
|
90
|
+
_get_vectorset_task: Optional[asyncio.Task] = None
|
85
91
|
_detected_entities_task: Optional[asyncio.Task] = None
|
86
92
|
_entities_meta_cache_task: Optional[asyncio.Task] = None
|
87
93
|
_deleted_entities_groups_task: Optional[asyncio.Task] = None
|
@@ -95,9 +101,9 @@ class QueryParser:
|
|
95
101
|
kbid: str,
|
96
102
|
features: list[SearchOptions],
|
97
103
|
query: str,
|
98
|
-
|
99
|
-
|
100
|
-
|
104
|
+
label_filters: Union[list[str], list[Filter]],
|
105
|
+
keyword_filters: Union[list[str], list[Filter]],
|
106
|
+
top_k: int,
|
101
107
|
min_score: MinScore,
|
102
108
|
faceted: Optional[list[str]] = None,
|
103
109
|
sort: Optional[SortOptions] = None,
|
@@ -107,6 +113,7 @@ class QueryParser:
|
|
107
113
|
range_modification_end: Optional[datetime] = None,
|
108
114
|
fields: Optional[list[str]] = None,
|
109
115
|
user_vector: Optional[list[float]] = None,
|
116
|
+
vectorset: Optional[str] = None,
|
110
117
|
with_duplicates: bool = False,
|
111
118
|
with_status: Optional[ResourceProcessingStatus] = None,
|
112
119
|
with_synonyms: bool = False,
|
@@ -114,17 +121,28 @@ class QueryParser:
|
|
114
121
|
key_filters: Optional[list[str]] = None,
|
115
122
|
security: Optional[RequestSecurity] = None,
|
116
123
|
generative_model: Optional[str] = None,
|
117
|
-
rephrase:
|
124
|
+
rephrase: bool = False,
|
125
|
+
rephrase_prompt: Optional[str] = None,
|
118
126
|
max_tokens: Optional[MaxTokens] = None,
|
127
|
+
hidden: Optional[bool] = None,
|
128
|
+
rank_fusion: Optional[RankFusionAlgorithm] = None,
|
129
|
+
reranker: Optional[Reranker] = None,
|
119
130
|
):
|
120
131
|
self.kbid = kbid
|
121
132
|
self.features = features
|
122
133
|
self.query = query
|
123
|
-
self.
|
124
|
-
self.
|
134
|
+
self.hidden = hidden
|
135
|
+
if self.hidden is not None:
|
136
|
+
if self.hidden:
|
137
|
+
label_filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
|
138
|
+
else:
|
139
|
+
label_filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore
|
140
|
+
|
141
|
+
self.label_filters: dict[str, Any] = convert_to_node_filters(label_filters)
|
142
|
+
self.flat_label_filters: list[str] = []
|
143
|
+
self.keyword_filters: dict[str, Any] = convert_to_node_filters(keyword_filters)
|
125
144
|
self.faceted = faceted or []
|
126
|
-
self.
|
127
|
-
self.page_size = page_size
|
145
|
+
self.top_k = top_k
|
128
146
|
self.min_score = min_score
|
129
147
|
self.sort = sort
|
130
148
|
self.range_creation_start = range_creation_start
|
@@ -133,6 +151,7 @@ class QueryParser:
|
|
133
151
|
self.range_modification_end = range_modification_end
|
134
152
|
self.fields = fields or []
|
135
153
|
self.user_vector = user_vector
|
154
|
+
self.vectorset = vectorset
|
136
155
|
self.with_duplicates = with_duplicates
|
137
156
|
self.with_status = with_status
|
138
157
|
self.with_synonyms = with_synonyms
|
@@ -141,15 +160,18 @@ class QueryParser:
|
|
141
160
|
self.security = security
|
142
161
|
self.generative_model = generative_model
|
143
162
|
self.rephrase = rephrase
|
163
|
+
self.rephrase_prompt = rephrase_prompt
|
144
164
|
self.query_endpoint_used = False
|
145
|
-
if len(self.
|
146
|
-
self.
|
147
|
-
self.
|
165
|
+
if len(self.label_filters) > 0:
|
166
|
+
self.label_filters = translate_label_filters(self.label_filters)
|
167
|
+
self.flat_label_filters = flatten_filter_literals(self.label_filters)
|
148
168
|
self.max_tokens = max_tokens
|
169
|
+
self.rank_fusion = rank_fusion
|
170
|
+
self.reranker = reranker
|
149
171
|
|
150
172
|
@property
|
151
173
|
def has_vector_search(self) -> bool:
|
152
|
-
return SearchOptions.
|
174
|
+
return SearchOptions.SEMANTIC in self.features
|
153
175
|
|
154
176
|
@property
|
155
177
|
def has_relations_search(self) -> bool:
|
@@ -157,34 +179,62 @@ class QueryParser:
|
|
157
179
|
|
158
180
|
def _get_query_information(self) -> Awaitable[QueryInfo]:
|
159
181
|
if self._query_information_task is None: # pragma: no cover
|
160
|
-
self._query_information_task = asyncio.create_task(
|
161
|
-
query_information(
|
162
|
-
self.kbid, self.query, self.generative_model, self.rephrase
|
163
|
-
)
|
164
|
-
)
|
182
|
+
self._query_information_task = asyncio.create_task(self._query_information())
|
165
183
|
return self._query_information_task
|
166
184
|
|
185
|
+
async def _query_information(self) -> QueryInfo:
|
186
|
+
vectorset = await self.select_query_vectorset()
|
187
|
+
return await query_information(
|
188
|
+
self.kbid, self.query, vectorset, self.generative_model, self.rephrase, self.rephrase_prompt
|
189
|
+
)
|
190
|
+
|
191
|
+
def _get_vectorset(self) -> Awaitable[Optional[str]]:
|
192
|
+
if self._get_vectorset_task is None:
|
193
|
+
self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
|
194
|
+
return self._get_vectorset_task
|
195
|
+
|
196
|
+
async def _select_vectorset(self) -> Optional[str]:
|
197
|
+
if self.vectorset:
|
198
|
+
return self.vectorset
|
199
|
+
|
200
|
+
# When vectorset is not provided we get the default from Predict API
|
201
|
+
|
202
|
+
try:
|
203
|
+
query_information = await self._get_query_information()
|
204
|
+
except SendToPredictError:
|
205
|
+
return None
|
206
|
+
|
207
|
+
if query_information.sentence is None:
|
208
|
+
logger.error(
|
209
|
+
"Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
|
210
|
+
)
|
211
|
+
return None
|
212
|
+
|
213
|
+
for vectorset in query_information.sentence.vectors.keys():
|
214
|
+
self.vectorset = vectorset
|
215
|
+
break
|
216
|
+
|
217
|
+
return self.vectorset
|
218
|
+
|
167
219
|
def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
|
168
220
|
if self._get_matryoshka_dimension_task is None:
|
169
|
-
self._get_matryoshka_dimension_task = asyncio.create_task(
|
170
|
-
get_matryoshka_dimension_cached(self.kbid)
|
171
|
-
)
|
221
|
+
self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
|
172
222
|
return self._get_matryoshka_dimension_task
|
173
223
|
|
224
|
+
async def _matryoshka_dimension(self) -> Optional[int]:
|
225
|
+
vectorset = await self._select_vectorset()
|
226
|
+
return await get_matryoshka_dimension_cached(self.kbid, vectorset)
|
227
|
+
|
174
228
|
def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
|
175
229
|
if self._detected_entities_task is None: # pragma: no cover
|
176
|
-
self._detected_entities_task = asyncio.create_task(
|
177
|
-
detect_entities(self.kbid, self.query)
|
178
|
-
)
|
230
|
+
self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
|
179
231
|
return self._detected_entities_task
|
180
232
|
|
181
233
|
def _get_entities_meta_cache(
|
182
234
|
self,
|
183
235
|
) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
|
184
236
|
if self._entities_meta_cache_task is None:
|
185
|
-
self._entities_meta_cache_task = asyncio.create_task(
|
186
|
-
get_entities_meta_cache(self.kbid)
|
187
|
-
)
|
237
|
+
self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
|
188
238
|
return self._entities_meta_cache_task
|
189
239
|
|
190
240
|
def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
|
@@ -211,9 +261,7 @@ class QueryParser:
|
|
211
261
|
This will schedule concurrent tasks for different data that needs to be pulled
|
212
262
|
for the sake of the query being performed
|
213
263
|
"""
|
214
|
-
if len(self.
|
215
|
-
self.flat_filter_labels
|
216
|
-
):
|
264
|
+
if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
|
217
265
|
asyncio.ensure_future(self._get_classification_labels())
|
218
266
|
|
219
267
|
if self.has_vector_search and self.user_vector is None:
|
@@ -253,26 +301,28 @@ class QueryParser:
|
|
253
301
|
autofilters = await self.parse_relation_search(request)
|
254
302
|
await self.parse_synonyms(request)
|
255
303
|
await self.parse_min_score(request, incomplete)
|
304
|
+
await self.adjust_page_size(request, self.rank_fusion, self.reranker)
|
256
305
|
return request, incomplete, autofilters
|
257
306
|
|
258
307
|
async def parse_filters(self, request: nodereader_pb2.SearchRequest) -> None:
|
259
|
-
if len(self.
|
260
|
-
field_labels = self.
|
308
|
+
if len(self.label_filters) > 0:
|
309
|
+
field_labels = self.flat_label_filters
|
261
310
|
paragraph_labels: list[str] = []
|
262
|
-
if has_classification_label_filters(self.
|
311
|
+
if has_classification_label_filters(self.flat_label_filters):
|
263
312
|
classification_labels = await self._get_classification_labels()
|
264
313
|
field_labels, paragraph_labels = split_labels_by_type(
|
265
|
-
self.
|
314
|
+
self.flat_label_filters, classification_labels
|
266
315
|
)
|
267
|
-
check_supported_filters(self.
|
316
|
+
check_supported_filters(self.label_filters, paragraph_labels)
|
268
317
|
|
269
318
|
request.filter.field_labels.extend(field_labels)
|
270
319
|
request.filter.paragraph_labels.extend(paragraph_labels)
|
271
|
-
request.filter.
|
320
|
+
request.filter.labels_expression = json.dumps(self.label_filters)
|
272
321
|
|
273
|
-
|
274
|
-
|
275
|
-
|
322
|
+
if len(self.keyword_filters) > 0:
|
323
|
+
request.filter.keywords_expression = json.dumps(self.keyword_filters)
|
324
|
+
|
325
|
+
request.faceted.labels.extend([translate_label(facet) for facet in self.faceted])
|
276
326
|
request.fields.extend(self.fields)
|
277
327
|
|
278
328
|
if self.security is not None and len(self.security.groups) > 0:
|
@@ -322,9 +372,7 @@ class QueryParser:
|
|
322
372
|
order=SortOrder.DESC,
|
323
373
|
limit=None,
|
324
374
|
)
|
325
|
-
elif
|
326
|
-
self.sort.field not in INDEX_SORTABLE_FIELDS and self.sort.limit is None
|
327
|
-
):
|
375
|
+
elif self.sort.field not in INDEX_SORTABLE_FIELDS and self.sort.limit is None:
|
328
376
|
raise InvalidQueryError(
|
329
377
|
"sort_field",
|
330
378
|
f"Sort by '{self.sort.field}' requires setting a sort limit",
|
@@ -337,35 +385,32 @@ class QueryParser:
|
|
337
385
|
# have consistent results, we must limit them
|
338
386
|
request.result_per_page = self.sort.limit
|
339
387
|
else:
|
340
|
-
request.result_per_page = self.
|
388
|
+
request.result_per_page = self.top_k
|
341
389
|
|
342
|
-
sort_field =
|
390
|
+
sort_field = get_sort_field_proto(self.sort.field) if self.sort else None
|
343
391
|
if sort_field is not None:
|
344
392
|
request.order.sort_by = sort_field
|
345
393
|
request.order.type = SortOrderMap[self.sort.order] # type: ignore
|
346
394
|
|
347
|
-
|
348
|
-
self.has_vector_search
|
349
|
-
and request.result_per_page > MAX_VECTOR_RESULTS_ALLOWED
|
350
|
-
):
|
351
|
-
raise InvalidQueryError(
|
352
|
-
"page_size",
|
353
|
-
f"Pagination of semantic results limit reached: {MAX_VECTOR_RESULTS_ALLOWED}. If you want to paginate through all results, please disable the vector search feature.", # noqa: E501
|
354
|
-
)
|
355
|
-
|
356
|
-
async def parse_min_score(
|
357
|
-
self, request: nodereader_pb2.SearchRequest, incomplete: bool
|
358
|
-
) -> None:
|
395
|
+
async def parse_min_score(self, request: nodereader_pb2.SearchRequest, incomplete: bool) -> None:
|
359
396
|
semantic_min_score = DEFAULT_GENERIC_SEMANTIC_THRESHOLD
|
360
397
|
if self.min_score.semantic is not None:
|
361
398
|
semantic_min_score = self.min_score.semantic
|
362
399
|
elif self.has_vector_search and not incomplete:
|
363
400
|
query_information = await self._get_query_information()
|
364
|
-
|
365
|
-
|
401
|
+
vectorset = await self._select_vectorset()
|
402
|
+
if vectorset is not None:
|
403
|
+
semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
|
404
|
+
if semantic_threshold is not None:
|
405
|
+
semantic_min_score = semantic_threshold
|
406
|
+
else:
|
407
|
+
logger.warning(
|
408
|
+
"Semantic threshold not found in query information, using default",
|
409
|
+
extra={"kbid": self.kbid},
|
410
|
+
)
|
366
411
|
else:
|
367
412
|
logger.warning(
|
368
|
-
"
|
413
|
+
"Vectorset unset by user or predict, using default semantic threshold",
|
369
414
|
extra={"kbid": self.kbid},
|
370
415
|
)
|
371
416
|
self.min_score.semantic = semantic_min_score
|
@@ -373,15 +418,34 @@ class QueryParser:
|
|
373
418
|
request.min_score_bm25 = self.min_score.bm25
|
374
419
|
|
375
420
|
def parse_document_search(self, request: nodereader_pb2.SearchRequest) -> None:
|
376
|
-
if SearchOptions.
|
421
|
+
if SearchOptions.FULLTEXT in self.features:
|
377
422
|
request.document = True
|
378
423
|
node_features.inc({"type": "documents"})
|
379
424
|
|
380
425
|
def parse_paragraph_search(self, request: nodereader_pb2.SearchRequest) -> None:
|
381
|
-
if SearchOptions.
|
426
|
+
if SearchOptions.KEYWORD in self.features:
|
382
427
|
request.paragraph = True
|
383
428
|
node_features.inc({"type": "paragraphs"})
|
384
429
|
|
430
|
+
async def select_query_vectorset(self) -> Optional[str]:
|
431
|
+
"""Set and return the requested vectorset parameter (if used) validated
|
432
|
+
for the current KB.
|
433
|
+
|
434
|
+
"""
|
435
|
+
if not self.vectorset:
|
436
|
+
return None
|
437
|
+
|
438
|
+
# validate vectorset
|
439
|
+
async with datamanagers.with_ro_transaction() as txn:
|
440
|
+
if not await datamanagers.vectorsets.exists(
|
441
|
+
txn, kbid=self.kbid, vectorset_id=self.vectorset
|
442
|
+
):
|
443
|
+
raise InvalidQueryError(
|
444
|
+
"vectorset",
|
445
|
+
f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
|
446
|
+
)
|
447
|
+
return self.vectorset
|
448
|
+
|
385
449
|
async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
|
386
450
|
if not self.has_vector_search:
|
387
451
|
return False
|
@@ -389,6 +453,11 @@ class QueryParser:
|
|
389
453
|
node_features.inc({"type": "vectors"})
|
390
454
|
|
391
455
|
incomplete = False
|
456
|
+
|
457
|
+
vectorset = await self._select_vectorset()
|
458
|
+
if vectorset is not None:
|
459
|
+
request.vectorset = vectorset
|
460
|
+
|
392
461
|
query_vector = None
|
393
462
|
if self.user_vector is None:
|
394
463
|
try:
|
@@ -398,11 +467,24 @@ class QueryParser:
|
|
398
467
|
incomplete = True
|
399
468
|
else:
|
400
469
|
if query_info and query_info.sentence:
|
401
|
-
|
470
|
+
if vectorset:
|
471
|
+
if vectorset in query_info.sentence.vectors:
|
472
|
+
query_vector = query_info.sentence.vectors[vectorset]
|
473
|
+
else:
|
474
|
+
incomplete = True
|
475
|
+
else:
|
476
|
+
for vectorset_id, vector in query_info.sentence.vectors.items():
|
477
|
+
if vector:
|
478
|
+
query_vector = vector
|
479
|
+
break
|
480
|
+
else:
|
481
|
+
incomplete = True
|
482
|
+
|
402
483
|
else:
|
403
484
|
incomplete = True
|
404
485
|
else:
|
405
486
|
query_vector = self.user_vector
|
487
|
+
|
406
488
|
if query_vector is not None:
|
407
489
|
matryoshka_dimension = await self._get_matryoshka_dimension()
|
408
490
|
if matryoshka_dimension is not None:
|
@@ -410,11 +492,10 @@ class QueryParser:
|
|
410
492
|
# accordingly
|
411
493
|
query_vector = query_vector[:matryoshka_dimension]
|
412
494
|
request.vector.extend(query_vector)
|
495
|
+
|
413
496
|
return incomplete
|
414
497
|
|
415
|
-
async def parse_relation_search(
|
416
|
-
self, request: nodereader_pb2.SearchRequest
|
417
|
-
) -> list[str]:
|
498
|
+
async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
|
418
499
|
autofilters = []
|
419
500
|
if self.has_relations_search or self.autofilter:
|
420
501
|
if not self.query_endpoint_used:
|
@@ -422,9 +503,7 @@ class QueryParser:
|
|
422
503
|
else:
|
423
504
|
query_info_result = await self._get_query_information()
|
424
505
|
if query_info_result.entities:
|
425
|
-
detected_entities = convert_relations(
|
426
|
-
query_info_result.entities.dict()
|
427
|
-
)
|
506
|
+
detected_entities = convert_relations(query_info_result.entities.model_dump())
|
428
507
|
else:
|
429
508
|
detected_entities = []
|
430
509
|
meta_cache = await self._get_entities_meta_cache()
|
@@ -432,9 +511,7 @@ class QueryParser:
|
|
432
511
|
if self.has_relations_search:
|
433
512
|
request.relation_subgraph.entry_points.extend(detected_entities)
|
434
513
|
request.relation_subgraph.depth = 1
|
435
|
-
request.relation_subgraph.deleted_groups.extend(
|
436
|
-
await self._get_deleted_entity_groups()
|
437
|
-
)
|
514
|
+
request.relation_subgraph.deleted_groups.extend(await self._get_deleted_entity_groups())
|
438
515
|
for group_id, deleted_entities in meta_cache.deleted_entities.items():
|
439
516
|
request.relation_subgraph.deleted_entities.append(
|
440
517
|
nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
|
@@ -444,13 +521,21 @@ class QueryParser:
|
|
444
521
|
node_features.inc({"type": "relations"})
|
445
522
|
if self.autofilter:
|
446
523
|
entity_filters = parse_entities_to_filters(request, detected_entities)
|
447
|
-
autofilters.extend(
|
448
|
-
[translate_system_to_alias_label(e) for e in entity_filters]
|
449
|
-
)
|
524
|
+
autofilters.extend([translate_system_to_alias_label(e) for e in entity_filters])
|
450
525
|
return autofilters
|
451
526
|
|
452
527
|
async def parse_synonyms(self, request: nodereader_pb2.SearchRequest) -> None:
|
453
|
-
|
528
|
+
"""
|
529
|
+
Replace the terms in the query with an expression that will make it match with the configured synonyms.
|
530
|
+
We're using the Tantivy's query language here: https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html
|
531
|
+
|
532
|
+
Example:
|
533
|
+
- Synonyms: Foo -> Bar, Baz
|
534
|
+
- Query: "What is Foo?"
|
535
|
+
- Advanced Query: "What is (Foo OR Bar OR Baz)?"
|
536
|
+
"""
|
537
|
+
if not self.with_synonyms or not self.query:
|
538
|
+
# Nothing to do
|
454
539
|
return
|
455
540
|
|
456
541
|
if self.has_vector_search or self.has_relations_search:
|
@@ -459,27 +544,32 @@ class QueryParser:
|
|
459
544
|
"Search with custom synonyms is only supported on paragraph and document search",
|
460
545
|
)
|
461
546
|
|
462
|
-
if not self.query:
|
463
|
-
# Nothing to do
|
464
|
-
return
|
465
|
-
|
466
547
|
synonyms = await self._get_synomyns()
|
467
548
|
if synonyms is None:
|
468
549
|
# No synonyms found
|
469
550
|
return
|
470
551
|
|
471
|
-
|
472
|
-
|
473
|
-
for term in
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
552
|
+
# Calculate term variants: 'term' -> '(term OR synonym1 OR synonym2)'
|
553
|
+
variants: dict[str, str] = {}
|
554
|
+
for term, term_synonyms in synonyms.terms.items():
|
555
|
+
if len(term_synonyms.synonyms) > 0:
|
556
|
+
variants[term] = "({})".format(" OR ".join([term] + list(term_synonyms.synonyms)))
|
557
|
+
|
558
|
+
# Split the query into terms
|
559
|
+
query_terms = self.query.split()
|
560
|
+
|
561
|
+
# Remove punctuation from the query terms
|
562
|
+
clean_query_terms = [term.strip(string.punctuation) for term in query_terms]
|
563
|
+
|
564
|
+
# Replace the original terms with the variants if the cleaned term is in the variants
|
565
|
+
term_with_synonyms_found = False
|
566
|
+
for index, clean_term in enumerate(clean_query_terms):
|
567
|
+
if clean_term in variants:
|
568
|
+
term_with_synonyms_found = True
|
569
|
+
query_terms[index] = query_terms[index].replace(clean_term, variants[clean_term])
|
570
|
+
|
571
|
+
if term_with_synonyms_found:
|
572
|
+
request.advanced_query = " ".join(query_terms)
|
483
573
|
request.ClearField("body")
|
484
574
|
|
485
575
|
async def get_visual_llm_enabled(self) -> bool:
|
@@ -501,17 +591,41 @@ class QueryParser:
|
|
501
591
|
return self.max_tokens.answer
|
502
592
|
return None
|
503
593
|
|
594
|
+
async def adjust_page_size(
|
595
|
+
self,
|
596
|
+
request: nodereader_pb2.SearchRequest,
|
597
|
+
rank_fusion: Optional[RankFusionAlgorithm],
|
598
|
+
reranker: Optional[Reranker],
|
599
|
+
):
|
600
|
+
"""Adjust requested page size depending on rank fusion and reranking algorithms.
|
601
|
+
|
602
|
+
Some rerankers want more results than the requested by the user so
|
603
|
+
reranking can have more choices.
|
604
|
+
|
605
|
+
"""
|
606
|
+
rank_fusion_window = 0
|
607
|
+
if rank_fusion is not None:
|
608
|
+
rank_fusion_window = rank_fusion.window
|
609
|
+
|
610
|
+
reranker_window = 0
|
611
|
+
if reranker is not None:
|
612
|
+
reranker_window = reranker.window or 0
|
613
|
+
|
614
|
+
request.result_per_page = max(
|
615
|
+
request.result_per_page,
|
616
|
+
rank_fusion_window,
|
617
|
+
reranker_window,
|
618
|
+
)
|
619
|
+
|
504
620
|
|
505
621
|
async def paragraph_query_to_pb(
|
506
622
|
kbid: str,
|
507
|
-
features: list[SearchOptions],
|
508
623
|
rid: str,
|
509
624
|
query: str,
|
510
625
|
fields: list[str],
|
511
626
|
filters: list[str],
|
512
627
|
faceted: list[str],
|
513
|
-
|
514
|
-
page_size: int,
|
628
|
+
top_k: int,
|
515
629
|
range_creation_start: Optional[datetime] = None,
|
516
630
|
range_creation_end: Optional[datetime] = None,
|
517
631
|
range_modification_start: Optional[datetime] = None,
|
@@ -519,13 +633,37 @@ async def paragraph_query_to_pb(
|
|
519
633
|
sort: Optional[str] = None,
|
520
634
|
sort_ord: str = SortOrder.DESC.value,
|
521
635
|
with_duplicates: bool = False,
|
522
|
-
) -> nodereader_pb2.
|
523
|
-
request = nodereader_pb2.
|
524
|
-
request.
|
636
|
+
) -> nodereader_pb2.SearchRequest:
|
637
|
+
request = nodereader_pb2.SearchRequest()
|
638
|
+
request.paragraph = True
|
525
639
|
|
526
640
|
# We need to ask for all and cut later
|
527
641
|
request.page_number = 0
|
528
|
-
request.result_per_page =
|
642
|
+
request.result_per_page = top_k
|
643
|
+
|
644
|
+
request.body = query
|
645
|
+
|
646
|
+
# we don't have a specific filter only for resource_ids but key_filters
|
647
|
+
# parse "rid" and "rid/field" like ids, so it does the job
|
648
|
+
request.key_filters.append(rid)
|
649
|
+
|
650
|
+
if len(filters) > 0:
|
651
|
+
field_labels = filters
|
652
|
+
paragraph_labels: list[str] = []
|
653
|
+
if has_classification_label_filters(filters):
|
654
|
+
classification_labels = await get_classification_labels(kbid)
|
655
|
+
field_labels, paragraph_labels = split_labels_by_type(filters, classification_labels)
|
656
|
+
request.filter.field_labels.extend(field_labels)
|
657
|
+
request.filter.paragraph_labels.extend(paragraph_labels)
|
658
|
+
|
659
|
+
request.faceted.labels.extend([translate_label(facet) for facet in faceted])
|
660
|
+
request.fields.extend(fields)
|
661
|
+
|
662
|
+
if sort:
|
663
|
+
request.order.field = sort
|
664
|
+
request.order.type = sort_ord # type: ignore
|
665
|
+
|
666
|
+
request.with_duplicates = with_duplicates
|
529
667
|
|
530
668
|
if range_creation_start is not None:
|
531
669
|
request.timestamps.from_created.FromDatetime(range_creation_start)
|
@@ -539,26 +677,6 @@ async def paragraph_query_to_pb(
|
|
539
677
|
if range_modification_end is not None:
|
540
678
|
request.timestamps.to_modified.FromDatetime(range_modification_end)
|
541
679
|
|
542
|
-
if SearchOptions.PARAGRAPH in features:
|
543
|
-
request.uuid = rid
|
544
|
-
request.body = query
|
545
|
-
if len(filters) > 0:
|
546
|
-
field_labels = filters
|
547
|
-
paragraph_labels: list[str] = []
|
548
|
-
if has_classification_label_filters(filters):
|
549
|
-
classification_labels = await get_classification_labels(kbid)
|
550
|
-
field_labels, paragraph_labels = split_labels_by_type(
|
551
|
-
filters, classification_labels
|
552
|
-
)
|
553
|
-
request.filter.field_labels.extend(field_labels)
|
554
|
-
request.filter.paragraph_labels.extend(paragraph_labels)
|
555
|
-
|
556
|
-
request.faceted.labels.extend([translate_label(facet) for facet in faceted])
|
557
|
-
if sort:
|
558
|
-
request.order.field = sort
|
559
|
-
request.order.type = sort_ord # type: ignore
|
560
|
-
request.fields.extend(fields)
|
561
|
-
|
562
680
|
return request
|
563
681
|
|
564
682
|
|
@@ -566,11 +684,13 @@ async def paragraph_query_to_pb(
|
|
566
684
|
async def query_information(
|
567
685
|
kbid: str,
|
568
686
|
query: str,
|
687
|
+
semantic_model: Optional[str],
|
569
688
|
generative_model: Optional[str] = None,
|
570
689
|
rephrase: bool = False,
|
690
|
+
rephrase_prompt: Optional[str] = None,
|
571
691
|
) -> QueryInfo:
|
572
692
|
predict = get_predict()
|
573
|
-
return await predict.query(kbid, query, generative_model, rephrase)
|
693
|
+
return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
|
574
694
|
|
575
695
|
|
576
696
|
@query_parse_dependency_observer.wrap({"type": "detect_entities"})
|
@@ -610,9 +730,7 @@ def expand_entities(
|
|
610
730
|
)
|
611
731
|
|
612
732
|
if entity.value in duplicated_entities_by_value[entity.subtype]:
|
613
|
-
source_duplicate = duplicated_entities_by_value[entity.subtype][
|
614
|
-
entity.value
|
615
|
-
]
|
733
|
+
source_duplicate = duplicated_entities_by_value[entity.subtype][entity.value]
|
616
734
|
result_entities[source_duplicate] = utils_pb2.RelationNode(
|
617
735
|
ntype=utils_pb2.RelationNode.NodeType.ENTITY,
|
618
736
|
subtype=entity.subtype,
|
@@ -651,10 +769,10 @@ def parse_entities_to_filters(
|
|
651
769
|
# So far, autofilters feature will only yield 'and' expressions with the detected entities.
|
652
770
|
# More complex autofilters can be added here if we leverage the query endpoint.
|
653
771
|
expanded_expression = {"and": [{"literal": entity} for entity in added_filters]}
|
654
|
-
if request.filter.
|
655
|
-
expression = json.loads(request.filter.
|
772
|
+
if request.filter.labels_expression:
|
773
|
+
expression = json.loads(request.filter.labels_expression)
|
656
774
|
expanded_expression["and"].append(expression)
|
657
|
-
request.filter.
|
775
|
+
request.filter.labels_expression = json.dumps(expanded_expression)
|
658
776
|
return added_filters
|
659
777
|
|
660
778
|
|
@@ -668,6 +786,7 @@ def suggest_query_to_pb(
|
|
668
786
|
range_creation_end: Optional[datetime] = None,
|
669
787
|
range_modification_start: Optional[datetime] = None,
|
670
788
|
range_modification_end: Optional[datetime] = None,
|
789
|
+
hidden: Optional[bool] = None,
|
671
790
|
) -> nodereader_pb2.SuggestRequest:
|
672
791
|
request = nodereader_pb2.SuggestRequest()
|
673
792
|
|
@@ -677,10 +796,21 @@ def suggest_query_to_pb(
|
|
677
796
|
|
678
797
|
if SuggestOptions.PARAGRAPH in features:
|
679
798
|
request.features.append(nodereader_pb2.SuggestFeatures.PARAGRAPHS)
|
680
|
-
filters = [translate_label(fltr) for fltr in filters]
|
681
|
-
request.filter.field_labels.extend(filters)
|
682
799
|
request.fields.extend(fields)
|
683
800
|
|
801
|
+
if hidden is not None:
|
802
|
+
if hidden:
|
803
|
+
filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
|
804
|
+
else:
|
805
|
+
filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore
|
806
|
+
|
807
|
+
expression = convert_to_node_filters(filters)
|
808
|
+
if expression:
|
809
|
+
expression = translate_label_filters(expression)
|
810
|
+
|
811
|
+
request.filter.field_labels.extend(flatten_filter_literals(expression))
|
812
|
+
request.filter.labels_expression = json.dumps(expression)
|
813
|
+
|
684
814
|
if range_creation_start is not None:
|
685
815
|
request.timestamps.from_created.FromDatetime(range_creation_start)
|
686
816
|
if range_creation_end is not None:
|
@@ -705,28 +835,26 @@ PROCESSING_STATUS_TO_PB_MAP = {
|
|
705
835
|
|
706
836
|
@query_parse_dependency_observer.wrap({"type": "synonyms"})
|
707
837
|
async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
|
708
|
-
|
709
|
-
|
838
|
+
async with get_driver().transaction(read_only=True) as txn:
|
839
|
+
return await datamanagers.synonyms.get(txn, kbid=kbid)
|
710
840
|
|
711
841
|
|
712
842
|
@query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
|
713
843
|
async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
|
714
|
-
|
715
|
-
|
844
|
+
async with get_driver().transaction(read_only=True) as txn:
|
845
|
+
return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
|
716
846
|
|
717
847
|
|
718
848
|
@query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
|
719
849
|
async def get_deleted_entity_groups(kbid: str) -> list[str]:
|
720
|
-
|
721
|
-
|
722
|
-
(await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups
|
723
|
-
)
|
850
|
+
async with get_driver().transaction(read_only=True) as txn:
|
851
|
+
return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
|
724
852
|
|
725
853
|
|
726
854
|
@query_parse_dependency_observer.wrap({"type": "classification_labels"})
|
727
855
|
async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
|
728
|
-
|
729
|
-
|
856
|
+
async with get_driver().transaction(read_only=True) as txn:
|
857
|
+
return await datamanagers.labels.get_labels(txn, kbid=kbid)
|
730
858
|
|
731
859
|
|
732
860
|
def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
|
@@ -745,8 +873,16 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
|
|
745
873
|
"Paragraph labels can only be used with 'all' filter",
|
746
874
|
)
|
747
875
|
for term in filters["and"]:
|
748
|
-
# Nested expressions are not allowed with paragraph labels
|
749
|
-
if "
|
876
|
+
# Nested expressions are not allowed with paragraph labels (only "literal" and "not(literal)")
|
877
|
+
if "not" in term:
|
878
|
+
subterm = term["not"]
|
879
|
+
if "literal" not in subterm:
|
880
|
+
# AND (NOT( X )) where X is anything other than a literal
|
881
|
+
raise InvalidQueryError(
|
882
|
+
"filters",
|
883
|
+
"Paragraph labels can only be used with 'all' filter",
|
884
|
+
)
|
885
|
+
elif "literal" not in term:
|
750
886
|
raise InvalidQueryError(
|
751
887
|
"filters",
|
752
888
|
"Paragraph labels can only be used with 'all' filter",
|
@@ -754,12 +890,31 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
|
|
754
890
|
|
755
891
|
|
756
892
|
@alru_cache(maxsize=None)
|
757
|
-
async def get_matryoshka_dimension_cached(kbid: str) -> Optional[int]:
|
893
|
+
async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
|
758
894
|
# This can be safely cached as the matryoshka dimension is not expected to change
|
759
|
-
return await get_matryoshka_dimension(kbid)
|
895
|
+
return await get_matryoshka_dimension(kbid, vectorset)
|
760
896
|
|
761
897
|
|
762
898
|
@query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
|
763
|
-
async def get_matryoshka_dimension(kbid: str) -> Optional[int]:
|
764
|
-
|
765
|
-
|
899
|
+
async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
|
900
|
+
async with get_driver().transaction(read_only=True) as txn:
|
901
|
+
matryoshka_dimension = None
|
902
|
+
if not vectorset:
|
903
|
+
# XXX this should be migrated once we remove the "default" vectorset
|
904
|
+
# concept
|
905
|
+
matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
|
906
|
+
else:
|
907
|
+
vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
|
908
|
+
if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
|
909
|
+
matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
|
910
|
+
|
911
|
+
return matryoshka_dimension
|
912
|
+
|
913
|
+
|
914
|
+
def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
|
915
|
+
return {
|
916
|
+
SortField.SCORE: None,
|
917
|
+
SortField.CREATED: nodereader_pb2.OrderBy.OrderField.CREATED,
|
918
|
+
SortField.MODIFIED: nodereader_pb2.OrderBy.OrderField.MODIFIED,
|
919
|
+
SortField.TITLE: None,
|
920
|
+
}[obj]
|