nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -18,13 +18,13 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
-
from
|
22
|
-
from nucliadb.migrator.utils import get_migrations
|
21
|
+
from typing import TypeVar
|
23
22
|
|
23
|
+
T = TypeVar("T")
|
24
24
|
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
return
|
25
|
+
|
26
|
+
def cut_page(items: list[T], top_k: int) -> tuple[list[T], bool]:
|
27
|
+
"""Return a slice of `items` representing the specified page and a boolean
|
28
|
+
indicating whether there is a next page or not"""
|
29
|
+
next_page = len(items) > top_k
|
30
|
+
return items[:top_k], next_page
|
nucliadb/search/search/fetch.py
CHANGED
@@ -17,26 +17,25 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import asyncio
|
20
21
|
from contextvars import ContextVar
|
21
22
|
from typing import Optional
|
22
23
|
|
23
|
-
from
|
24
|
-
from
|
25
|
-
|
26
|
-
from nucliadb.ingest.orm.resource import KB_REVERSE
|
24
|
+
from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB
|
25
|
+
from nucliadb.common.maindb.utils import get_driver
|
27
26
|
from nucliadb.ingest.orm.resource import Resource as ResourceORM
|
28
27
|
from nucliadb.ingest.serialize import managed_serialize
|
29
|
-
from nucliadb.middleware.transaction import get_read_only_transaction
|
30
28
|
from nucliadb.search import SERVICE_NAME, logger
|
29
|
+
from nucliadb.search.search import cache
|
31
30
|
from nucliadb_models.common import FieldTypeName
|
32
31
|
from nucliadb_models.resource import ExtractedDataTypeName, Resource
|
33
32
|
from nucliadb_models.search import ResourceProperties
|
33
|
+
from nucliadb_protos.nodereader_pb2 import DocumentResult, ParagraphResult
|
34
|
+
from nucliadb_protos.resources_pb2 import Paragraph
|
35
|
+
from nucliadb_utils import const
|
36
|
+
from nucliadb_utils.utilities import has_feature
|
34
37
|
|
35
|
-
|
36
|
-
|
37
|
-
rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar(
|
38
|
-
"rcache", default=None
|
39
|
-
)
|
38
|
+
rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar("rcache", default=None)
|
40
39
|
|
41
40
|
|
42
41
|
async def fetch_resources(
|
@@ -46,20 +45,34 @@ async def fetch_resources(
|
|
46
45
|
field_type_filter: list[FieldTypeName],
|
47
46
|
extracted: list[ExtractedDataTypeName],
|
48
47
|
) -> dict[str, Resource]:
|
48
|
+
if ResourceProperties.EXTRACTED in show and has_feature(
|
49
|
+
const.Features.IGNORE_EXTRACTED_IN_SEARCH, context={"kbid": kbid}, default=False
|
50
|
+
):
|
51
|
+
# Returning extracted metadata in search results is deprecated and this flag
|
52
|
+
# will be set to True for all KBs in the future.
|
53
|
+
show.remove(ResourceProperties.EXTRACTED)
|
54
|
+
extracted = []
|
55
|
+
|
49
56
|
result = {}
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
57
|
+
async with get_driver().transaction(read_only=True) as txn:
|
58
|
+
tasks = []
|
59
|
+
for resource in resources:
|
60
|
+
tasks.append(
|
61
|
+
asyncio.create_task(
|
62
|
+
managed_serialize(
|
63
|
+
txn,
|
64
|
+
kbid,
|
65
|
+
resource,
|
66
|
+
show,
|
67
|
+
field_type_filter=field_type_filter,
|
68
|
+
extracted=extracted,
|
69
|
+
service_name=SERVICE_NAME,
|
70
|
+
)
|
71
|
+
)
|
72
|
+
)
|
73
|
+
for resource, serialization in zip(resources, await asyncio.gather(*tasks)):
|
74
|
+
if serialization is not None:
|
75
|
+
result[resource] = serialization
|
63
76
|
return result
|
64
77
|
|
65
78
|
|
@@ -67,7 +80,7 @@ async def get_paragraph_from_resource(
|
|
67
80
|
orm_resource: ResourceORM, result: ParagraphResult
|
68
81
|
) -> Optional[Paragraph]:
|
69
82
|
_, field_type, field = result.field.split("/")
|
70
|
-
field_type_int =
|
83
|
+
field_type_int = FIELD_TYPE_STR_TO_PB[field_type]
|
71
84
|
field_obj = await orm_resource.get_field(field, field_type_int, load=False)
|
72
85
|
field_metadata = await field_obj.get_field_metadata()
|
73
86
|
paragraph = None
|
@@ -81,7 +94,7 @@ async def get_paragraph_from_resource(
|
|
81
94
|
|
82
95
|
|
83
96
|
async def get_labels_resource(result: DocumentResult, kbid: str) -> list[str]:
|
84
|
-
orm_resource = await
|
97
|
+
orm_resource = await cache.get_resource(kbid, result.uuid)
|
85
98
|
|
86
99
|
if orm_resource is None:
|
87
100
|
logger.error(f"{result.uuid} does not exist on DB")
|
@@ -97,7 +110,7 @@ async def get_labels_resource(result: DocumentResult, kbid: str) -> list[str]:
|
|
97
110
|
|
98
111
|
|
99
112
|
async def get_labels_paragraph(result: ParagraphResult, kbid: str) -> list[str]:
|
100
|
-
orm_resource = await
|
113
|
+
orm_resource = await cache.get_resource(kbid, result.uuid)
|
101
114
|
|
102
115
|
if orm_resource is None:
|
103
116
|
logger.error(f"{result.uuid} does not exist on DB")
|
@@ -110,7 +123,7 @@ async def get_labels_paragraph(result: ParagraphResult, kbid: str) -> list[str]:
|
|
110
123
|
labels.append(f"{classification.labelset}/{classification.label}")
|
111
124
|
|
112
125
|
_, field_type, field = result.field.split("/")
|
113
|
-
field_type_int =
|
126
|
+
field_type_int = FIELD_TYPE_STR_TO_PB[field_type]
|
114
127
|
field_obj = await orm_resource.get_field(field, field_type_int, load=False)
|
115
128
|
field_metadata = await field_obj.get_field_metadata()
|
116
129
|
if field_metadata:
|
@@ -131,21 +144,15 @@ async def get_labels_paragraph(result: ParagraphResult, kbid: str) -> list[str]:
|
|
131
144
|
async def get_seconds_paragraph(
|
132
145
|
result: ParagraphResult, kbid: str
|
133
146
|
) -> Optional[tuple[list[int], list[int]]]:
|
134
|
-
orm_resource = await
|
147
|
+
orm_resource = await cache.get_resource(kbid, result.uuid)
|
135
148
|
|
136
149
|
if orm_resource is None:
|
137
150
|
logger.error(f"{result.uuid} does not exist on DB")
|
138
151
|
return None
|
139
152
|
|
140
|
-
paragraph = await get_paragraph_from_resource(
|
141
|
-
orm_resource=orm_resource, result=result
|
142
|
-
)
|
153
|
+
paragraph = await get_paragraph_from_resource(orm_resource=orm_resource, result=result)
|
143
154
|
|
144
|
-
if (
|
145
|
-
paragraph is not None
|
146
|
-
and len(paragraph.end_seconds) > 0
|
147
|
-
and paragraph.end_seconds[0] > 0
|
148
|
-
):
|
155
|
+
if paragraph is not None and len(paragraph.end_seconds) > 0 and paragraph.end_seconds[0] > 0:
|
149
156
|
return (list(paragraph.start_seconds), list(paragraph.end_seconds))
|
150
157
|
|
151
158
|
return None
|
@@ -59,9 +59,7 @@ def translate_label(literal: str) -> str:
|
|
59
59
|
if len(literal) == 0:
|
60
60
|
raise InvalidQueryError("filters", "Invalid empty label")
|
61
61
|
if literal[0] != "/":
|
62
|
-
raise InvalidQueryError(
|
63
|
-
"filters", f"Invalid label. It must start with a `/`: {literal}"
|
64
|
-
)
|
62
|
+
raise InvalidQueryError("filters", f"Invalid label. It must start with a `/`: {literal}")
|
65
63
|
return translate_alias_to_system_label(literal)
|
66
64
|
|
67
65
|
|
@@ -109,13 +107,9 @@ def split_labels_by_type(
|
|
109
107
|
return field_labels, paragraph_labels
|
110
108
|
|
111
109
|
|
112
|
-
def is_paragraph_labelset_kind(
|
113
|
-
labelset_id: str, classification_labels: knowledgebox_pb2.Labels
|
114
|
-
) -> bool:
|
110
|
+
def is_paragraph_labelset_kind(labelset_id: str, classification_labels: knowledgebox_pb2.Labels) -> bool:
|
115
111
|
try:
|
116
|
-
labelset: Optional[knowledgebox_pb2.LabelSet] = (
|
117
|
-
classification_labels.labelset.get(labelset_id)
|
118
|
-
)
|
112
|
+
labelset: Optional[knowledgebox_pb2.LabelSet] = classification_labels.labelset.get(labelset_id)
|
119
113
|
if labelset is None:
|
120
114
|
return False
|
121
115
|
return knowledgebox_pb2.LabelSet.LabelSetKind.PARAGRAPHS in labelset.kind
|
@@ -124,32 +118,32 @@ def is_paragraph_labelset_kind(
|
|
124
118
|
return False
|
125
119
|
|
126
120
|
|
127
|
-
def
|
121
|
+
def flatten_filter_literals(filters: Union[list[str], dict[str, Any]]) -> list[str]:
|
128
122
|
if isinstance(filters, list):
|
129
123
|
return filters
|
130
124
|
else:
|
131
|
-
return list(
|
125
|
+
return list(iter_filter_expression_literals(filters))
|
132
126
|
|
133
127
|
|
134
|
-
def
|
128
|
+
def iter_filter_expression_literals(expression: dict[str, Any]) -> Iterator[str]:
|
135
129
|
if "literal" in expression:
|
136
130
|
yield expression["literal"]
|
137
131
|
return
|
138
132
|
|
139
133
|
if "not" in expression:
|
140
|
-
for label in
|
134
|
+
for label in iter_filter_expression_literals(expression["not"]):
|
141
135
|
yield label
|
142
136
|
return
|
143
137
|
|
144
138
|
if "and" in expression:
|
145
139
|
for and_term in expression["and"]:
|
146
|
-
for label in
|
140
|
+
for label in iter_filter_expression_literals(and_term):
|
147
141
|
yield label
|
148
142
|
return
|
149
143
|
|
150
144
|
if "or" in expression:
|
151
145
|
for or_term in expression["or"]:
|
152
|
-
for label in
|
146
|
+
for label in iter_filter_expression_literals(or_term):
|
153
147
|
yield label
|
154
148
|
return
|
155
149
|
|
nucliadb/search/search/find.py
CHANGED
@@ -18,13 +18,39 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import logging
|
21
|
+
from dataclasses import dataclass
|
21
22
|
from time import time
|
22
23
|
from typing import Optional
|
23
24
|
|
25
|
+
from nucliadb.common.external_index_providers.base import ExternalIndexManager
|
26
|
+
from nucliadb.common.external_index_providers.manager import get_external_index_manager
|
27
|
+
from nucliadb.common.models_utils import to_proto
|
24
28
|
from nucliadb.search.requesters.utils import Method, debug_nodes_info, node_query
|
25
|
-
from nucliadb.search.search.find_merge import
|
29
|
+
from nucliadb.search.search.find_merge import (
|
30
|
+
build_find_response,
|
31
|
+
compose_find_resources,
|
32
|
+
hydrate_and_rerank,
|
33
|
+
)
|
34
|
+
from nucliadb.search.search.hydrator import (
|
35
|
+
ResourceHydrationOptions,
|
36
|
+
TextBlockHydrationOptions,
|
37
|
+
)
|
38
|
+
from nucliadb.search.search.metrics import (
|
39
|
+
RAGMetrics,
|
40
|
+
)
|
26
41
|
from nucliadb.search.search.query import QueryParser
|
42
|
+
from nucliadb.search.search.query_parser.parser import parse_find
|
43
|
+
from nucliadb.search.search.rank_fusion import (
|
44
|
+
RankFusionAlgorithm,
|
45
|
+
get_rank_fusion,
|
46
|
+
)
|
47
|
+
from nucliadb.search.search.rerankers import (
|
48
|
+
Reranker,
|
49
|
+
RerankingOptions,
|
50
|
+
get_reranker,
|
51
|
+
)
|
27
52
|
from nucliadb.search.search.utils import (
|
53
|
+
filter_hidden_resources,
|
28
54
|
min_score_from_payload,
|
29
55
|
should_disable_vector_search,
|
30
56
|
)
|
@@ -32,6 +58,7 @@ from nucliadb.search.settings import settings
|
|
32
58
|
from nucliadb_models.search import (
|
33
59
|
FindRequest,
|
34
60
|
KnowledgeboxFindResults,
|
61
|
+
MinScore,
|
35
62
|
NucliaDBClientType,
|
36
63
|
SearchOptions,
|
37
64
|
)
|
@@ -47,72 +74,76 @@ async def find(
|
|
47
74
|
x_nucliadb_user: str,
|
48
75
|
x_forwarded_for: str,
|
49
76
|
generative_model: Optional[str] = None,
|
77
|
+
metrics: RAGMetrics = RAGMetrics(),
|
50
78
|
) -> tuple[KnowledgeboxFindResults, bool, QueryParser]:
|
51
|
-
|
52
|
-
|
79
|
+
external_index_manager = await get_external_index_manager(kbid=kbid)
|
80
|
+
if external_index_manager is not None:
|
81
|
+
return await _external_index_retrieval(
|
82
|
+
kbid,
|
83
|
+
item,
|
84
|
+
external_index_manager,
|
85
|
+
generative_model,
|
86
|
+
)
|
87
|
+
else:
|
88
|
+
return await _index_node_retrieval(
|
89
|
+
kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for, generative_model, metrics
|
90
|
+
)
|
53
91
|
|
54
|
-
item.min_score = min_score_from_payload(item.min_score)
|
55
92
|
|
56
|
-
|
57
|
-
|
58
|
-
|
93
|
+
async def _index_node_retrieval(
|
94
|
+
kbid: str,
|
95
|
+
item: FindRequest,
|
96
|
+
x_ndb_client: NucliaDBClientType,
|
97
|
+
x_nucliadb_user: str,
|
98
|
+
x_forwarded_for: str,
|
99
|
+
generative_model: Optional[str] = None,
|
100
|
+
metrics: RAGMetrics = RAGMetrics(),
|
101
|
+
) -> tuple[KnowledgeboxFindResults, bool, QueryParser]:
|
102
|
+
audit = get_audit()
|
103
|
+
start_time = time()
|
59
104
|
|
60
|
-
query_parser =
|
61
|
-
kbid=
|
62
|
-
features=item.features,
|
63
|
-
query=item.query,
|
64
|
-
filters=item.filters,
|
65
|
-
faceted=None,
|
66
|
-
sort=None,
|
67
|
-
page_number=item.page_number,
|
68
|
-
page_size=item.page_size,
|
69
|
-
min_score=item.min_score,
|
70
|
-
range_creation_start=item.range_creation_start,
|
71
|
-
range_creation_end=item.range_creation_end,
|
72
|
-
range_modification_start=item.range_modification_start,
|
73
|
-
range_modification_end=item.range_modification_end,
|
74
|
-
fields=item.fields,
|
75
|
-
user_vector=item.vector,
|
76
|
-
with_duplicates=item.with_duplicates,
|
77
|
-
with_synonyms=item.with_synonyms,
|
78
|
-
autofilter=item.autofilter,
|
79
|
-
key_filters=item.resource_filters,
|
80
|
-
security=item.security,
|
81
|
-
generative_model=generative_model,
|
82
|
-
rephrase=item.rephrase,
|
83
|
-
)
|
84
|
-
pb_query, incomplete_results, autofilters = await query_parser.parse()
|
85
|
-
results, query_incomplete_results, queried_nodes = await node_query(
|
86
|
-
kbid, Method.SEARCH, pb_query, target_shard_replicas=item.shards
|
105
|
+
query_parser, rank_fusion, reranker = await query_parser_from_find_request(
|
106
|
+
kbid, item, generative_model=generative_model
|
87
107
|
)
|
108
|
+
with metrics.time("query_parse"):
|
109
|
+
pb_query, incomplete_results, autofilters = await query_parser.parse()
|
110
|
+
|
111
|
+
with metrics.time("node_query"):
|
112
|
+
results, query_incomplete_results, queried_nodes = await node_query(
|
113
|
+
kbid, Method.SEARCH, pb_query, target_shard_replicas=item.shards
|
114
|
+
)
|
88
115
|
incomplete_results = incomplete_results or query_incomplete_results
|
89
116
|
|
90
|
-
#
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
117
|
+
# Rank fusion merge, cut, hydrate and rerank
|
118
|
+
with metrics.time("results_merge"):
|
119
|
+
search_results = await build_find_response(
|
120
|
+
results,
|
121
|
+
kbid=kbid,
|
122
|
+
query=pb_query.body,
|
123
|
+
relation_subgraph_query=pb_query.relations.subgraph,
|
124
|
+
min_score_bm25=pb_query.min_score_bm25,
|
125
|
+
min_score_semantic=pb_query.min_score_semantic,
|
126
|
+
top_k=item.top_k,
|
127
|
+
show=item.show,
|
128
|
+
extracted=item.extracted,
|
129
|
+
field_type_filter=item.field_type_filter,
|
130
|
+
highlight=item.highlight,
|
131
|
+
rank_fusion_algorithm=rank_fusion,
|
132
|
+
reranker=reranker,
|
133
|
+
)
|
104
134
|
|
105
135
|
search_time = time() - start_time
|
106
136
|
if audit is not None:
|
107
|
-
|
137
|
+
audit.search(
|
108
138
|
kbid,
|
109
139
|
x_nucliadb_user,
|
110
|
-
|
140
|
+
to_proto.client_type(x_ndb_client),
|
111
141
|
x_forwarded_for,
|
112
142
|
pb_query,
|
113
143
|
search_time,
|
114
144
|
len(search_results.resources),
|
115
145
|
)
|
146
|
+
|
116
147
|
if item.debug:
|
117
148
|
search_results.nodes = debug_nodes_info(queried_nodes)
|
118
149
|
|
@@ -120,17 +151,147 @@ async def find(
|
|
120
151
|
search_results.shards = queried_shards
|
121
152
|
search_results.autofilters = autofilters
|
122
153
|
|
123
|
-
if
|
154
|
+
if metrics.elapsed("node_query") > settings.slow_node_query_log_threshold:
|
124
155
|
logger.warning(
|
125
|
-
"Slow query",
|
156
|
+
"Slow node query",
|
157
|
+
extra={
|
158
|
+
"kbid": kbid,
|
159
|
+
"user": x_nucliadb_user,
|
160
|
+
"client": x_ndb_client,
|
161
|
+
"query": item.model_dump_json(),
|
162
|
+
"time": search_time,
|
163
|
+
"nodes": debug_nodes_info(queried_nodes),
|
164
|
+
"durations": metrics.steps(),
|
165
|
+
},
|
166
|
+
)
|
167
|
+
elif search_time > settings.slow_find_log_threshold:
|
168
|
+
logger.info(
|
169
|
+
"Slow find query",
|
126
170
|
extra={
|
127
171
|
"kbid": kbid,
|
128
172
|
"user": x_nucliadb_user,
|
129
173
|
"client": x_ndb_client,
|
130
|
-
"query": item.
|
174
|
+
"query": item.model_dump_json(),
|
131
175
|
"time": search_time,
|
132
176
|
"nodes": debug_nodes_info(queried_nodes),
|
177
|
+
"durations": metrics.steps(),
|
133
178
|
},
|
134
179
|
)
|
135
180
|
|
136
181
|
return search_results, incomplete_results, query_parser
|
182
|
+
|
183
|
+
|
184
|
+
async def _external_index_retrieval(
|
185
|
+
kbid: str,
|
186
|
+
item: FindRequest,
|
187
|
+
external_index_manager: ExternalIndexManager,
|
188
|
+
generative_model: Optional[str] = None,
|
189
|
+
) -> tuple[KnowledgeboxFindResults, bool, QueryParser]:
|
190
|
+
"""
|
191
|
+
Parse the query, query the external index, and hydrate the results.
|
192
|
+
"""
|
193
|
+
# Parse query
|
194
|
+
query_parser, _, reranker = await query_parser_from_find_request(
|
195
|
+
kbid, item, generative_model=generative_model
|
196
|
+
)
|
197
|
+
search_request, incomplete_results, _ = await query_parser.parse()
|
198
|
+
|
199
|
+
# Query index
|
200
|
+
query_results = await external_index_manager.query(search_request) # noqa
|
201
|
+
|
202
|
+
# Hydrate and rerank results
|
203
|
+
text_blocks, resources, best_matches = await hydrate_and_rerank(
|
204
|
+
query_results.iter_matching_text_blocks(),
|
205
|
+
kbid,
|
206
|
+
resource_hydration_options=ResourceHydrationOptions(
|
207
|
+
show=item.show,
|
208
|
+
extracted=item.extracted,
|
209
|
+
field_type_filter=item.field_type_filter,
|
210
|
+
),
|
211
|
+
text_block_hydration_options=TextBlockHydrationOptions(),
|
212
|
+
reranker=reranker,
|
213
|
+
reranking_options=RerankingOptions(
|
214
|
+
kbid=kbid,
|
215
|
+
query=search_request.body,
|
216
|
+
),
|
217
|
+
top_k=query_parser.top_k,
|
218
|
+
)
|
219
|
+
find_resources = compose_find_resources(text_blocks, resources)
|
220
|
+
|
221
|
+
results_min_score = MinScore(
|
222
|
+
bm25=0,
|
223
|
+
semantic=query_parser.min_score.semantic,
|
224
|
+
)
|
225
|
+
retrieval_results = KnowledgeboxFindResults(
|
226
|
+
resources=find_resources,
|
227
|
+
query=item.query,
|
228
|
+
total=0,
|
229
|
+
page_number=0,
|
230
|
+
page_size=item.top_k,
|
231
|
+
relations=None, # Not implemented for external indexes yet
|
232
|
+
autofilters=[], # Not implemented for external indexes yet
|
233
|
+
min_score=results_min_score,
|
234
|
+
best_matches=best_matches,
|
235
|
+
# These are not used for external indexes
|
236
|
+
shards=None,
|
237
|
+
nodes=None,
|
238
|
+
)
|
239
|
+
|
240
|
+
return retrieval_results, incomplete_results, query_parser
|
241
|
+
|
242
|
+
|
243
|
+
@dataclass
|
244
|
+
class ScoredParagraph:
|
245
|
+
id: str
|
246
|
+
score: float
|
247
|
+
|
248
|
+
|
249
|
+
async def query_parser_from_find_request(
|
250
|
+
kbid: str, item: FindRequest, *, generative_model: Optional[str] = None
|
251
|
+
) -> tuple[QueryParser, RankFusionAlgorithm, Reranker]:
|
252
|
+
item.min_score = min_score_from_payload(item.min_score)
|
253
|
+
|
254
|
+
if SearchOptions.SEMANTIC in item.features:
|
255
|
+
if should_disable_vector_search(item):
|
256
|
+
item.features.remove(SearchOptions.SEMANTIC)
|
257
|
+
|
258
|
+
hidden = await filter_hidden_resources(kbid, item.show_hidden)
|
259
|
+
|
260
|
+
# XXX this is becoming the new /find query parsing, this should be moved to
|
261
|
+
# a cleaner abstraction
|
262
|
+
|
263
|
+
parsed = parse_find(item)
|
264
|
+
|
265
|
+
rank_fusion = get_rank_fusion(parsed.rank_fusion)
|
266
|
+
reranker = get_reranker(parsed.reranker)
|
267
|
+
|
268
|
+
query_parser = QueryParser(
|
269
|
+
kbid=kbid,
|
270
|
+
features=item.features,
|
271
|
+
query=item.query,
|
272
|
+
label_filters=item.filters,
|
273
|
+
keyword_filters=item.keyword_filters,
|
274
|
+
faceted=None,
|
275
|
+
sort=None,
|
276
|
+
top_k=item.top_k,
|
277
|
+
min_score=item.min_score,
|
278
|
+
range_creation_start=item.range_creation_start,
|
279
|
+
range_creation_end=item.range_creation_end,
|
280
|
+
range_modification_start=item.range_modification_start,
|
281
|
+
range_modification_end=item.range_modification_end,
|
282
|
+
fields=item.fields,
|
283
|
+
user_vector=item.vector,
|
284
|
+
vectorset=item.vectorset,
|
285
|
+
with_duplicates=item.with_duplicates,
|
286
|
+
with_synonyms=item.with_synonyms,
|
287
|
+
autofilter=item.autofilter,
|
288
|
+
key_filters=item.resource_filters,
|
289
|
+
security=item.security,
|
290
|
+
generative_model=generative_model,
|
291
|
+
rephrase=item.rephrase,
|
292
|
+
rephrase_prompt=item.rephrase_prompt,
|
293
|
+
hidden=hidden,
|
294
|
+
rank_fusion=rank_fusion,
|
295
|
+
reranker=reranker,
|
296
|
+
)
|
297
|
+
return (query_parser, rank_fusion, reranker)
|