nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
|
22
|
+
class ParserError(ValueError):
    """Error raised when a search request cannot be parsed.

    Subclasses ``ValueError``, so handlers catching ``ValueError`` also
    catch parsing failures.
    """
|
@@ -0,0 +1,101 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from dataclasses import dataclass
|
22
|
+
from datetime import datetime
|
23
|
+
from typing import Any, Optional
|
24
|
+
|
25
|
+
from pydantic import (
|
26
|
+
BaseModel,
|
27
|
+
Field,
|
28
|
+
)
|
29
|
+
|
30
|
+
from nucliadb_models import search as search_models
|
31
|
+
|
32
|
+
### Retrieval
|
33
|
+
|
34
|
+
# filters
|
35
|
+
|
36
|
+
|
37
|
+
class DateTimeFilter(BaseModel):
    # Pair of optional datetime bounds. A bound left as None is unset.
    after: Optional[datetime] = Field(default=None)  # aka, start
    before: Optional[datetime] = Field(default=None)  # aka, end
|
40
|
+
|
41
|
+
|
42
|
+
# rank fusion
|
43
|
+
|
44
|
+
|
45
|
+
class RankFusion(BaseModel):
    # Base model for rank fusion settings.
    # `window` is required and validated to be at most 500.
    window: int = Field(..., le=500)
|
47
|
+
|
48
|
+
|
49
|
+
class ReciprocalRankFusion(RankFusion):
    # Reciprocal rank fusion settings, on top of the inherited `window`.
    # `k` defaults to 60.0.
    k: float = 60.0
    # A fresh weights object is built per instance when none is given.
    boosting: search_models.ReciprocalRankFusionWeights = Field(
        default_factory=search_models.ReciprocalRankFusionWeights,
    )
|
54
|
+
|
55
|
+
|
56
|
+
# reranking
|
57
|
+
|
58
|
+
|
59
|
+
class Reranker(BaseModel):
    # Common base for all reranker settings models; declares no fields.
    pass
|
60
|
+
|
61
|
+
|
62
|
+
class NoopReranker(Reranker):
    # Marker type for the "noop" reranker option; carries no settings.
    pass
|
63
|
+
|
64
|
+
|
65
|
+
class MultiMatchBoosterReranker(Reranker):
    # Marker type for the multi-match booster reranker; carries no settings.
    pass
|
66
|
+
|
67
|
+
|
68
|
+
class PredictReranker(Reranker):
    # Settings for the predict reranker.
    # `window` is required and validated to be at most 200.
    window: int = Field(..., le=200)
|
70
|
+
|
71
|
+
|
72
|
+
# retrieval operation
|
73
|
+
|
74
|
+
|
75
|
+
@dataclass
class UnitRetrieval:
    """Parsed parameters of a single retrieval operation.

    Groups the number of results requested with the rank fusion and
    reranker settings to apply.
    """

    # Number of results requested.
    top_k: int
    # Rank fusion settings (a `RankFusion` or subclass instance).
    rank_fusion: RankFusion
    # Reranker settings (a `Reranker` or subclass instance).
    reranker: Reranker
|
80
|
+
|
81
|
+
|
82
|
+
### Catalog
|
83
|
+
|
84
|
+
|
85
|
+
class CatalogFilters(BaseModel):
    # Filters applied to a catalog query.

    # Label filter expression tree; defaults to an empty dict.
    labels: dict[str, Any] = Field(
        default_factory=dict,
        description="Labels filter expression, like, `{and: {not: ...}, ...}`",
    )
    # Datetime bounds on creation and modification (both required).
    creation: DateTimeFilter
    modification: DateTimeFilter
    # Optional processing-status filter; None means it is unset.
    with_status: Optional[search_models.ResourceProcessingStatus] = Field(default=None)
|
92
|
+
|
93
|
+
|
94
|
+
class CatalogQuery(BaseModel):
    # Fully parsed catalog request. Every field is required.

    # Knowledge box identifier the query runs against.
    kbid: str
    # Query string.
    query: str
    # Label, date and status filters.
    filters: CatalogFilters
    # Sorting options.
    sort: search_models.SortOptions
    # Facets requested.
    faceted: list[str]
    # Pagination.
    page_size: int
    page_number: int
|
@@ -0,0 +1,183 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from typing import Any
|
22
|
+
|
23
|
+
from pydantic import ValidationError
|
24
|
+
|
25
|
+
from nucliadb.search.search.filters import (
|
26
|
+
convert_to_node_filters,
|
27
|
+
translate_label_filters,
|
28
|
+
)
|
29
|
+
from nucliadb.search.search.query_parser.exceptions import ParserError
|
30
|
+
from nucliadb.search.search.query_parser.models import (
|
31
|
+
CatalogFilters,
|
32
|
+
CatalogQuery,
|
33
|
+
DateTimeFilter,
|
34
|
+
MultiMatchBoosterReranker,
|
35
|
+
NoopReranker,
|
36
|
+
PredictReranker,
|
37
|
+
RankFusion,
|
38
|
+
ReciprocalRankFusion,
|
39
|
+
Reranker,
|
40
|
+
UnitRetrieval,
|
41
|
+
)
|
42
|
+
from nucliadb_models import search as search_models
|
43
|
+
from nucliadb_models.labels import LABEL_HIDDEN
|
44
|
+
from nucliadb_models.search import (
|
45
|
+
Filter,
|
46
|
+
FindRequest,
|
47
|
+
SortField,
|
48
|
+
SortOptions,
|
49
|
+
SortOrder,
|
50
|
+
)
|
51
|
+
|
52
|
+
|
53
|
+
def parse_find(item: FindRequest) -> UnitRetrieval:
    """Parse a find request into its ``UnitRetrieval`` parameters.

    Thin entry point that delegates to ``_FindParser``.
    """
    return _FindParser(item).parse()
|
56
|
+
|
57
|
+
|
58
|
+
class _FindParser:
    """Translate a ``FindRequest`` into a ``UnitRetrieval``.

    Parses ``top_k``, the rank fusion settings and the reranker settings
    from the request. Retrieval windows are clamped to the maximum values
    accepted by the parser models.
    """

    # Upper bounds for the retrieval windows; they mirror the `le`
    # constraints on the parser models `RankFusion` and `PredictReranker`.
    _MAX_RANK_FUSION_WINDOW = 500
    _MAX_RERANKER_WINDOW = 200

    def __init__(self, item: FindRequest):
        self.item = item

    def parse(self) -> UnitRetrieval:
        """Parse the whole request.

        Raises:
            ParserError: if ``top_k`` is missing, if an unknown rank fusion
                or reranker is requested, or if building any of the models
                fails pydantic validation.
        """
        top_k = self._parse_top_k()
        try:
            rank_fusion = self._parse_rank_fusion()
        except ValidationError as exc:
            raise ParserError(f"Parsing error in rank fusion: {exc}") from exc
        try:
            reranker = self._parse_reranker()
        except ValidationError as exc:
            raise ParserError(f"Parsing error in reranker: {exc}") from exc

        # Adjust retrieval windows. Our current implementation assumes:
        #   `top_k <= reranker.window <= rank_fusion.window`
        # and, as rank fusion is done before reranking, we must ensure the
        # rank fusion window is at least the reranker window.
        if isinstance(reranker, PredictReranker):
            rank_fusion.window = max(rank_fusion.window, reranker.window)

        return UnitRetrieval(
            top_k=top_k,
            rank_fusion=rank_fusion,
            reranker=reranker,
        )

    def _parse_top_k(self) -> int:
        """Return the request's ``top_k``.

        Raises:
            ParserError: if ``top_k`` is None. An explicit raise is used
                instead of ``assert`` so the check survives ``python -O``.
        """
        top_k = self.item.top_k
        if top_k is None:
            raise ParserError("top_k must have an int value")
        return top_k

    def _parse_rank_fusion(self) -> RankFusion:
        """Build the rank fusion settings from the request.

        The request may carry either an algorithm name or a full
        reciprocal rank fusion configuration.
        """
        rank_fusion: RankFusion

        top_k = self._parse_top_k()
        window = min(top_k, self._MAX_RANK_FUSION_WINDOW)

        if isinstance(self.item.rank_fusion, search_models.RankFusionName):
            if self.item.rank_fusion == search_models.RankFusionName.RECIPROCAL_RANK_FUSION:
                rank_fusion = ReciprocalRankFusion(window=window)
            else:
                raise ParserError(f"Unknown rank fusion algorithm: {self.item.rank_fusion}")

        elif isinstance(self.item.rank_fusion, search_models.ReciprocalRankFusion):
            user_window = self.item.rank_fusion.window
            rank_fusion = ReciprocalRankFusion(
                k=self.item.rank_fusion.k,
                boosting=self.item.rank_fusion.boosting,
                # Never go below top_k, never above the allowed maximum.
                window=min(max(user_window or 0, top_k), self._MAX_RANK_FUSION_WINDOW),
            )

        else:
            raise ParserError(f"Unknown rank fusion {self.item.rank_fusion}")

        return rank_fusion

    def _parse_reranker(self) -> Reranker:
        """Build the reranker settings from the request.

        The request may carry either a reranker name or a full predict
        reranker configuration.
        """
        reranking: Reranker

        top_k = self._parse_top_k()

        if isinstance(self.item.reranker, search_models.RerankerName):
            if self.item.reranker == search_models.RerankerName.NOOP:
                reranking = NoopReranker()

            elif self.item.reranker == search_models.RerankerName.MULTI_MATCH_BOOSTER:
                reranking = MultiMatchBoosterReranker()

            elif self.item.reranker == search_models.RerankerName.PREDICT_RERANKER:
                # For the predict reranker, by default, we want a x2 factor
                # over top_k, capped at the maximum reranker window.
                reranking = PredictReranker(window=min(top_k * 2, self._MAX_RERANKER_WINDOW))

            else:
                raise ParserError(f"Unknown reranker algorithm: {self.item.reranker}")

        elif isinstance(self.item.reranker, search_models.PredictReranker):
            user_window = self.item.reranker.window
            # Never go below top_k, never above the allowed maximum.
            reranking = PredictReranker(
                window=min(max(user_window or 0, top_k), self._MAX_RERANKER_WINDOW)
            )

        else:
            raise ParserError(f"Unknown reranker {self.item.reranker}")

        return reranking
|
144
|
+
|
145
|
+
|
146
|
+
def parse_catalog(kbid: str, item: search_models.CatalogRequest) -> CatalogQuery:
    """Build a ``CatalogQuery`` for ``kbid`` from a catalog request.

    A filter on the hidden label is always appended to the request's
    filters: hidden resources are required when ``item.hidden`` is truthy
    and excluded otherwise. When the request has no sort options, results
    are sorted by creation date, most recent first.
    """
    hidden_filter = Filter(all=[LABEL_HIDDEN]) if item.hidden else Filter(none=[LABEL_HIDDEN])

    label_filters: dict[str, Any] = convert_to_node_filters(item.filters + [hidden_filter])  # type: ignore
    if len(label_filters) > 0:
        label_filters = translate_label_filters(label_filters)

    sort = item.sort
    if sort is None:
        # By default we sort by creation date (most recent first)
        sort = SortOptions(field=SortField.CREATED, order=SortOrder.DESC, limit=None)

    created_between = DateTimeFilter(
        after=item.range_creation_start,
        before=item.range_creation_end,
    )
    modified_between = DateTimeFilter(
        after=item.range_modification_start,
        before=item.range_modification_end,
    )
    catalog_filters = CatalogFilters(
        labels=label_filters,
        creation=created_between,
        modification=modified_between,
        with_status=item.with_status,
    )

    return CatalogQuery(
        kbid=kbid,
        query=item.query,
        filters=catalog_filters,
        sort=sort,
        faceted=item.faceted,
        page_number=item.page_number,
        page_size=item.page_size,
    )
|
@@ -0,0 +1,204 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
import logging
|
21
|
+
from abc import ABC, abstractmethod
|
22
|
+
from typing import Iterable
|
23
|
+
|
24
|
+
from nucliadb.common.external_index_providers.base import TextBlockMatch
|
25
|
+
from nucliadb.common.ids import ParagraphId
|
26
|
+
from nucliadb.search.search.query_parser import models as parser_models
|
27
|
+
from nucliadb_models.search import SCORE_TYPE
|
28
|
+
from nucliadb_telemetry.metrics import Observer
|
29
|
+
|
30
|
+
logger = logging.getLogger(__name__)

# Observer wrapped around the `_fuse` implementations in this module; the
# "type" label identifies which rank fusion algorithm was measured.
# NOTE(review): bucket boundaries are presumably expressed in seconds --
# confirm against nucliadb_telemetry's Observer.
rank_fusion_observer = Observer(
    "rank_fusion",
    labels={"type": ""},
    buckets=[0.001, 0.0025, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0],
)
|
48
|
+
|
49
|
+
|
50
|
+
class RankFusionAlgorithm(ABC):
    """Base class for algorithms that merge keyword and semantic results.

    Concrete algorithms implement `_fuse`; the public `fuse` method delegates
    to it.
    """

    def __init__(self, window: int):
        self._window = window

    @property
    def window(self) -> int:
        """Stand-in number used to compute how many elements to retrieve and
        feed to the rank fusion algorithm.

        This is here for convenience, but a query plan should be the way to go.

        """
        return self._window

    def fuse(
        self, keyword: Iterable[TextBlockMatch], semantic: Iterable[TextBlockMatch]
    ) -> list[TextBlockMatch]:
        """Merge keyword and semantic results into a single list.

        The actual merging strategy is provided by the subclass' `_fuse`.

        """
        return self._fuse(keyword, semantic)

    @abstractmethod
    def _fuse(
        self, keyword: Iterable[TextBlockMatch], semantic: Iterable[TextBlockMatch]
    ) -> list[TextBlockMatch]:
        """Algorithm-specific merge of keyword and semantic results."""
|
78
|
+
|
79
|
+
|
80
|
+
class LegacyRankFusion(RankFusionAlgorithm):
    """Legacy algorithm that interleaves keyword and semantic results.

    Both result sets are sorted by descending score and then mixed as:
    - 1st result from keyword search
    - 1st result from semantic search
    - 2 keyword results and 1 semantic (and repeat)

    Once keyword results run out, the remaining semantic results are simply
    appended at the end.

    """

    @rank_fusion_observer.wrap({"type": "legacy"})
    def _fuse(
        self, keyword: Iterable[TextBlockMatch], semantic: Iterable[TextBlockMatch]
    ) -> list[TextBlockMatch]:
        # Start from the keyword results, best score first
        merged: list[TextBlockMatch] = sorted(keyword, key=lambda match: match.score, reverse=True)
        best_semantic = sorted(semantic, key=lambda match: match.score, reverse=True)

        # Semantic results land at indexes 1, 4, 7, ... of the merged list.
        # list.insert appends when the index is past the end of the list.
        for n, match in enumerate(best_semantic):
            merged.insert(1 + 3 * n, match)

        return merged
|
108
|
+
|
109
|
+
|
110
|
+
class ReciprocalRankFusion(RankFusionAlgorithm):
    """Rank-based rank fusion algorithm. Discounts the weight of documents
    occurring deep in retrieved lists using a reciprocal distribution. It can be
    parametrized with weights to boost retrievers.

    RRF = Σ(r ∈ R) (1 / (k + r(d)) · w(r))

    where:
    - d is a document
    - R is the set of retrievers
    - k (constant)
    - r(d) rank of document d in retriever r (zero-based in this implementation)
    - w(r) weight (boost) for retriever r

    RRF boosts matches found by multiple retrievers and deduplicates them.

    """

    def __init__(
        self,
        k: float = 60.0,
        *,
        window: int,
        keyword_weight: float = 1.0,
        semantic_weight: float = 1.0,
    ):
        super().__init__(window)
        # RRF constant. Studies agree on 60 as a good default value, giving
        # good results across many datasets. k allows a bigger score difference
        # among the best results and a smaller one among bad results.
        self._k = k
        self._keyword_boost = keyword_weight
        self._semantic_boost = semantic_weight

    @rank_fusion_observer.wrap({"type": "reciprocal_rank_fusion"})
    def _fuse(
        self, keyword: Iterable[TextBlockMatch], semantic: Iterable[TextBlockMatch]
    ) -> list[TextBlockMatch]:
        # Rank each retriever's results by descending score before fusing
        retrievers = (
            (sorted(keyword, key=lambda match: match.score, reverse=True), self._keyword_boost),
            (sorted(semantic, key=lambda match: match.score, reverse=True), self._semantic_boost),
        )

        fused: dict[ParagraphId, tuple[float, SCORE_TYPE]] = {}
        # First match seen for each paragraph (keyword ranking is visited
        # first). Keeping only this one deduplicates multiple matches of the
        # same text block.
        first_match: dict[ParagraphId, TextBlockMatch] = {}

        for ranking, boost in retrievers:
            for rank, match in enumerate(ranking):
                paragraph_id = match.paragraph_id
                if paragraph_id in fused:
                    accumulated, score_type = fused[paragraph_id]
                else:
                    accumulated, score_type = 0, match.score_type
                    first_match[paragraph_id] = match

                accumulated += 1 / (self._k + rank) * boost
                # A paragraph matched by both keyword and semantic search
                if {score_type, match.score_type} == {SCORE_TYPE.BM25, SCORE_TYPE.VECTOR}:
                    score_type = SCORE_TYPE.BOTH
                fused[paragraph_id] = (accumulated, score_type)

        merged = []
        for paragraph_id, match in first_match.items():
            # The kept match is updated in place with the fused score
            match.score, match.score_type = fused[paragraph_id]
            merged.append(match)

        merged.sort(key=lambda match: match.score, reverse=True)
        return merged
|
185
|
+
|
186
|
+
|
187
|
+
def get_rank_fusion(rank_fusion: parser_models.RankFusion) -> RankFusionAlgorithm:
    """Given a rank fusion API type, return the appropriate rank fusion algorithm instance.

    Unknown rank fusion types are logged as an error and fall back to a
    `ReciprocalRankFusion` with default parameters and the requested window.
    """
    window = rank_fusion.window

    if not isinstance(rank_fusion, parser_models.ReciprocalRankFusion):
        logger.error(f"Unknown rank fusion algorithm {type(rank_fusion)}: {rank_fusion}. Using default")
        return ReciprocalRankFusion(window=window)

    boosting = rank_fusion.boosting
    return ReciprocalRankFusion(
        k=rank_fusion.k,
        window=window,
        keyword_weight=boosting.keyword,
        semantic_weight=boosting.semantic,
    )
|