nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff shows the changes between publicly available package versions as they were released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/search/search/find_merge.py

```diff
@@ -18,37 +18,48 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import […]
+from typing import Iterable, Union
 
-from […]
-[… 1 line not rendered …]
-    EntitiesSubgraphRequest,
-    ParagraphResult,
-    SearchResponse,
-)
-
-from nucliadb.common.maindb.driver import Transaction
-from nucliadb.ingest.serialize import managed_serialize
-from nucliadb.middleware.transaction import get_read_only_transaction
+from nucliadb.common.external_index_providers.base import TextBlockMatch
+from nucliadb.common.ids import ParagraphId, VectorId
 from nucliadb.search import SERVICE_NAME, logger
-from nucliadb.search.search.[…]
+from nucliadb.search.search.cut import cut_page
+from nucliadb.search.search.hydrator import (
+    ResourceHydrationOptions,
+    TextBlockHydrationOptions,
+    hydrate_resource_metadata,
+    hydrate_text_block,
+    text_block_to_find_paragraph,
+)
 from nucliadb.search.search.merge import merge_relations_results
+from nucliadb.search.search.rank_fusion import RankFusionAlgorithm
+from nucliadb.search.search.rerankers import (
+    RerankableItem,
+    Reranker,
+    RerankingOptions,
+)
 from nucliadb_models.common import FieldTypeName
-from nucliadb_models.resource import ExtractedDataTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, Resource
 from nucliadb_models.search import (
     SCORE_TYPE,
     FindField,
-    FindParagraph,
     FindResource,
     KnowledgeboxFindResults,
     MinScore,
     ResourceProperties,
-    TempFindParagraph,
     TextPosition,
 )
+from nucliadb_protos.nodereader_pb2 import (
+    DocumentScored,
+    EntitiesSubgraphRequest,
+    ParagraphResult,
+    ParagraphSearchResponse,
+    RelationSearchResponse,
+    SearchResponse,
+    VectorSearchResponse,
+)
 from nucliadb_telemetry import metrics
 
-from . import paragraphs
 from .metrics import merge_observer
 
 FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
@@ -57,407 +68,413 @@ FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
 )
 
 
-[… 4 lines not rendered …]
-@merge_observer.wrap({"type": "set_text_value"})
-async def set_text_value(
+@merge_observer.wrap({"type": "find_merge"})
+async def build_find_response(
+    search_responses: list[SearchResponse],
+    *,
     kbid: str,
-[… 2 lines not rendered …]
+    query: str,
+    relation_subgraph_query: EntitiesSubgraphRequest,
+    top_k: int,
+    min_score_bm25: float,
+    min_score_semantic: float,
+    rank_fusion_algorithm: RankFusionAlgorithm,
+    reranker: Reranker,
+    show: list[ResourceProperties] = [],
+    extracted: list[ExtractedDataTypeName] = [],
+    field_type_filter: list[FieldTypeName] = [],
     highlight: bool = False,
-[… 2 lines not rendered …]
-)
-[… 6 lines not rendered …]
-            field=result_paragraph.field,
-            start=result_paragraph.paragraph.position.start,
-            end=result_paragraph.paragraph.position.end,
-            split=result_paragraph.split,
-            highlight=highlight,
-            ematches=ematches,
-            matches=[],  # TODO
-            extracted_text_cache=extracted_text_cache,
+) -> KnowledgeboxFindResults:
+    # merge
+    search_response = merge_shard_responses(search_responses)
+
+    keyword_results = keyword_results_to_text_block_matches(search_response.paragraph.results)
+    semantic_results = semantic_results_to_text_block_matches(
+        filter(
+            lambda x: x.score >= min_score_semantic,
+            search_response.vector.documents,
         )
+    )
+
+    merged_text_blocks: list[TextBlockMatch] = rank_fusion_algorithm.fuse(
+        keyword_results, semantic_results
+    )
+
+    # cut
+    # we assume pagination + predict reranker is forbidden and has been already
+    # enforced/validated by the query parsing.
+    if reranker.needs_extra_results:
+        assert reranker.window is not None, "Reranker definition must enforce this condition"
+        text_blocks_page, next_page = cut_page(merged_text_blocks, reranker.window)
+    else:
+        text_blocks_page, next_page = cut_page(merged_text_blocks, top_k)
+
+    # hydrate and rerank
+    resource_hydration_options = ResourceHydrationOptions(
+        show=show, extracted=extracted, field_type_filter=field_type_filter
+    )
+    text_block_hydration_options = TextBlockHydrationOptions(
+        highlight=highlight,
+        ematches=search_response.paragraph.ematches,  # type: ignore
+    )
+    reranking_options = RerankingOptions(kbid=kbid, query=query)
+    text_blocks, resources, best_matches = await hydrate_and_rerank(
+        text_blocks_page,
+        kbid,
+        resource_hydration_options=resource_hydration_options,
+        text_block_hydration_options=text_block_hydration_options,
+        reranker=reranker,
+        reranking_options=reranking_options,
+        top_k=top_k,
+    )
+
+    # build relations graph
+    relations = await merge_relations_results([search_response.relation], relation_subgraph_query)
+
+    # compose response
+    find_resources = compose_find_resources(text_blocks, resources)
+
+    next_page = search_response.paragraph.next_page or next_page
+    total_paragraphs = search_response.paragraph.total
+
+    find_results = KnowledgeboxFindResults(
+        query=query,
+        resources=find_resources,
+        best_matches=best_matches,
+        relations=relations,
+        total=total_paragraphs,
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
+        next_page=next_page,
+        min_score=MinScore(bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)),
+    )
+    return find_results
+
+
+def merge_shard_responses(
+    responses: list[SearchResponse],
+) -> SearchResponse:
+    """Merge search responses into a single response as if there were no shards
+    involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    paragraphs = []
+    vectors = []
+    relations = []
+    for response in responses:
+        paragraphs.append(response.paragraph)
+        vectors.append(response.vector)
+        relations.append(response.relation)
 
-[… 11 lines not rendered …]
-    )
-[… 52 lines not rendered …]
+    merged = SearchResponse(
+        paragraph=merge_shards_keyword_responses(paragraphs),
+        vector=merge_shards_semantic_responses(vectors),
+        relation=merge_shards_relation_responses(relations),
+    )
+    return merged
+
+
+def merge_shards_keyword_responses(
+    keyword_responses: list[ParagraphSearchResponse],
+) -> ParagraphSearchResponse:
+    """Merge keyword (paragraph) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = ParagraphSearchResponse()
+    for response in keyword_responses:
+        merged.query = response.query
+        merged.next_page = merged.next_page or response.next_page
+        merged.total += response.total
+        merged.results.extend(response.results)
+        merged.ematches.extend(response.ematches)
+
+    return merged
+
+
+def merge_shards_semantic_responses(
+    semantic_responses: list[VectorSearchResponse],
+) -> VectorSearchResponse:
+    """Merge semantic (vector) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = VectorSearchResponse()
+    for response in semantic_responses:
+        merged.documents.extend(response.documents)
+
+    return merged
+
+
+def merge_shards_relation_responses(
+    relation_responses: list[RelationSearchResponse],
+) -> RelationSearchResponse:
+    merged = RelationSearchResponse()
+    for response in relation_responses:
+        merged.prefix.nodes.extend(response.prefix.nodes)
+        merged.subgraph.relations.extend(response.subgraph.relations)
+
+    return merged
+
+
+def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
+    fuzzy_result = len(item.matches) > 0
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_string(item.paragraph),
+        score=item.score.bm25,
+        score_type=SCORE_TYPE.BM25,
+        order=0,  # NOTE: this will be filled later
+        text="",  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=item.start,
+            end=item.end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=fuzzy_result,
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
+    return [keyword_result_to_text_block_match(item) for item in items]
+
+
+class InvalidDocId(Exception):
+    """Raised while parsing an invalid id coming from semantic search"""
+
+    def __init__(self, invalid_vector_id: str):
+        self.invalid_vector_id = invalid_vector_id
+        super().__init__(f"Invalid vector ID: {invalid_vector_id}")
+
+
+def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
+    try:
+        vector_id = VectorId.from_string(item.doc_id.id)
+    except (IndexError, ValueError):
+        raise InvalidDocId(item.doc_id.id)
+
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_vector_id(vector_id),
+        score=item.score,
+        score_type=SCORE_TYPE.VECTOR,
+        order=0,  # NOTE: this will be filled later
+        text="",  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=vector_id.vector_start,
+            end=vector_id.vector_end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=False,  # semantic search doesn't have fuzziness
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
+    text_blocks: list[TextBlockMatch] = []
+    for item in items:
+        try:
+            text_block = semantic_result_to_text_block_match(item)
+        except InvalidDocId as exc:
+            logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
+            continue
+        text_blocks.append(text_block)
+    return text_blocks
+
+
+@merge_observer.wrap({"type": "hydrate_and_rerank"})
+async def hydrate_and_rerank(
+    text_blocks: Iterable[TextBlockMatch],
     kbid: str,
-[… 9 lines not rendered …]
+    *,
+    resource_hydration_options: ResourceHydrationOptions,
+    text_block_hydration_options: TextBlockHydrationOptions,
+    reranker: Reranker,
+    reranking_options: RerankingOptions,
+    top_k: int,
+) -> tuple[list[TextBlockMatch], list[Resource], list[str]]:
+    """Given a list of text blocks from a retrieval operation, hydrate and
+    rerank the results.
+
+    This function returns either the entire list or a subset of updated
+    (hydrated and reranked) text blocks and their corresponding resource
+    metadata. It also returns an ordered list of best matches.
+
+    """
     max_operations = asyncio.Semaphore(50)
-    orderer = Orderer()
-    etcache = paragraphs.ExtractedTextCache()
-    for result_paragraph in result_paragraphs:
-        if result_paragraph.paragraph is not None:
-            find_resource = find_resources.setdefault(
-                result_paragraph.rid, FindResource(id=result_paragraph.id, fields={})
-            )
-            find_field = find_resource.fields.setdefault(
-                result_paragraph.field, FindField(paragraphs={})
-            )
-
-            if result_paragraph.paragraph.id in find_field.paragraphs:
-                # Its a multiple match, push the score
-                # find_field.paragraphs[result_paragraph.paragraph.id].score = 25
-                if (
-                    find_field.paragraphs[result_paragraph.paragraph.id].score
-                    < result_paragraph.paragraph.score
-                ):
-                    # Use Vector score if there are both
-                    find_field.paragraphs[result_paragraph.paragraph.id].score = (
-                        result_paragraph.paragraph.score * 2
-                    )
-                orderer.add(
-                    (
-                        result_paragraph.rid,
-                        result_paragraph.field,
-                        result_paragraph.paragraph.id,
-                        result_paragraph.paragraph.score,
-                    )
-                )
-                find_field.paragraphs[result_paragraph.paragraph.id].score_type = (
-                    SCORE_TYPE.BOTH
-                )
 
-[… 10 lines not rendered …]
+    # Iterate text blocks and create text block and resource metadata hydration
+    # tasks depending on the reranker
+    text_blocks_by_id: dict[str, TextBlockMatch] = {}  # useful for faster access to text blocks later
+    resource_hydration_ops = {}
+    text_block_hydration_ops = []
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        paragraph_id = text_block.paragraph_id.full()
+
+        # If we find multiple results (from different indexes) with different
+        # metadata, this statement will only get the metadata from the first on
+        # the list. We assume metadata is the same on all indexes, otherwise
+        # this would be a BUG
+        text_blocks_by_id.setdefault(paragraph_id, text_block)
+
+        # rerankers that need extra results may end with less resources than the
+        # ones we see now, so we'll skip this step and recompute the resources
+        # later
+        if not reranker.needs_extra_results:
+            if rid not in resource_hydration_ops:
+                resource_hydration_ops[rid] = asyncio.create_task(
+                    hydrate_resource_metadata(
+                        kbid,
+                        rid,
+                        options=resource_hydration_options,
+                        concurrency_control=max_operations,
+                        service_name=SERVICE_NAME,
                     )
                 )
 
-[… 1 line not rendered …]
-                asyncio.create_task(
-                    set_text_value(
-                        kbid=kbid,
-                        result_paragraph=result_paragraph,
-                        highlight=highlight,
-                        ematches=ematches,
-                        max_operations=max_operations,
-                        extracted_text_cache=etcache,
-                    )
-                )
-            )
-            resources.add(result_paragraph.rid)
-    etcache.clear()
-
-    for order, (rid, field_id, paragraph_id, _) in enumerate(orderer.sorted_by_score()):
-        find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
-        best_matches.append(paragraph_id)
-
-    for resource in resources:
-        operations.append(
+        text_block_hydration_ops.append(
             asyncio.create_task(
-[… 5 lines not rendered …]
-                field_type_filter=field_type_filter,
-                extracted=extracted,
-                find_resources=find_resources,
-                max_operations=max_operations,
+                hydrate_text_block(
+                    kbid,
+                    text_block,
+                    text_block_hydration_options,
+                    concurrency_control=max_operations,
                 )
             )
         )
 
-[… 36 lines not rendered …]
+    # hydrate only the strictly needed before rerank
+    hydrated_text_blocks: list[TextBlockMatch]
+    hydrated_resources: list[Union[Resource, None]]
+
+    ops = [
+        *text_block_hydration_ops,
+        *resource_hydration_ops.values(),
+    ]
+    FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+    results = await asyncio.gather(*ops)
+
+    hydrated_text_blocks = results[: len(text_block_hydration_ops)]  # type: ignore
+    hydrated_resources = results[len(text_block_hydration_ops) :]  # type: ignore
+
+    # with the hydrated text, rerank and apply new scores to the text blocks
+    to_rerank = [
+        RerankableItem(
+            id=text_block.paragraph_id.full(),
+            score=text_block.score,
+            score_type=text_block.score_type,
+            content=text_block.text or "",  # TODO: add a warning, this shouldn't usually happen
+        )
+        for text_block in hydrated_text_blocks
+    ]
+    reranked = await reranker.rerank(to_rerank, reranking_options)
+
+    # after reranking, we can cut to the number of results the user wants, so we
+    # don't hydrate unnecessary stuff
+    reranked = reranked[:top_k]
+
+    matches = []
+    for item in reranked:
+        paragraph_id = item.id
+        score = item.score
+        score_type = item.score_type
+
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.score = score
+        text_block.score_type = score_type
+
+        matches.append((paragraph_id, score))
+
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    best_matches = []
+    best_text_blocks = []
+    resource_hydration_ops = {}
+    for order, (paragraph_id, _) in enumerate(matches):
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.order = order
+        best_matches.append(paragraph_id)
+        best_text_blocks.append(text_block)
+
+        # now we have removed the text block surplus, fetch resource metadata
+        if reranker.needs_extra_results:
+            rid = ParagraphId.from_string(paragraph_id).rid
+            if rid not in resource_hydration_ops:
+                resource_hydration_ops[rid] = asyncio.create_task(
+                    hydrate_resource_metadata(
+                        kbid,
+                        rid,
+                        options=resource_hydration_options,
+                        concurrency_control=max_operations,
+                        service_name=SERVICE_NAME,
+                    )
                 )
-            )
 
-    #
+    # Finally, fetch resource metadata if we haven't already done it
+    if reranker.needs_extra_results:
+        ops = list(resource_hydration_ops.values())
+        FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+        hydrated_resources = await asyncio.gather(*ops)  # type: ignore
 
-[… 1 line not rendered …]
-    for vectors_shard in vectors_shards:
-        for vector in vectors_shard:
-            if vector.score < min_score:
-                logger.warning(
-                    f"Skipping low score vector: {vector.doc_id.id}. This should not happen"
-                )
-                continue
-            doc_id_split = vector.doc_id.id.split("/")
-            split = None
-            if len(doc_id_split) == 5:
-                rid, field_type, field, index, position = doc_id_split
-                paragraph_id = f"{rid}/{field_type}/{field}/{position}"
-            elif len(doc_id_split) == 6:
-                rid, field_type, field, split, index, position = doc_id_split
-                paragraph_id = f"{rid}/{field_type}/{field}/{split}/{position}"
-            else:
-                logger.warning(f"Skipping invalid doc_id: {vector.doc_id.id}")
-                continue
-            start, end = position.split("-")
-            merged_paragrahs.insert(
-                nextpos,
-                TempFindParagraph(
-                    vector_index=vector,
-                    rid=rid,
-                    field=f"/{field_type}/{field}",
-                    score=vector.score,
-                    start=int(start),
-                    end=int(end),
-                    split=split,
-                    id=paragraph_id,
-                ),
-            )
-            nextpos += 3
-
-    # merged_paragrahs.sort(key=lambda r: r.score, reverse=True)
-    init_position = count * page
-    end_position = init_position + count
-    next_page = len(merged_paragrahs) > end_position
-    merged_paragrahs = merged_paragrahs[init_position:end_position]
-
-    for merged_paragraph in merged_paragrahs:
-        if merged_paragraph.vector_index is not None:
-            merged_paragraph.paragraph = FindParagraph(
-                score=merged_paragraph.vector_index.score,
-                score_type=SCORE_TYPE.VECTOR,
-                text="",
-                labels=[],  # TODO: Get labels from index
-                page_with_visual=merged_paragraph.vector_index.metadata.page_with_visual,
-                reference=merged_paragraph.vector_index.metadata.representation.file,
-                is_a_table=merged_paragraph.vector_index.metadata.representation.is_a_table,
-                position=TextPosition(
-                    page_number=merged_paragraph.vector_index.metadata.position.page_number,
-                    index=merged_paragraph.vector_index.metadata.position.index,
-                    start=merged_paragraph.start,
-                    end=merged_paragraph.end,
-                    start_seconds=[
-                        x
-                        for x in merged_paragraph.vector_index.metadata.position.start_seconds
-                    ],
-                    end_seconds=[
-                        x
-                        for x in merged_paragraph.vector_index.metadata.position.end_seconds
-                    ],
-                ),
-                id=merged_paragraph.id,
-                # Vector searches don't have fuzziness
-                fuzzy_result=False,
-            )
-        elif merged_paragraph.paragraph_index is not None:
-            merged_paragraph.paragraph = FindParagraph(
-                score=merged_paragraph.paragraph_index.score.bm25,
-                score_type=SCORE_TYPE.BM25,
-                text="",
-                labels=[x for x in merged_paragraph.paragraph_index.labels],
-                page_with_visual=merged_paragraph.paragraph_index.metadata.page_with_visual,
-                reference=merged_paragraph.paragraph_index.metadata.representation.file,
-                is_a_table=merged_paragraph.paragraph_index.metadata.representation.is_a_table,
-                position=TextPosition(
-                    page_number=merged_paragraph.paragraph_index.metadata.position.page_number,
-                    index=merged_paragraph.paragraph_index.metadata.position.index,
-                    start=merged_paragraph.start,
-                    end=merged_paragraph.end,
-                    start_seconds=[
-                        x
-                        for x in merged_paragraph.paragraph_index.metadata.position.start_seconds
-                    ],
-                    end_seconds=[
-                        x
-                        for x in merged_paragraph.paragraph_index.metadata.position.end_seconds
-                    ],
-                ),
-                id=merged_paragraph.id,
-                fuzzy_result=merged_paragraph.fuzzy_result,
-            )
-    return merged_paragrahs, next_page
+    resources = [resource for resource in hydrated_resources if resource is not None]
 
+    return best_text_blocks, resources, best_matches
 
-@merge_observer.wrap({"type": "find_merge"})
-async def find_merge_results(
-    search_responses: list[SearchResponse],
-    count: int,
-    page: int,
-    kbid: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    requested_relations: EntitiesSubgraphRequest,
-    min_score_bm25: float,
-    min_score_semantic: float,
-    highlight: bool = False,
-) -> KnowledgeboxFindResults:
-    # force getting transaction on current asyncio task
-    # so all sub tasks will use the same transaction
-    # this is contextvar magic that is probably not ideal
-    await get_read_only_transaction()
 
-[… 3 lines not rendered …]
+def compose_find_resources(
+    text_blocks: list[TextBlockMatch],
+    resources: list[Resource],
+) -> dict[str, FindResource]:
+    find_resources: dict[str, FindResource] = {}
 
-[… 5 lines not rendered …]
-    # Iterate over answers from different logic shards
+    for resource in resources:
+        rid = resource.id
+        if rid not in find_resources:
+            find_resources[rid] = FindResource(id=rid, fields={})
+        find_resources[rid].updated_from(resource)
 
-[… 4 lines not rendered …]
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        if rid not in find_resources:
+            # resource not found in db, skipping
+            continue
 
-[… 2 lines not rendered …]
+        find_resource = find_resources[rid]
+        field_id = text_block.paragraph_id.field_id.short_without_subfield()
+        find_field = find_resource.fields.setdefault(field_id, FindField(paragraphs={}))
 
-[… 1 line not rendered …]
+        paragraph_id = text_block.paragraph_id.full()
+        find_paragraph = text_block_to_find_paragraph(text_block)
 
-[… 1 line not rendered …]
+        find_field.paragraphs[paragraph_id] = find_paragraph
 
-[… 1 line not rendered …]
-    result_paragraphs, merged_next_page = merge_paragraphs_vectors(
-        paragraphs, vectors, count, page, min_score_semantic
-    )
-    next_page = next_page or merged_next_page
-
-    api_results = KnowledgeboxFindResults(
-        resources={},
-        query=real_query,
-        total=total_paragraphs,
-        page_number=page,
-        page_size=count,
-        next_page=next_page,
-        min_score=MinScore(
-            bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)
-        ),
-        best_matches=[],
-    )
+    return find_resources
 
-    await fetch_find_metadata(
-        api_results.resources,
-        api_results.best_matches,
-        result_paragraphs,
-        kbid,
-        show,
-        field_type_filter,
-        extracted,
-        highlight,
-        ematches,
-    )
-    api_results.relations = await merge_relations_results(
-        relations, requested_relations
-    )
 
-[… 2 lines not rendered …]
-    rcache.clear()
+def _round(x: float) -> float:
+    return round(x, ndigits=3)
```