nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/search/search/merge.py
CHANGED
@@ -22,19 +22,10 @@ import datetime
|
|
22
22
|
import math
|
23
23
|
from typing import Any, Optional, Set, Union
|
24
24
|
|
25
|
-
from
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
EntitiesSubgraphRequest,
|
30
|
-
ParagraphResult,
|
31
|
-
ParagraphSearchResponse,
|
32
|
-
RelationSearchResponse,
|
33
|
-
SearchResponse,
|
34
|
-
SuggestResponse,
|
35
|
-
VectorSearchResponse,
|
36
|
-
)
|
37
|
-
|
25
|
+
from nucliadb.common.ids import FieldId, ParagraphId
|
26
|
+
from nucliadb.common.models_utils.from_proto import RelationTypePbMap
|
27
|
+
from nucliadb.search.search import cache
|
28
|
+
from nucliadb.search.search.cut import cut_page
|
38
29
|
from nucliadb.search.search.fetch import (
|
39
30
|
fetch_resources,
|
40
31
|
get_labels_paragraph,
|
@@ -43,11 +34,11 @@ from nucliadb.search.search.fetch import (
|
|
43
34
|
)
|
44
35
|
from nucliadb_models.common import FieldTypeName
|
45
36
|
from nucliadb_models.labels import translate_system_to_alias_label
|
46
|
-
from nucliadb_models.metadata import RelationTypePbMap
|
47
37
|
from nucliadb_models.resource import ExtractedDataTypeName
|
48
38
|
from nucliadb_models.search import (
|
49
39
|
DirectionalRelation,
|
50
40
|
EntitySubgraph,
|
41
|
+
EntityType,
|
51
42
|
KnowledgeboxSearchResults,
|
52
43
|
KnowledgeboxSuggestResults,
|
53
44
|
MinScore,
|
@@ -56,7 +47,6 @@ from nucliadb_models.search import (
|
|
56
47
|
RelatedEntities,
|
57
48
|
RelatedEntity,
|
58
49
|
RelationDirection,
|
59
|
-
RelationNodeTypeMap,
|
60
50
|
Relations,
|
61
51
|
ResourceProperties,
|
62
52
|
ResourceResult,
|
@@ -69,10 +59,22 @@ from nucliadb_models.search import (
|
|
69
59
|
SortOrder,
|
70
60
|
TextPosition,
|
71
61
|
)
|
62
|
+
from nucliadb_protos.nodereader_pb2 import (
|
63
|
+
DocumentResult,
|
64
|
+
DocumentScored,
|
65
|
+
DocumentSearchResponse,
|
66
|
+
EntitiesSubgraphRequest,
|
67
|
+
ParagraphResult,
|
68
|
+
ParagraphSearchResponse,
|
69
|
+
RelationSearchResponse,
|
70
|
+
SearchResponse,
|
71
|
+
SuggestResponse,
|
72
|
+
VectorSearchResponse,
|
73
|
+
)
|
74
|
+
from nucliadb_protos.utils_pb2 import RelationNode
|
72
75
|
|
73
|
-
from .cache import get_resource_cache, get_resource_from_cache
|
74
76
|
from .metrics import merge_observer
|
75
|
-
from .paragraphs import
|
77
|
+
from .paragraphs import get_paragraph_text, get_text_sentence
|
76
78
|
|
77
79
|
Bm25Score = tuple[float, float]
|
78
80
|
TimestampScore = datetime.datetime
|
@@ -80,6 +82,15 @@ TitleScore = str
|
|
80
82
|
SortValue = Union[Bm25Score, TimestampScore, TitleScore]
|
81
83
|
|
82
84
|
|
85
|
+
def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
|
86
|
+
return {
|
87
|
+
RelationNode.NodeType.ENTITY: EntityType.ENTITY,
|
88
|
+
RelationNode.NodeType.LABEL: EntityType.LABEL,
|
89
|
+
RelationNode.NodeType.RESOURCE: EntityType.RESOURCE,
|
90
|
+
RelationNode.NodeType.USER: EntityType.USER,
|
91
|
+
}[node_type]
|
92
|
+
|
93
|
+
|
83
94
|
def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
|
84
95
|
results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
|
85
96
|
|
@@ -97,7 +108,7 @@ async def get_sort_value(
|
|
97
108
|
return (item.score.bm25, item.score.booster)
|
98
109
|
|
99
110
|
score: Any = None
|
100
|
-
resource = await
|
111
|
+
resource = await cache.get_resource(kbid, item.uuid)
|
101
112
|
if resource is None:
|
102
113
|
return score
|
103
114
|
|
@@ -118,8 +129,7 @@ async def get_sort_value(
|
|
118
129
|
async def merge_documents_results(
|
119
130
|
document_responses: list[DocumentSearchResponse],
|
120
131
|
resources: list[str],
|
121
|
-
|
122
|
-
page: int,
|
132
|
+
top_k: int,
|
123
133
|
kbid: str,
|
124
134
|
sort: SortOptions,
|
125
135
|
min_score: float,
|
@@ -148,15 +158,9 @@ async def merge_documents_results(
|
|
148
158
|
raw_resource_list.append((result, sort_value))
|
149
159
|
total += document_response.total
|
150
160
|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
if length > end:
|
156
|
-
next_page = True
|
157
|
-
|
158
|
-
# We need to cut first and then sort, otherwise pagination will be wrong if the order is DESC
|
159
|
-
raw_resource_list = raw_resource_list[min(skip, length) : min(end, length)]
|
161
|
+
# We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
|
162
|
+
raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
|
163
|
+
next_page = next_page or has_more
|
160
164
|
raw_resource_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
|
161
165
|
|
162
166
|
result_resource_list: list[ResourceResult] = []
|
@@ -181,8 +185,8 @@ async def merge_documents_results(
|
|
181
185
|
results=result_resource_list,
|
182
186
|
query=query,
|
183
187
|
total=total,
|
184
|
-
page_number=
|
185
|
-
page_size=
|
188
|
+
page_number=0, # Bw/c with pagination
|
189
|
+
page_size=top_k,
|
186
190
|
next_page=next_page,
|
187
191
|
min_score=min_score,
|
188
192
|
)
|
@@ -207,65 +211,58 @@ async def merge_suggest_paragraph_results(
|
|
207
211
|
if len(suggest_responses) > 1:
|
208
212
|
sort_results_by_score(raw_paragraph_list)
|
209
213
|
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
end=result.end,
|
222
|
-
split=result.split,
|
223
|
-
highlight=highlight,
|
224
|
-
ematches=ematches, # type: ignore
|
225
|
-
matches=result.matches, # type: ignore
|
226
|
-
extracted_text_cache=etcache,
|
227
|
-
)
|
228
|
-
labels = await get_labels_paragraph(result, kbid)
|
229
|
-
new_paragraph = Paragraph(
|
230
|
-
score=result.score.bm25,
|
231
|
-
rid=result.uuid,
|
232
|
-
field_type=field_type,
|
233
|
-
field=field,
|
234
|
-
text=text,
|
235
|
-
labels=labels,
|
236
|
-
position=TextPosition(
|
237
|
-
index=result.metadata.position.index,
|
238
|
-
start=result.metadata.position.start,
|
239
|
-
end=result.metadata.position.end,
|
240
|
-
page_number=result.metadata.position.page_number,
|
214
|
+
result_paragraph_list: list[Paragraph] = []
|
215
|
+
for result in raw_paragraph_list[:10]:
|
216
|
+
_, field_type, field = result.field.split("/")
|
217
|
+
text = await get_paragraph_text(
|
218
|
+
kbid=kbid,
|
219
|
+
paragraph_id=ParagraphId(
|
220
|
+
field_id=FieldId(
|
221
|
+
rid=result.uuid,
|
222
|
+
type=field_type,
|
223
|
+
key=field,
|
224
|
+
subfield_id=result.split,
|
241
225
|
),
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
226
|
+
paragraph_start=result.start,
|
227
|
+
paragraph_end=result.end,
|
228
|
+
),
|
229
|
+
highlight=highlight,
|
230
|
+
ematches=ematches, # type: ignore
|
231
|
+
matches=result.matches, # type: ignore
|
232
|
+
)
|
233
|
+
labels = await get_labels_paragraph(result, kbid)
|
234
|
+
new_paragraph = Paragraph(
|
235
|
+
score=result.score.bm25,
|
236
|
+
rid=result.uuid,
|
237
|
+
field_type=field_type,
|
238
|
+
field=field,
|
239
|
+
text=text,
|
240
|
+
labels=labels,
|
241
|
+
position=TextPosition(
|
242
|
+
index=result.metadata.position.index,
|
243
|
+
start=result.metadata.position.start,
|
244
|
+
end=result.metadata.position.end,
|
245
|
+
page_number=result.metadata.position.page_number,
|
246
|
+
),
|
247
|
+
)
|
248
|
+
if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
|
249
|
+
new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
|
250
|
+
new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
|
251
|
+
else:
|
252
|
+
# TODO: Remove once we are sure all data has been migrated!
|
253
|
+
seconds_positions = await get_seconds_paragraph(result, kbid)
|
254
|
+
if seconds_positions is not None:
|
255
|
+
new_paragraph.start_seconds = seconds_positions[0]
|
256
|
+
new_paragraph.end_seconds = seconds_positions[1]
|
257
|
+
result_paragraph_list.append(new_paragraph)
|
258
|
+
return Paragraphs(results=result_paragraph_list, query=query, min_score=0)
|
261
259
|
|
262
260
|
|
263
261
|
async def merge_vectors_results(
|
264
262
|
vector_responses: list[VectorSearchResponse],
|
265
263
|
resources: list[str],
|
266
264
|
kbid: str,
|
267
|
-
|
268
|
-
page: int,
|
265
|
+
top_k: int,
|
269
266
|
min_score: Optional[float] = None,
|
270
267
|
):
|
271
268
|
facets: dict[str, Any] = {}
|
@@ -282,12 +279,10 @@ async def merge_vectors_results(
|
|
282
279
|
if len(vector_responses) > 1:
|
283
280
|
raw_vectors_list.sort(key=lambda x: x.score, reverse=True)
|
284
281
|
|
285
|
-
|
286
|
-
end_element = skip + count
|
287
|
-
length = len(raw_vectors_list)
|
282
|
+
raw_vectors_list, _ = cut_page(raw_vectors_list, top_k)
|
288
283
|
|
289
284
|
result_sentence_list: list[Sentence] = []
|
290
|
-
for result in raw_vectors_list
|
285
|
+
for result in raw_vectors_list:
|
291
286
|
id_count = result.doc_id.id.count("/")
|
292
287
|
if id_count == 4:
|
293
288
|
rid, field_type, field, index, position = result.doc_id.id.split("/")
|
@@ -335,8 +330,8 @@ async def merge_vectors_results(
|
|
335
330
|
return Sentences(
|
336
331
|
results=result_sentence_list,
|
337
332
|
facets=facets,
|
338
|
-
page_number=
|
339
|
-
page_size=
|
333
|
+
page_number=0, # Bw/c with pagination
|
334
|
+
page_size=top_k,
|
340
335
|
min_score=round(min_score or 0, ndigits=3),
|
341
336
|
)
|
342
337
|
|
@@ -345,12 +340,11 @@ async def merge_paragraph_results(
|
|
345
340
|
paragraph_responses: list[ParagraphSearchResponse],
|
346
341
|
resources: list[str],
|
347
342
|
kbid: str,
|
348
|
-
|
349
|
-
page: int,
|
343
|
+
top_k: int,
|
350
344
|
highlight: bool,
|
351
345
|
sort: SortOptions,
|
352
346
|
min_score: float,
|
353
|
-
):
|
347
|
+
) -> Paragraphs:
|
354
348
|
raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
|
355
349
|
facets: dict[str, Any] = {}
|
356
350
|
query = None
|
@@ -380,76 +374,68 @@ async def merge_paragraph_results(
|
|
380
374
|
|
381
375
|
raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
|
382
376
|
|
383
|
-
|
384
|
-
|
385
|
-
length = len(raw_paragraph_list)
|
386
|
-
|
387
|
-
if length > end:
|
388
|
-
next_page = True
|
377
|
+
raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
|
378
|
+
next_page = next_page or has_more
|
389
379
|
|
390
380
|
result_paragraph_list: list[Paragraph] = []
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
split=result.split,
|
402
|
-
highlight=highlight,
|
403
|
-
ematches=ematches,
|
404
|
-
matches=result.matches, # type: ignore
|
405
|
-
extracted_text_cache=etcache,
|
406
|
-
)
|
407
|
-
labels = await get_labels_paragraph(result, kbid)
|
408
|
-
fuzzy_result = len(result.matches) > 0
|
409
|
-
new_paragraph = Paragraph(
|
410
|
-
score=result.score.bm25,
|
411
|
-
rid=result.uuid,
|
412
|
-
field_type=field_type,
|
413
|
-
field=field,
|
414
|
-
text=text,
|
415
|
-
labels=labels,
|
416
|
-
position=TextPosition(
|
417
|
-
index=result.metadata.position.index,
|
418
|
-
start=result.metadata.position.start,
|
419
|
-
end=result.metadata.position.end,
|
420
|
-
page_number=result.metadata.position.page_number,
|
381
|
+
for result, _ in raw_paragraph_list:
|
382
|
+
_, field_type, field = result.field.split("/")
|
383
|
+
text = await get_paragraph_text(
|
384
|
+
kbid=kbid,
|
385
|
+
paragraph_id=ParagraphId(
|
386
|
+
field_id=FieldId(
|
387
|
+
rid=result.uuid,
|
388
|
+
type=field_type,
|
389
|
+
key=field,
|
390
|
+
subfield_id=result.split,
|
421
391
|
),
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
result.metadata.position.start_seconds
|
429
|
-
)
|
430
|
-
new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
|
431
|
-
else:
|
432
|
-
# TODO: Remove once we are sure all data has been migrated!
|
433
|
-
seconds_positions = await get_seconds_paragraph(result, kbid)
|
434
|
-
if seconds_positions is not None:
|
435
|
-
new_paragraph.start_seconds = seconds_positions[0]
|
436
|
-
new_paragraph.end_seconds = seconds_positions[1]
|
437
|
-
|
438
|
-
result_paragraph_list.append(new_paragraph)
|
439
|
-
if new_paragraph.rid not in resources:
|
440
|
-
resources.append(new_paragraph.rid)
|
441
|
-
return Paragraphs(
|
442
|
-
results=result_paragraph_list,
|
443
|
-
facets=facets,
|
444
|
-
query=query,
|
445
|
-
total=total,
|
446
|
-
page_number=page,
|
447
|
-
page_size=count,
|
448
|
-
next_page=next_page,
|
449
|
-
min_score=min_score,
|
392
|
+
paragraph_start=result.start,
|
393
|
+
paragraph_end=result.end,
|
394
|
+
),
|
395
|
+
highlight=highlight,
|
396
|
+
ematches=ematches,
|
397
|
+
matches=result.matches, # type: ignore
|
450
398
|
)
|
451
|
-
|
452
|
-
|
399
|
+
labels = await get_labels_paragraph(result, kbid)
|
400
|
+
fuzzy_result = len(result.matches) > 0
|
401
|
+
new_paragraph = Paragraph(
|
402
|
+
score=result.score.bm25,
|
403
|
+
rid=result.uuid,
|
404
|
+
field_type=field_type,
|
405
|
+
field=field,
|
406
|
+
text=text,
|
407
|
+
labels=labels,
|
408
|
+
position=TextPosition(
|
409
|
+
index=result.metadata.position.index,
|
410
|
+
start=result.metadata.position.start,
|
411
|
+
end=result.metadata.position.end,
|
412
|
+
page_number=result.metadata.position.page_number,
|
413
|
+
),
|
414
|
+
fuzzy_result=fuzzy_result,
|
415
|
+
)
|
416
|
+
if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
|
417
|
+
new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
|
418
|
+
new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
|
419
|
+
else:
|
420
|
+
# TODO: Remove once we are sure all data has been migrated!
|
421
|
+
seconds_positions = await get_seconds_paragraph(result, kbid)
|
422
|
+
if seconds_positions is not None:
|
423
|
+
new_paragraph.start_seconds = seconds_positions[0]
|
424
|
+
new_paragraph.end_seconds = seconds_positions[1]
|
425
|
+
|
426
|
+
result_paragraph_list.append(new_paragraph)
|
427
|
+
if new_paragraph.rid not in resources:
|
428
|
+
resources.append(new_paragraph.rid)
|
429
|
+
return Paragraphs(
|
430
|
+
results=result_paragraph_list,
|
431
|
+
facets=facets,
|
432
|
+
query=query,
|
433
|
+
total=total,
|
434
|
+
page_number=0, # Bw/c with pagination
|
435
|
+
page_size=top_k,
|
436
|
+
next_page=next_page,
|
437
|
+
min_score=min_score,
|
438
|
+
)
|
453
439
|
|
454
440
|
|
455
441
|
@merge_observer.wrap({"type": "merge_relations"})
|
@@ -458,9 +444,7 @@ async def merge_relations_results(
|
|
458
444
|
query: EntitiesSubgraphRequest,
|
459
445
|
) -> Relations:
|
460
446
|
loop = asyncio.get_event_loop()
|
461
|
-
return await loop.run_in_executor(
|
462
|
-
None, _merge_relations_results, relations_responses, query
|
463
|
-
)
|
447
|
+
return await loop.run_in_executor(None, _merge_relations_results, relations_responses, query)
|
464
448
|
|
465
449
|
|
466
450
|
def _merge_relations_results(
|
@@ -483,7 +467,7 @@ def _merge_relations_results(
|
|
483
467
|
relations.entities[origin.value].related_to.append(
|
484
468
|
DirectionalRelation(
|
485
469
|
entity=destination.value,
|
486
|
-
entity_type=
|
470
|
+
entity_type=relation_node_type_to_entity_type(destination.ntype),
|
487
471
|
relation=relation_type,
|
488
472
|
relation_label=relation_label,
|
489
473
|
direction=RelationDirection.OUT,
|
@@ -493,7 +477,7 @@ def _merge_relations_results(
|
|
493
477
|
relations.entities[destination.value].related_to.append(
|
494
478
|
DirectionalRelation(
|
495
479
|
entity=origin.value,
|
496
|
-
entity_type=
|
480
|
+
entity_type=relation_node_type_to_entity_type(origin.ntype),
|
497
481
|
relation=relation_type,
|
498
482
|
relation_label=relation_label,
|
499
483
|
direction=RelationDirection.IN,
|
@@ -506,8 +490,7 @@ def _merge_relations_results(
|
|
506
490
|
@merge_observer.wrap({"type": "merge"})
|
507
491
|
async def merge_results(
|
508
492
|
search_responses: list[SearchResponse],
|
509
|
-
|
510
|
-
page: int,
|
493
|
+
top_k: int,
|
511
494
|
kbid: str,
|
512
495
|
show: list[ResourceProperties],
|
513
496
|
field_type_filter: list[FieldTypeName],
|
@@ -530,77 +513,59 @@ async def merge_results(
|
|
530
513
|
|
531
514
|
api_results = KnowledgeboxSearchResults()
|
532
515
|
|
533
|
-
|
534
|
-
|
535
|
-
resources
|
536
|
-
|
537
|
-
documents, resources, count, page, kbid, sort, min_score=min_score.bm25
|
538
|
-
)
|
516
|
+
resources: list[str] = list()
|
517
|
+
api_results.fulltext = await merge_documents_results(
|
518
|
+
documents, resources, top_k, kbid, sort, min_score=min_score.bm25
|
519
|
+
)
|
539
520
|
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
)
|
521
|
+
api_results.paragraphs = await merge_paragraph_results(
|
522
|
+
paragraphs,
|
523
|
+
resources,
|
524
|
+
kbid,
|
525
|
+
top_k,
|
526
|
+
highlight,
|
527
|
+
sort,
|
528
|
+
min_score=min_score.bm25,
|
529
|
+
)
|
550
530
|
|
551
|
-
|
552
|
-
|
553
|
-
|
531
|
+
api_results.sentences = await merge_vectors_results(
|
532
|
+
vectors, resources, kbid, top_k, min_score=min_score.semantic
|
533
|
+
)
|
554
534
|
|
555
|
-
|
556
|
-
relations, requested_relations
|
557
|
-
)
|
535
|
+
api_results.relations = await merge_relations_results(relations, requested_relations)
|
558
536
|
|
559
|
-
|
560
|
-
|
561
|
-
)
|
562
|
-
return api_results
|
563
|
-
finally:
|
564
|
-
rcache.clear()
|
537
|
+
api_results.resources = await fetch_resources(resources, kbid, show, field_type_filter, extracted)
|
538
|
+
return api_results
|
565
539
|
|
566
540
|
|
567
541
|
async def merge_paragraphs_results(
|
568
|
-
|
569
|
-
|
570
|
-
page: int,
|
542
|
+
responses: list[SearchResponse],
|
543
|
+
top_k: int,
|
571
544
|
kbid: str,
|
572
|
-
show: list[ResourceProperties],
|
573
|
-
field_type_filter: list[FieldTypeName],
|
574
|
-
extracted: list[ExtractedDataTypeName],
|
575
545
|
highlight_split: bool,
|
576
546
|
min_score: float,
|
577
547
|
) -> ResourceSearchResults:
|
578
548
|
paragraphs = []
|
579
|
-
for result in
|
580
|
-
paragraphs.append(result)
|
549
|
+
for result in responses:
|
550
|
+
paragraphs.append(result.paragraph)
|
581
551
|
|
582
552
|
api_results = ResourceSearchResults()
|
583
553
|
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
min_score=min_score,
|
600
|
-
)
|
601
|
-
return api_results
|
602
|
-
finally:
|
603
|
-
rcache.clear()
|
554
|
+
resources: list[str] = list()
|
555
|
+
api_results.paragraphs = await merge_paragraph_results(
|
556
|
+
paragraphs,
|
557
|
+
resources,
|
558
|
+
kbid,
|
559
|
+
top_k,
|
560
|
+
highlight=highlight_split,
|
561
|
+
sort=SortOptions(
|
562
|
+
field=SortField.SCORE,
|
563
|
+
order=SortOrder.DESC,
|
564
|
+
limit=None,
|
565
|
+
),
|
566
|
+
min_score=min_score,
|
567
|
+
)
|
568
|
+
return api_results
|
604
569
|
|
605
570
|
|
606
571
|
async def merge_suggest_entities_results(
|
@@ -609,8 +574,7 @@ async def merge_suggest_entities_results(
|
|
609
574
|
unique_entities: Set[RelatedEntity] = set()
|
610
575
|
for response in suggest_responses:
|
611
576
|
response_entities = (
|
612
|
-
RelatedEntity(family=e.subtype, value=e.value)
|
613
|
-
for e in response.entity_results.nodes
|
577
|
+
RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes
|
614
578
|
)
|
615
579
|
unique_entities.update(response_entities)
|
616
580
|
|
@@ -17,10 +17,81 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import contextlib
|
21
|
+
import time
|
22
|
+
from typing import Optional
|
23
|
+
|
20
24
|
from nucliadb_telemetry import metrics
|
21
25
|
|
22
26
|
merge_observer = metrics.Observer("merge_results", labels={"type": ""})
|
23
27
|
node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
|
24
|
-
query_parse_dependency_observer = metrics.Observer(
|
25
|
-
|
28
|
+
query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
|
29
|
+
|
30
|
+
buckets = [
|
31
|
+
0.005,
|
32
|
+
0.01,
|
33
|
+
0.025,
|
34
|
+
0.05,
|
35
|
+
0.075,
|
36
|
+
0.1,
|
37
|
+
0.25,
|
38
|
+
0.5,
|
39
|
+
0.75,
|
40
|
+
1.0,
|
41
|
+
2.5,
|
42
|
+
5.0,
|
43
|
+
7.5,
|
44
|
+
10.0,
|
45
|
+
30.0,
|
46
|
+
60.0,
|
47
|
+
metrics.INF,
|
48
|
+
]
|
49
|
+
|
50
|
+
generative_first_chunk_histogram = metrics.Histogram(
|
51
|
+
name="generative_first_chunk",
|
52
|
+
buckets=buckets,
|
53
|
+
)
|
54
|
+
rag_histogram = metrics.Histogram(
|
55
|
+
name="rag",
|
56
|
+
labels={"step": ""},
|
57
|
+
buckets=buckets,
|
26
58
|
)
|
59
|
+
|
60
|
+
|
61
|
+
class RAGMetrics:
|
62
|
+
def __init__(self):
|
63
|
+
self.global_start = time.monotonic()
|
64
|
+
self._start_times: dict[str, float] = {}
|
65
|
+
self._end_times: dict[str, float] = {}
|
66
|
+
self.first_chunk_yielded_at: Optional[float] = None
|
67
|
+
|
68
|
+
@contextlib.contextmanager
|
69
|
+
def time(self, step: str):
|
70
|
+
self._start(step)
|
71
|
+
try:
|
72
|
+
yield
|
73
|
+
finally:
|
74
|
+
self._end(step)
|
75
|
+
|
76
|
+
def steps(self) -> dict[str, float]:
|
77
|
+
return {step: self.elapsed(step) for step in self._end_times.keys()}
|
78
|
+
|
79
|
+
def elapsed(self, step: str) -> float:
|
80
|
+
return self._end_times[step] - self._start_times[step]
|
81
|
+
|
82
|
+
def record_first_chunk_yielded(self):
|
83
|
+
self.first_chunk_yielded_at = time.monotonic()
|
84
|
+
generative_first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)
|
85
|
+
|
86
|
+
def get_first_chunk_time(self) -> Optional[float]:
|
87
|
+
if self.first_chunk_yielded_at is None:
|
88
|
+
return None
|
89
|
+
return self.first_chunk_yielded_at - self.global_start
|
90
|
+
|
91
|
+
def _start(self, step: str):
|
92
|
+
self._start_times[step] = time.monotonic()
|
93
|
+
|
94
|
+
def _end(self, step: str):
|
95
|
+
self._end_times[step] = time.monotonic()
|
96
|
+
elapsed = self.elapsed(step)
|
97
|
+
rag_histogram.observe(elapsed, labels={"step": step})
|