PyPI - nucliadb - Versions diffs - 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl - Mend

nucliadb 2.46.1.post382-py3-none-any.whl → 6.2.1.post2777-py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (431)

migrations/0002_rollover_shards.py +1 -2
migrations/0003_allfields_key.py +2 -37
migrations/0004_rollover_shards.py +1 -2
migrations/0005_rollover_shards.py +1 -2
migrations/0006_rollover_shards.py +2 -4
migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
migrations/0010_fix_corrupt_indexes.py +11 -12
migrations/0011_materialize_labelset_ids.py +2 -18
migrations/0012_rollover_shards.py +6 -12
migrations/0013_rollover_shards.py +2 -4
migrations/0014_rollover_shards.py +5 -7
migrations/0015_targeted_rollover.py +6 -12
migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
migrations/0017_multiple_writable_shards.py +3 -6
migrations/0018_purge_orphan_kbslugs.py +59 -0
migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
migrations/0020_drain_nodes_from_cluster.py +83 -0
nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
migrations/0023_backfill_pg_catalog.py +80 -0
migrations/0025_assign_models_to_kbs_v2.py +113 -0
migrations/0026_fix_high_cardinality_content_types.py +61 -0
migrations/0027_rollover_texts3.py +73 -0
nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
migrations/pg/0002_catalog.py +42 -0
nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
nucliadb/common/cluster/base.py +41 -24
nucliadb/common/cluster/discovery/base.py +6 -14
nucliadb/common/cluster/discovery/k8s.py +9 -19
nucliadb/common/cluster/discovery/manual.py +1 -3
nucliadb/common/cluster/discovery/single.py +1 -2
nucliadb/common/cluster/discovery/utils.py +1 -3
nucliadb/common/cluster/grpc_node_dummy.py +11 -16
nucliadb/common/cluster/index_node.py +10 -19
nucliadb/common/cluster/manager.py +223 -102
nucliadb/common/cluster/rebalance.py +42 -37
nucliadb/common/cluster/rollover.py +377 -204
nucliadb/common/cluster/settings.py +16 -9
nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
nucliadb/common/cluster/standalone/index_node.py +4 -11
nucliadb/common/cluster/standalone/service.py +2 -6
nucliadb/common/cluster/standalone/utils.py +9 -6
nucliadb/common/cluster/utils.py +43 -29
nucliadb/common/constants.py +20 -0
nucliadb/common/context/__init__.py +6 -4
nucliadb/common/context/fastapi.py +8 -5
nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
nucliadb/common/datamanagers/__init__.py +24 -5
nucliadb/common/datamanagers/atomic.py +102 -0
nucliadb/common/datamanagers/cluster.py +5 -5
nucliadb/common/datamanagers/entities.py +6 -16
nucliadb/common/datamanagers/fields.py +84 -0
nucliadb/common/datamanagers/kb.py +101 -24
nucliadb/common/datamanagers/labels.py +26 -56
nucliadb/common/datamanagers/processing.py +2 -6
nucliadb/common/datamanagers/resources.py +214 -117
nucliadb/common/datamanagers/rollover.py +77 -16
nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
nucliadb/common/datamanagers/utils.py +19 -11
nucliadb/common/datamanagers/vectorsets.py +110 -0
nucliadb/common/external_index_providers/base.py +257 -0
nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
nucliadb/common/external_index_providers/manager.py +101 -0
nucliadb/common/external_index_providers/pinecone.py +933 -0
nucliadb/common/external_index_providers/settings.py +52 -0
nucliadb/common/http_clients/auth.py +3 -6
nucliadb/common/http_clients/processing.py +6 -11
nucliadb/common/http_clients/utils.py +1 -3
nucliadb/common/ids.py +240 -0
nucliadb/common/locking.py +43 -13
nucliadb/common/maindb/driver.py +11 -35
nucliadb/common/maindb/exceptions.py +6 -6
nucliadb/common/maindb/local.py +22 -9
nucliadb/common/maindb/pg.py +206 -111
nucliadb/common/maindb/utils.py +13 -44
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +260 -0
nucliadb/export_import/datamanager.py +25 -19
nucliadb/export_import/exceptions.py +8 -0
nucliadb/export_import/exporter.py +20 -7
nucliadb/export_import/importer.py +6 -11
nucliadb/export_import/models.py +5 -5
nucliadb/export_import/tasks.py +4 -4
nucliadb/export_import/utils.py +94 -54
nucliadb/health.py +1 -3
nucliadb/ingest/app.py +15 -11
nucliadb/ingest/consumer/auditing.py +30 -147
nucliadb/ingest/consumer/consumer.py +96 -52
nucliadb/ingest/consumer/materializer.py +10 -12
nucliadb/ingest/consumer/pull.py +12 -27
nucliadb/ingest/consumer/service.py +20 -19
nucliadb/ingest/consumer/shard_creator.py +7 -14
nucliadb/ingest/consumer/utils.py +1 -3
nucliadb/ingest/fields/base.py +139 -188
nucliadb/ingest/fields/conversation.py +18 -5
nucliadb/ingest/fields/exceptions.py +1 -4
nucliadb/ingest/fields/file.py +7 -25
nucliadb/ingest/fields/link.py +11 -16
nucliadb/ingest/fields/text.py +9 -4
nucliadb/ingest/orm/brain.py +255 -262
nucliadb/ingest/orm/broker_message.py +181 -0
nucliadb/ingest/orm/entities.py +36 -51
nucliadb/ingest/orm/exceptions.py +12 -0
nucliadb/ingest/orm/knowledgebox.py +334 -278
nucliadb/ingest/orm/processor/__init__.py +2 -697
nucliadb/ingest/orm/processor/auditing.py +117 -0
nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
nucliadb/ingest/orm/processor/processor.py +752 -0
nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
nucliadb/ingest/orm/resource.py +280 -520
nucliadb/ingest/orm/utils.py +25 -31
nucliadb/ingest/partitions.py +3 -9
nucliadb/ingest/processing.py +76 -81
nucliadb/ingest/py.typed +0 -0
nucliadb/ingest/serialize.py +37 -173
nucliadb/ingest/service/__init__.py +1 -3
nucliadb/ingest/service/writer.py +186 -577
nucliadb/ingest/settings.py +13 -22
nucliadb/ingest/utils.py +3 -6
nucliadb/learning_proxy.py +264 -51
nucliadb/metrics_exporter.py +30 -19
nucliadb/middleware/__init__.py +1 -3
nucliadb/migrator/command.py +1 -3
nucliadb/migrator/datamanager.py +13 -13
nucliadb/migrator/migrator.py +57 -37
nucliadb/migrator/settings.py +2 -1
nucliadb/migrator/utils.py +18 -10
nucliadb/purge/__init__.py +139 -33
nucliadb/purge/orphan_shards.py +7 -13
nucliadb/reader/__init__.py +1 -3
nucliadb/reader/api/models.py +3 -14
nucliadb/reader/api/v1/__init__.py +0 -1
nucliadb/reader/api/v1/download.py +27 -94
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/knowledgebox.py +13 -13
nucliadb/reader/api/v1/learning_config.py +8 -12
nucliadb/reader/api/v1/resource.py +67 -93
nucliadb/reader/api/v1/services.py +70 -125
nucliadb/reader/app.py +16 -46
nucliadb/reader/lifecycle.py +18 -4
nucliadb/reader/py.typed +0 -0
nucliadb/reader/reader/notifications.py +10 -31
nucliadb/search/__init__.py +1 -3
nucliadb/search/api/v1/__init__.py +2 -2
nucliadb/search/api/v1/ask.py +112 -0
nucliadb/search/api/v1/catalog.py +184 -0
nucliadb/search/api/v1/feedback.py +17 -25
nucliadb/search/api/v1/find.py +41 -41
nucliadb/search/api/v1/knowledgebox.py +90 -62
nucliadb/search/api/v1/predict_proxy.py +2 -2
nucliadb/search/api/v1/resource/ask.py +66 -117
nucliadb/search/api/v1/resource/search.py +51 -72
nucliadb/search/api/v1/router.py +1 -0
nucliadb/search/api/v1/search.py +50 -197
nucliadb/search/api/v1/suggest.py +40 -54
nucliadb/search/api/v1/summarize.py +9 -5
nucliadb/search/api/v1/utils.py +2 -1
nucliadb/search/app.py +16 -48
nucliadb/search/lifecycle.py +10 -3
nucliadb/search/predict.py +176 -188
nucliadb/search/py.typed +0 -0
nucliadb/search/requesters/utils.py +41 -63
nucliadb/search/search/cache.py +149 -20
nucliadb/search/search/chat/ask.py +918 -0
nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
nucliadb/search/search/chat/images.py +41 -17
nucliadb/search/search/chat/prompt.py +851 -282
nucliadb/search/search/chat/query.py +274 -267
nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
nucliadb/search/search/fetch.py +43 -36
nucliadb/search/search/filters.py +9 -15
nucliadb/search/search/find.py +214 -54
nucliadb/search/search/find_merge.py +408 -391
nucliadb/search/search/hydrator.py +191 -0
nucliadb/search/search/merge.py +198 -234
nucliadb/search/search/metrics.py +73 -2
nucliadb/search/search/paragraphs.py +64 -106
nucliadb/search/search/pgcatalog.py +233 -0
nucliadb/search/search/predict_proxy.py +1 -1
nucliadb/search/search/query.py +386 -257
nucliadb/search/search/query_parser/exceptions.py +22 -0
nucliadb/search/search/query_parser/models.py +101 -0
nucliadb/search/search/query_parser/parser.py +183 -0
nucliadb/search/search/rank_fusion.py +204 -0
nucliadb/search/search/rerankers.py +270 -0
nucliadb/search/search/shards.py +4 -38
nucliadb/search/search/summarize.py +14 -18
nucliadb/search/search/utils.py +27 -4
nucliadb/search/settings.py +15 -1
nucliadb/standalone/api_router.py +4 -10
nucliadb/standalone/app.py +17 -14
nucliadb/standalone/auth.py +7 -21
nucliadb/standalone/config.py +9 -12
nucliadb/standalone/introspect.py +5 -5
nucliadb/standalone/lifecycle.py +26 -25
nucliadb/standalone/migrations.py +58 -0
nucliadb/standalone/purge.py +9 -8
nucliadb/standalone/py.typed +0 -0
nucliadb/standalone/run.py +25 -18
nucliadb/standalone/settings.py +10 -14
nucliadb/standalone/versions.py +15 -5
nucliadb/tasks/consumer.py +8 -12
nucliadb/tasks/producer.py +7 -6
nucliadb/tests/config.py +53 -0
nucliadb/train/__init__.py +1 -3
nucliadb/train/api/utils.py +1 -2
nucliadb/train/api/v1/shards.py +2 -2
nucliadb/train/api/v1/trainset.py +4 -6
nucliadb/train/app.py +14 -47
nucliadb/train/generator.py +10 -19
nucliadb/train/generators/field_classifier.py +7 -19
nucliadb/train/generators/field_streaming.py +156 -0
nucliadb/train/generators/image_classifier.py +12 -18
nucliadb/train/generators/paragraph_classifier.py +5 -9
nucliadb/train/generators/paragraph_streaming.py +6 -9
nucliadb/train/generators/question_answer_streaming.py +19 -20
nucliadb/train/generators/sentence_classifier.py +9 -15
nucliadb/train/generators/token_classifier.py +45 -36
nucliadb/train/generators/utils.py +14 -18
nucliadb/train/lifecycle.py +7 -3
nucliadb/train/nodes.py +23 -32
nucliadb/train/py.typed +0 -0
nucliadb/train/servicer.py +13 -21
nucliadb/train/settings.py +2 -6
nucliadb/train/types.py +13 -10
nucliadb/train/upload.py +3 -6
nucliadb/train/uploader.py +20 -25
nucliadb/train/utils.py +1 -1
nucliadb/writer/__init__.py +1 -3
nucliadb/writer/api/constants.py +0 -5
nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
nucliadb/writer/api/v1/export_import.py +102 -49
nucliadb/writer/api/v1/field.py +196 -620
nucliadb/writer/api/v1/knowledgebox.py +221 -71
nucliadb/writer/api/v1/learning_config.py +2 -2
nucliadb/writer/api/v1/resource.py +114 -216
nucliadb/writer/api/v1/services.py +64 -132
nucliadb/writer/api/v1/slug.py +61 -0
nucliadb/writer/api/v1/transaction.py +67 -0
nucliadb/writer/api/v1/upload.py +184 -215
nucliadb/writer/app.py +11 -61
nucliadb/writer/back_pressure.py +62 -43
nucliadb/writer/exceptions.py +0 -4
nucliadb/writer/lifecycle.py +21 -15
nucliadb/writer/py.typed +0 -0
nucliadb/writer/resource/audit.py +2 -1
nucliadb/writer/resource/basic.py +48 -62
nucliadb/writer/resource/field.py +45 -135
nucliadb/writer/resource/origin.py +1 -2
nucliadb/writer/settings.py +14 -5
nucliadb/writer/tus/__init__.py +17 -15
nucliadb/writer/tus/azure.py +111 -0
nucliadb/writer/tus/dm.py +17 -5
nucliadb/writer/tus/exceptions.py +1 -3
nucliadb/writer/tus/gcs.py +56 -84
nucliadb/writer/tus/local.py +21 -37
nucliadb/writer/tus/s3.py +28 -68
nucliadb/writer/tus/storage.py +5 -56
nucliadb/writer/vectorsets.py +125 -0
nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
nucliadb/common/maindb/redis.py +0 -194
nucliadb/common/maindb/tikv.py +0 -412
nucliadb/ingest/fields/layout.py +0 -58
nucliadb/ingest/tests/conftest.py +0 -30
nucliadb/ingest/tests/fixtures.py +0 -771
nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
nucliadb/ingest/tests/unit/test_partitions.py +0 -40
nucliadb/ingest/tests/unit/test_processing.py +0 -171
nucliadb/middleware/transaction.py +0 -117
nucliadb/reader/api/v1/learning_collector.py +0 -63
nucliadb/reader/tests/__init__.py +0 -19
nucliadb/reader/tests/conftest.py +0 -31
nucliadb/reader/tests/fixtures.py +0 -136
nucliadb/reader/tests/test_list_resources.py +0 -75
nucliadb/reader/tests/test_reader_file_download.py +0 -273
nucliadb/reader/tests/test_reader_resource.py +0 -379
nucliadb/reader/tests/test_reader_resource_field.py +0 -219
nucliadb/search/api/v1/chat.py +0 -258
nucliadb/search/api/v1/resource/chat.py +0 -94
nucliadb/search/tests/__init__.py +0 -19
nucliadb/search/tests/conftest.py +0 -33
nucliadb/search/tests/fixtures.py +0 -199
nucliadb/search/tests/node.py +0 -465
nucliadb/search/tests/unit/__init__.py +0 -18
nucliadb/search/tests/unit/api/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
nucliadb/search/tests/unit/search/__init__.py +0 -18
nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
nucliadb/search/tests/unit/search/search/__init__.py +0 -19
nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
nucliadb/search/tests/unit/search/test_fetch.py +0 -108
nucliadb/search/tests/unit/search/test_filters.py +0 -125
nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
nucliadb/search/tests/unit/search/test_query.py +0 -201
nucliadb/search/tests/unit/test_app.py +0 -79
nucliadb/search/tests/unit/test_find_merge.py +0 -112
nucliadb/search/tests/unit/test_merge.py +0 -34
nucliadb/search/tests/unit/test_predict.py +0 -584
nucliadb/standalone/tests/__init__.py +0 -19
nucliadb/standalone/tests/conftest.py +0 -33
nucliadb/standalone/tests/fixtures.py +0 -38
nucliadb/standalone/tests/unit/__init__.py +0 -18
nucliadb/standalone/tests/unit/test_api_router.py +0 -61
nucliadb/standalone/tests/unit/test_auth.py +0 -169
nucliadb/standalone/tests/unit/test_introspect.py +0 -35
nucliadb/standalone/tests/unit/test_versions.py +0 -68
nucliadb/tests/benchmarks/__init__.py +0 -19
nucliadb/tests/benchmarks/test_search.py +0 -99
nucliadb/tests/conftest.py +0 -32
nucliadb/tests/fixtures.py +0 -736
nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
nucliadb/tests/migrations/__init__.py +0 -19
nucliadb/tests/migrations/test_migration_0017.py +0 -80
nucliadb/tests/tikv.py +0 -240
nucliadb/tests/unit/__init__.py +0 -19
nucliadb/tests/unit/common/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
nucliadb/tests/unit/common/maindb/__init__.py +0 -18
nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
nucliadb/tests/unit/common/test_context.py +0 -36
nucliadb/tests/unit/export_import/__init__.py +0 -19
nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
nucliadb/tests/unit/export_import/test_utils.py +0 -294
nucliadb/tests/unit/migrator/__init__.py +0 -19
nucliadb/tests/unit/migrator/test_migrator.py +0 -87
nucliadb/tests/unit/tasks/__init__.py +0 -19
nucliadb/tests/unit/tasks/conftest.py +0 -42
nucliadb/tests/unit/tasks/test_consumer.py +0 -93
nucliadb/tests/unit/tasks/test_producer.py +0 -95
nucliadb/tests/unit/tasks/test_tasks.py +0 -60
nucliadb/tests/unit/test_field_ids.py +0 -49
nucliadb/tests/unit/test_health.py +0 -84
nucliadb/tests/unit/test_kb_slugs.py +0 -54
nucliadb/tests/unit/test_learning_proxy.py +0 -252
nucliadb/tests/unit/test_metrics_exporter.py +0 -77
nucliadb/tests/unit/test_purge.py +0 -138
nucliadb/tests/utils/__init__.py +0 -74
nucliadb/tests/utils/aiohttp_session.py +0 -44
nucliadb/tests/utils/broker_messages/__init__.py +0 -167
nucliadb/tests/utils/broker_messages/fields.py +0 -181
nucliadb/tests/utils/broker_messages/helpers.py +0 -33
nucliadb/tests/utils/entities.py +0 -78
nucliadb/train/api/v1/check.py +0 -60
nucliadb/train/tests/__init__.py +0 -19
nucliadb/train/tests/conftest.py +0 -29
nucliadb/train/tests/fixtures.py +0 -342
nucliadb/train/tests/test_field_classification.py +0 -122
nucliadb/train/tests/test_get_entities.py +0 -80
nucliadb/train/tests/test_get_info.py +0 -51
nucliadb/train/tests/test_get_ontology.py +0 -34
nucliadb/train/tests/test_get_ontology_count.py +0 -63
nucliadb/train/tests/test_image_classification.py +0 -222
nucliadb/train/tests/test_list_fields.py +0 -39
nucliadb/train/tests/test_list_paragraphs.py +0 -73
nucliadb/train/tests/test_list_resources.py +0 -39
nucliadb/train/tests/test_list_sentences.py +0 -71
nucliadb/train/tests/test_paragraph_classification.py +0 -123
nucliadb/train/tests/test_paragraph_streaming.py +0 -118
nucliadb/train/tests/test_question_answer_streaming.py +0 -239
nucliadb/train/tests/test_sentence_classification.py +0 -143
nucliadb/train/tests/test_token_classification.py +0 -136
nucliadb/train/tests/utils.py +0 -108
nucliadb/writer/layouts/__init__.py +0 -51
nucliadb/writer/layouts/v1.py +0 -59
nucliadb/writer/resource/vectors.py +0 -120
nucliadb/writer/tests/__init__.py +0 -19
nucliadb/writer/tests/conftest.py +0 -31
nucliadb/writer/tests/fixtures.py +0 -192
nucliadb/writer/tests/test_fields.py +0 -486
nucliadb/writer/tests/test_files.py +0 -743
nucliadb/writer/tests/test_knowledgebox.py +0 -49
nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
nucliadb/writer/tests/test_resources.py +0 -546
nucliadb/writer/tests/test_service.py +0 -137
nucliadb/writer/tests/test_tus.py +0 -203
nucliadb/writer/tests/utils.py +0 -35
nucliadb/writer/tus/pg.py +0 -125
nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
{nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
/nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
/nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
/nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
/nucliadb/{ingest/tests → tests}/vectors.py +0 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0

nucliadb/search/search/find_merge.py CHANGED Viewed

@@ -18,37 +18,48 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import Any, Iterator, Optional, cast
+from typing import Iterable, Union
-from nucliadb_protos.nodereader_pb2 import (
-    DocumentScored,
-    EntitiesSubgraphRequest,
-    ParagraphResult,
-    SearchResponse,
-)
-from nucliadb.common.maindb.driver import Transaction
-from nucliadb.ingest.serialize import managed_serialize
-from nucliadb.middleware.transaction import get_read_only_transaction
+from nucliadb.common.external_index_providers.base import TextBlockMatch
+from nucliadb.common.ids import ParagraphId, VectorId
 from nucliadb.search import SERVICE_NAME, logger
-from nucliadb.search.search.cache import get_resource_cache
+from nucliadb.search.search.cut import cut_page
+from nucliadb.search.search.hydrator import (
+    ResourceHydrationOptions,
+    TextBlockHydrationOptions,
+    hydrate_resource_metadata,
+    hydrate_text_block,
+    text_block_to_find_paragraph,
+)
 from nucliadb.search.search.merge import merge_relations_results
+from nucliadb.search.search.rank_fusion import RankFusionAlgorithm
+from nucliadb.search.search.rerankers import (
+    RerankableItem,
+    Reranker,
+    RerankingOptions,
+)
 from nucliadb_models.common import FieldTypeName
-from nucliadb_models.resource import ExtractedDataTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, Resource
 from nucliadb_models.search import (
     SCORE_TYPE,
     FindField,
-    FindParagraph,
     FindResource,
     KnowledgeboxFindResults,
     MinScore,
     ResourceProperties,
-    TempFindParagraph,
     TextPosition,
 )
+from nucliadb_protos.nodereader_pb2 import (
+    DocumentScored,
+    EntitiesSubgraphRequest,
+    ParagraphResult,
+    ParagraphSearchResponse,
+    RelationSearchResponse,
+    SearchResponse,
+    VectorSearchResponse,
+)
 from nucliadb_telemetry import metrics
-from . import paragraphs
 from .metrics import merge_observer
 FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
@@ -57,407 +68,413 @@ FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
 )
-def _round(x: float) -> float:
-    return round(x, ndigits=3)
-@merge_observer.wrap({"type": "set_text_value"})
-async def set_text_value(
+@merge_observer.wrap({"type": "find_merge"})
+async def build_find_response(
+    search_responses: list[SearchResponse],
+    *,
     kbid: str,
-    result_paragraph: TempFindParagraph,
-    max_operations: asyncio.Semaphore,
+    query: str,
+    relation_subgraph_query: EntitiesSubgraphRequest,
+    top_k: int,
+    min_score_bm25: float,
+    min_score_semantic: float,
+    rank_fusion_algorithm: RankFusionAlgorithm,
+    reranker: Reranker,
+    show: list[ResourceProperties] = [],
+    extracted: list[ExtractedDataTypeName] = [],
+    field_type_filter: list[FieldTypeName] = [],
     highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-    extracted_text_cache: Optional[paragraphs.ExtractedTextCache] = None,
-):
-    async with max_operations:
-        assert result_paragraph.paragraph
-        assert result_paragraph.paragraph.position
-        result_paragraph.paragraph.text = await paragraphs.get_paragraph_text(
-            kbid=kbid,
-            rid=result_paragraph.rid,
-            field=result_paragraph.field,
-            start=result_paragraph.paragraph.position.start,
-            end=result_paragraph.paragraph.position.end,
-            split=result_paragraph.split,
-            highlight=highlight,
-            ematches=ematches,
-            matches=[],  # TODO
-            extracted_text_cache=extracted_text_cache,
+) -> KnowledgeboxFindResults:
+    # merge
+    search_response = merge_shard_responses(search_responses)
+    keyword_results = keyword_results_to_text_block_matches(search_response.paragraph.results)
+    semantic_results = semantic_results_to_text_block_matches(
+        filter(
+            lambda x: x.score >= min_score_semantic,
+            search_response.vector.documents,
         )
+    )
+    merged_text_blocks: list[TextBlockMatch] = rank_fusion_algorithm.fuse(
+        keyword_results, semantic_results
+    )
+    # cut
+    # we assume pagination + predict reranker is forbidden and has been already
+    # enforced/validated by the query parsing.
+    if reranker.needs_extra_results:
+        assert reranker.window is not None, "Reranker definition must enforce this condition"
+        text_blocks_page, next_page = cut_page(merged_text_blocks, reranker.window)
+    else:
+        text_blocks_page, next_page = cut_page(merged_text_blocks, top_k)
+    # hydrate and rerank
+    resource_hydration_options = ResourceHydrationOptions(
+        show=show, extracted=extracted, field_type_filter=field_type_filter
+    )
+    text_block_hydration_options = TextBlockHydrationOptions(
+        highlight=highlight,
+        ematches=search_response.paragraph.ematches,  # type: ignore
+    )
+    reranking_options = RerankingOptions(kbid=kbid, query=query)
+    text_blocks, resources, best_matches = await hydrate_and_rerank(
+        text_blocks_page,
+        kbid,
+        resource_hydration_options=resource_hydration_options,
+        text_block_hydration_options=text_block_hydration_options,
+        reranker=reranker,
+        reranking_options=reranking_options,
+        top_k=top_k,
+    )
+    # build relations graph
+    relations = await merge_relations_results([search_response.relation], relation_subgraph_query)
+    # compose response
+    find_resources = compose_find_resources(text_blocks, resources)
+    next_page = search_response.paragraph.next_page or next_page
+    total_paragraphs = search_response.paragraph.total
+    find_results = KnowledgeboxFindResults(
+        query=query,
+        resources=find_resources,
+        best_matches=best_matches,
+        relations=relations,
+        total=total_paragraphs,
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
+        next_page=next_page,
+        min_score=MinScore(bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)),
+    )
+    return find_results
+def merge_shard_responses(
+    responses: list[SearchResponse],
+) -> SearchResponse:
+    """Merge search responses into a single response as if there were no shards
+    involved.
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+    """
+    paragraphs = []
+    vectors = []
+    relations = []
+    for response in responses:
+        paragraphs.append(response.paragraph)
+        vectors.append(response.vector)
+        relations.append(response.relation)
-@merge_observer.wrap({"type": "set_resource_metadada_value"})
-async def set_resource_metadata_value(
-    txn: Transaction,
-    kbid: str,
-    resource: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    find_resources: dict[str, FindResource],
-    max_operations: asyncio.Semaphore,
-):
-    async with max_operations:
-        serialized_resource = await managed_serialize(
-            txn,
-            kbid,
-            resource,
-            show,
-            field_type_filter=field_type_filter,
-            extracted=extracted,
-            service_name=SERVICE_NAME,
-        )
-        if serialized_resource is not None:
-            find_resources[resource].updated_from(serialized_resource)
-        else:
-            logger.warning(f"Resource {resource} not found in {kbid}")
-            find_resources.pop(resource, None)
-class Orderer:
-    def __init__(self):
-        self.boosted_items = []
-        self.items = []
-    def add(self, key: Any):
-        self.items.append(key)
-    def add_boosted(self, key: Any):
-        self.boosted_items.append(key)
-    def sorted_by_score(self) -> Iterator[Any]:
-        for key in sorted(self.items, key=lambda value: value[3], reverse=True):
-            yield key
-    def sorted_by_insertion(self) -> Iterator[Any]:
-        returned = set()
-        for key in self.boosted_items:
-            if key in returned:
-                continue
-            returned.add(key)
-            yield key
-        for key in self.items:
-            if key in returned:
-                continue
-            returned.add(key)
-            yield key
-@merge_observer.wrap({"type": "fetch_find_metadata"})
-async def fetch_find_metadata(
-    find_resources: dict[str, FindResource],
-    best_matches: list[str],
-    result_paragraphs: list[TempFindParagraph],
+    merged = SearchResponse(
+        paragraph=merge_shards_keyword_responses(paragraphs),
+        vector=merge_shards_semantic_responses(vectors),
+        relation=merge_shards_relation_responses(relations),
+    )
+    return merged
+def merge_shards_keyword_responses(
+    keyword_responses: list[ParagraphSearchResponse],
+) -> ParagraphSearchResponse:
+    """Merge keyword (paragraph) search responses into a single response as if
+    there were no shards involved.
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+    """
+    merged = ParagraphSearchResponse()
+    for response in keyword_responses:
+        merged.query = response.query
+        merged.next_page = merged.next_page or response.next_page
+        merged.total += response.total
+        merged.results.extend(response.results)
+        merged.ematches.extend(response.ematches)
+    return merged
+def merge_shards_semantic_responses(
+    semantic_responses: list[VectorSearchResponse],
+) -> VectorSearchResponse:
+    """Merge semantic (vector) search responses into a single response as if
+    there were no shards involved.
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+    """
+    merged = VectorSearchResponse()
+    for response in semantic_responses:
+        merged.documents.extend(response.documents)
+    return merged
+def merge_shards_relation_responses(
+    relation_responses: list[RelationSearchResponse],
+) -> RelationSearchResponse:
+    merged = RelationSearchResponse()
+    for response in relation_responses:
+        merged.prefix.nodes.extend(response.prefix.nodes)
+        merged.subgraph.relations.extend(response.subgraph.relations)
+    return merged
+def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
+    fuzzy_result = len(item.matches) > 0
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_string(item.paragraph),
+        score=item.score.bm25,
+        score_type=SCORE_TYPE.BM25,
+        order=0,  # NOTE: this will be filled later
+        text="",  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=item.start,
+            end=item.end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=fuzzy_result,
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
+    return [keyword_result_to_text_block_match(item) for item in items]
+class InvalidDocId(Exception):
+    """Raised while parsing an invalid id coming from semantic search"""
+    def __init__(self, invalid_vector_id: str):
+        self.invalid_vector_id = invalid_vector_id
+        super().__init__(f"Invalid vector ID: {invalid_vector_id}")
+def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
+    try:
+        vector_id = VectorId.from_string(item.doc_id.id)
+    except (IndexError, ValueError):
+        raise InvalidDocId(item.doc_id.id)
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_vector_id(vector_id),
+        score=item.score,
+        score_type=SCORE_TYPE.VECTOR,
+        order=0,  # NOTE: this will be filled later
+        text="",  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=vector_id.vector_start,
+            end=vector_id.vector_end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=False,  # semantic search doesn't have fuzziness
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
+    text_blocks: list[TextBlockMatch] = []
+    for item in items:
+        try:
+            text_block = semantic_result_to_text_block_match(item)
+        except InvalidDocId as exc:
+            logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
+            continue
+        text_blocks.append(text_block)
+    return text_blocks
+@merge_observer.wrap({"type": "hydrate_and_rerank"})
+async def hydrate_and_rerank(
+    text_blocks: Iterable[TextBlockMatch],
     kbid: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-):
-    txn = await get_read_only_transaction()
-    resources = set()
-    operations = []
+    *,
+    resource_hydration_options: ResourceHydrationOptions,
+    text_block_hydration_options: TextBlockHydrationOptions,
+    reranker: Reranker,
+    reranking_options: RerankingOptions,
+    top_k: int,
+) -> tuple[list[TextBlockMatch], list[Resource], list[str]]:
+    """Given a list of text blocks from a retrieval operation, hydrate and
+    rerank the results.
+    This function returns either the entire list or a subset of updated
+    (hydrated and reranked) text blocks and their corresponding resource
+    metadata. It also returns an ordered list of best matches.
+    """
     max_operations = asyncio.Semaphore(50)
-    orderer = Orderer()
-    etcache = paragraphs.ExtractedTextCache()
-    for result_paragraph in result_paragraphs:
-        if result_paragraph.paragraph is not None:
-            find_resource = find_resources.setdefault(
-                result_paragraph.rid, FindResource(id=result_paragraph.id, fields={})
-            )
-            find_field = find_resource.fields.setdefault(
-                result_paragraph.field, FindField(paragraphs={})
-            )
-            if result_paragraph.paragraph.id in find_field.paragraphs:
-                # Its a multiple match, push the score
-                # find_field.paragraphs[result_paragraph.paragraph.id].score = 25
-                if (
-                    find_field.paragraphs[result_paragraph.paragraph.id].score
-                    < result_paragraph.paragraph.score
-                ):
-                    # Use Vector score if there are both
-                    find_field.paragraphs[result_paragraph.paragraph.id].score = (
-                        result_paragraph.paragraph.score * 2
-                    )
-                    orderer.add(
-                        (
-                            result_paragraph.rid,
-                            result_paragraph.field,
-                            result_paragraph.paragraph.id,
-                            result_paragraph.paragraph.score,
-                        )
-                    )
-                find_field.paragraphs[
-                    result_paragraph.paragraph.id
-                ].score_type = SCORE_TYPE.BOTH
-            else:
-                find_field.paragraphs[
-                    result_paragraph.paragraph.id
-                ] = result_paragraph.paragraph
-                orderer.add(
-                    (
-                        result_paragraph.rid,
-                        result_paragraph.field,
-                        result_paragraph.paragraph.id,
-                        result_paragraph.paragraph.score,
+    # Iterate text blocks and create text block and resource metadata hydration
+    # tasks depending on the reranker
+    text_blocks_by_id: dict[str, TextBlockMatch] = {}  # useful for faster access to text blocks later
+    resource_hydration_ops = {}
+    text_block_hydration_ops = []
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        paragraph_id = text_block.paragraph_id.full()
+        # If we find multiple results (from different indexes) with different
+        # metadata, this statement will only keep the metadata from the first
+        # one in the list. We assume metadata is the same on all indexes;
+        # otherwise this would be a BUG
+        text_blocks_by_id.setdefault(paragraph_id, text_block)
+        # rerankers that need extra results may end up with fewer resources
+        # than the ones we see now, so we'll skip this step and recompute the
+        # resources later
+        if not reranker.needs_extra_results:
+            if rid not in resource_hydration_ops:
+                resource_hydration_ops[rid] = asyncio.create_task(
+                    hydrate_resource_metadata(
+                        kbid,
+                        rid,
+                        options=resource_hydration_options,
+                        concurrency_control=max_operations,
+                        service_name=SERVICE_NAME,
                     )
                 )
-            operations.append(
-                asyncio.create_task(
-                    set_text_value(
-                        kbid=kbid,
-                        result_paragraph=result_paragraph,
-                        highlight=highlight,
-                        ematches=ematches,
-                        max_operations=max_operations,
-                        extracted_text_cache=etcache,
-                    )
-                )
-            )
-            resources.add(result_paragraph.rid)
-    etcache.clear()
-    for order, (rid, field_id, paragraph_id, _) in enumerate(orderer.sorted_by_score()):
-        find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
-        best_matches.append(paragraph_id)
-    for resource in resources:
-        operations.append(
+        text_block_hydration_ops.append(
             asyncio.create_task(
-                set_resource_metadata_value(
-                    txn,
-                    kbid=kbid,
-                    resource=resource,
-                    show=show,
-                    field_type_filter=field_type_filter,
-                    extracted=extracted,
-                    find_resources=find_resources,
-                    max_operations=max_operations,
+                hydrate_text_block(
+                    kbid,
+                    text_block,
+                    text_block_hydration_options,
+                    concurrency_control=max_operations,
                 )
             )
         )
-    FIND_FETCH_OPS_DISTRIBUTION.observe(len(operations))
-    if len(operations) > 0:
-        done, _ = await asyncio.wait(operations)  # type: ignore
-        for task in done:
-            if task.exception() is not None:  # pragma: no cover
-                logger.error("Error fetching find metadata", exc_info=task.exception())
-@merge_observer.wrap({"type": "merge_paragraphs_vectors"})
-def merge_paragraphs_vectors(
-    paragraphs_shards: list[list[ParagraphResult]],
-    vectors_shards: list[list[DocumentScored]],
-    count: int,
-    page: int,
-    min_score: float,
-) -> tuple[list[TempFindParagraph], bool]:
-    merged_paragrahs: list[TempFindParagraph] = []
-    # We assume that paragraphs_shards and vectors_shards are already ordered
-    for paragraphs_shard in paragraphs_shards:
-        for paragraph in paragraphs_shard:
-            fuzzy_result = len(paragraph.matches) > 0
-            merged_paragrahs.append(
-                TempFindParagraph(
-                    paragraph_index=paragraph,
-                    field=paragraph.field,
-                    rid=paragraph.uuid,
-                    score=paragraph.score.bm25,
-                    start=paragraph.start,
-                    split=paragraph.split,
-                    end=paragraph.end,
-                    id=paragraph.paragraph,
-                    fuzzy_result=fuzzy_result,
-                    page_with_visual=paragraph.metadata.page_with_visual,
-                    reference=paragraph.metadata.representation.file,
-                    is_a_table=paragraph.metadata.representation.is_a_table,
+    # hydrate only what is strictly needed before reranking
+    hydrated_text_blocks: list[TextBlockMatch]
+    hydrated_resources: list[Union[Resource, None]]
+    ops = [
+        *text_block_hydration_ops,
+        *resource_hydration_ops.values(),
+    ]
+    FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+    results = await asyncio.gather(*ops)
+    hydrated_text_blocks = results[: len(text_block_hydration_ops)]  # type: ignore
+    hydrated_resources = results[len(text_block_hydration_ops) :]  # type: ignore
+    # with the hydrated text, rerank and apply new scores to the text blocks
+    to_rerank = [
+        RerankableItem(
+            id=text_block.paragraph_id.full(),
+            score=text_block.score,
+            score_type=text_block.score_type,
+            content=text_block.text or "",  # TODO: add a warning, this shouldn't usually happen
+        )
+        for text_block in hydrated_text_blocks
+    ]
+    reranked = await reranker.rerank(to_rerank, reranking_options)
+    # after reranking, we can cut to the number of results the user wants, so we
+    # don't hydrate unnecessary stuff
+    reranked = reranked[:top_k]
+    matches = []
+    for item in reranked:
+        paragraph_id = item.id
+        score = item.score
+        score_type = item.score_type
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.score = score
+        text_block.score_type = score_type
+        matches.append((paragraph_id, score))
+    matches.sort(key=lambda x: x[1], reverse=True)
+    best_matches = []
+    best_text_blocks = []
+    resource_hydration_ops = {}
+    for order, (paragraph_id, _) in enumerate(matches):
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.order = order
+        best_matches.append(paragraph_id)
+        best_text_blocks.append(text_block)
+        # now we have removed the text block surplus, fetch resource metadata
+        if reranker.needs_extra_results:
+            rid = ParagraphId.from_string(paragraph_id).rid
+            if rid not in resource_hydration_ops:
+                resource_hydration_ops[rid] = asyncio.create_task(
+                    hydrate_resource_metadata(
+                        kbid,
+                        rid,
+                        options=resource_hydration_options,
+                        concurrency_control=max_operations,
+                        service_name=SERVICE_NAME,
+                    )
                 )
-            )
-    # merged_paragrahs.sort(key=lambda r: r.score, reverse=True)
+    # Finally, fetch resource metadata if we haven't already done it
+    if reranker.needs_extra_results:
+        ops = list(resource_hydration_ops.values())
+        FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+        hydrated_resources = await asyncio.gather(*ops)  # type: ignore
-    nextpos = 1
-    for vectors_shard in vectors_shards:
-        for vector in vectors_shard:
-            if vector.score < min_score:
-                logger.warning(
-                    f"Skipping low score vector: {vector.doc_id.id}. This should not happen"
-                )
-                continue
-            doc_id_split = vector.doc_id.id.split("/")
-            split = None
-            if len(doc_id_split) == 5:
-                rid, field_type, field, index, position = doc_id_split
-                paragraph_id = f"{rid}/{field_type}/{field}/{position}"
-            elif len(doc_id_split) == 6:
-                rid, field_type, field, split, index, position = doc_id_split
-                paragraph_id = f"{rid}/{field_type}/{field}/{split}/{position}"
-            else:
-                logger.warning(f"Skipping invalid doc_id: {vector.doc_id.id}")
-                continue
-            start, end = position.split("-")
-            merged_paragrahs.insert(
-                nextpos,
-                TempFindParagraph(
-                    vector_index=vector,
-                    rid=rid,
-                    field=f"/{field_type}/{field}",
-                    score=vector.score,
-                    start=int(start),
-                    end=int(end),
-                    split=split,
-                    id=paragraph_id,
-                ),
-            )
-            nextpos += 3
-    # merged_paragrahs.sort(key=lambda r: r.score, reverse=True)
-    init_position = count * page
-    end_position = init_position + count
-    next_page = len(merged_paragrahs) > end_position
-    merged_paragrahs = merged_paragrahs[init_position:end_position]
-    for merged_paragraph in merged_paragrahs:
-        if merged_paragraph.vector_index is not None:
-            merged_paragraph.paragraph = FindParagraph(
-                score=merged_paragraph.vector_index.score,
-                score_type=SCORE_TYPE.VECTOR,
-                text="",
-                labels=[],  # TODO: Get labels from index
-                page_with_visual=merged_paragraph.vector_index.metadata.page_with_visual,
-                reference=merged_paragraph.vector_index.metadata.representation.file,
-                is_a_table=merged_paragraph.vector_index.metadata.representation.is_a_table,
-                position=TextPosition(
-                    page_number=merged_paragraph.vector_index.metadata.position.page_number,
-                    index=merged_paragraph.vector_index.metadata.position.index,
-                    start=merged_paragraph.start,
-                    end=merged_paragraph.end,
-                    start_seconds=[
-                        x
-                        for x in merged_paragraph.vector_index.metadata.position.start_seconds
-                    ],
-                    end_seconds=[
-                        x
-                        for x in merged_paragraph.vector_index.metadata.position.end_seconds
-                    ],
-                ),
-                id=merged_paragraph.id,
-                # Vector searches don't have fuzziness
-                fuzzy_result=False,
-            )
-        elif merged_paragraph.paragraph_index is not None:
-            merged_paragraph.paragraph = FindParagraph(
-                score=merged_paragraph.paragraph_index.score.bm25,
-                score_type=SCORE_TYPE.BM25,
-                text="",
-                labels=[x for x in merged_paragraph.paragraph_index.labels],
-                page_with_visual=merged_paragraph.paragraph_index.metadata.page_with_visual,
-                reference=merged_paragraph.paragraph_index.metadata.representation.file,
-                is_a_table=merged_paragraph.paragraph_index.metadata.representation.is_a_table,
-                position=TextPosition(
-                    page_number=merged_paragraph.paragraph_index.metadata.position.page_number,
-                    index=merged_paragraph.paragraph_index.metadata.position.index,
-                    start=merged_paragraph.start,
-                    end=merged_paragraph.end,
-                    start_seconds=[
-                        x
-                        for x in merged_paragraph.paragraph_index.metadata.position.start_seconds
-                    ],
-                    end_seconds=[
-                        x
-                        for x in merged_paragraph.paragraph_index.metadata.position.end_seconds
-                    ],
-                ),
-                id=merged_paragraph.id,
-                fuzzy_result=merged_paragraph.fuzzy_result,
-            )
-    return merged_paragrahs, next_page
+    resources = [resource for resource in hydrated_resources if resource is not None]
+    return best_text_blocks, resources, best_matches
-@merge_observer.wrap({"type": "find_merge"})
-async def find_merge_results(
-    search_responses: list[SearchResponse],
-    count: int,
-    page: int,
-    kbid: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    requested_relations: EntitiesSubgraphRequest,
-    min_score_bm25: float,
-    min_score_semantic: float,
-    highlight: bool = False,
-) -> KnowledgeboxFindResults:
-    # force getting transaction on current asyncio task
-    # so all sub tasks will use the same transaction
-    # this is contextvar magic that is probably not ideal
-    await get_read_only_transaction()
-    paragraphs: list[list[ParagraphResult]] = []
-    vectors: list[list[DocumentScored]] = []
-    relations = []
+def compose_find_resources(
+    text_blocks: list[TextBlockMatch],
+    resources: list[Resource],
+) -> dict[str, FindResource]:
+    find_resources: dict[str, FindResource] = {}
-    next_page = True
-    ematches: list[str] = []
-    real_query = ""
-    total_paragraphs = 0
-    for response in search_responses:
-        # Iterate over answers from different logic shards
+    for resource in resources:
+        rid = resource.id
+        if rid not in find_resources:
+            find_resources[rid] = FindResource(id=rid, fields={})
+            find_resources[rid].updated_from(resource)
-        ematches.extend(response.paragraph.ematches)
-        real_query = response.paragraph.query
-        next_page = next_page and response.paragraph.next_page
-        total_paragraphs += response.paragraph.total
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        if rid not in find_resources:
+            # resource not found in db, skipping
+            continue
-        paragraphs.append(cast(list[ParagraphResult], response.paragraph.results))
-        vectors.append(cast(list[DocumentScored], response.vector.documents))
+        find_resource = find_resources[rid]
+        field_id = text_block.paragraph_id.field_id.short_without_subfield()
+        find_field = find_resource.fields.setdefault(field_id, FindField(paragraphs={}))
-        relations.append(response.relation)
+        paragraph_id = text_block.paragraph_id.full()
+        find_paragraph = text_block_to_find_paragraph(text_block)
-    rcache = get_resource_cache(clear=True)
+        find_field.paragraphs[paragraph_id] = find_paragraph
-    try:
-        result_paragraphs, merged_next_page = merge_paragraphs_vectors(
-            paragraphs, vectors, count, page, min_score_semantic
-        )
-        next_page = next_page or merged_next_page
-        api_results = KnowledgeboxFindResults(
-            resources={},
-            query=real_query,
-            total=total_paragraphs,
-            page_number=page,
-            page_size=count,
-            next_page=next_page,
-            min_score=MinScore(
-                bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)
-            ),
-            best_matches=[],
-        )
+    return find_resources
-        await fetch_find_metadata(
-            api_results.resources,
-            api_results.best_matches,
-            result_paragraphs,
-            kbid,
-            show,
-            field_type_filter,
-            extracted,
-            highlight,
-            ematches,
-        )
-        api_results.relations = await merge_relations_results(
-            relations, requested_relations
-        )
-        return api_results
-    finally:
-        rcache.clear()
+def _round(x: float) -> float:
+    return round(x, ndigits=3)

nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl