PyPI - nucliadb - Versions diffs - 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl - Mend

nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (431) hide show

migrations/0002_rollover_shards.py +1 -2
migrations/0003_allfields_key.py +2 -37
migrations/0004_rollover_shards.py +1 -2
migrations/0005_rollover_shards.py +1 -2
migrations/0006_rollover_shards.py +2 -4
migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
migrations/0010_fix_corrupt_indexes.py +11 -12
migrations/0011_materialize_labelset_ids.py +2 -18
migrations/0012_rollover_shards.py +6 -12
migrations/0013_rollover_shards.py +2 -4
migrations/0014_rollover_shards.py +5 -7
migrations/0015_targeted_rollover.py +6 -12
migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
migrations/0017_multiple_writable_shards.py +3 -6
migrations/0018_purge_orphan_kbslugs.py +59 -0
migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
migrations/0020_drain_nodes_from_cluster.py +83 -0
nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
migrations/0023_backfill_pg_catalog.py +80 -0
migrations/0025_assign_models_to_kbs_v2.py +113 -0
migrations/0026_fix_high_cardinality_content_types.py +61 -0
migrations/0027_rollover_texts3.py +73 -0
nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
migrations/pg/0002_catalog.py +42 -0
nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
nucliadb/common/cluster/base.py +41 -24
nucliadb/common/cluster/discovery/base.py +6 -14
nucliadb/common/cluster/discovery/k8s.py +9 -19
nucliadb/common/cluster/discovery/manual.py +1 -3
nucliadb/common/cluster/discovery/single.py +1 -2
nucliadb/common/cluster/discovery/utils.py +1 -3
nucliadb/common/cluster/grpc_node_dummy.py +11 -16
nucliadb/common/cluster/index_node.py +10 -19
nucliadb/common/cluster/manager.py +223 -102
nucliadb/common/cluster/rebalance.py +42 -37
nucliadb/common/cluster/rollover.py +377 -204
nucliadb/common/cluster/settings.py +16 -9
nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
nucliadb/common/cluster/standalone/index_node.py +4 -11
nucliadb/common/cluster/standalone/service.py +2 -6
nucliadb/common/cluster/standalone/utils.py +9 -6
nucliadb/common/cluster/utils.py +43 -29
nucliadb/common/constants.py +20 -0
nucliadb/common/context/__init__.py +6 -4
nucliadb/common/context/fastapi.py +8 -5
nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
nucliadb/common/datamanagers/__init__.py +24 -5
nucliadb/common/datamanagers/atomic.py +102 -0
nucliadb/common/datamanagers/cluster.py +5 -5
nucliadb/common/datamanagers/entities.py +6 -16
nucliadb/common/datamanagers/fields.py +84 -0
nucliadb/common/datamanagers/kb.py +101 -24
nucliadb/common/datamanagers/labels.py +26 -56
nucliadb/common/datamanagers/processing.py +2 -6
nucliadb/common/datamanagers/resources.py +214 -117
nucliadb/common/datamanagers/rollover.py +77 -16
nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
nucliadb/common/datamanagers/utils.py +19 -11
nucliadb/common/datamanagers/vectorsets.py +110 -0
nucliadb/common/external_index_providers/base.py +257 -0
nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
nucliadb/common/external_index_providers/manager.py +101 -0
nucliadb/common/external_index_providers/pinecone.py +933 -0
nucliadb/common/external_index_providers/settings.py +52 -0
nucliadb/common/http_clients/auth.py +3 -6
nucliadb/common/http_clients/processing.py +6 -11
nucliadb/common/http_clients/utils.py +1 -3
nucliadb/common/ids.py +240 -0
nucliadb/common/locking.py +43 -13
nucliadb/common/maindb/driver.py +11 -35
nucliadb/common/maindb/exceptions.py +6 -6
nucliadb/common/maindb/local.py +22 -9
nucliadb/common/maindb/pg.py +206 -111
nucliadb/common/maindb/utils.py +13 -44
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +260 -0
nucliadb/export_import/datamanager.py +25 -19
nucliadb/export_import/exceptions.py +8 -0
nucliadb/export_import/exporter.py +20 -7
nucliadb/export_import/importer.py +6 -11
nucliadb/export_import/models.py +5 -5
nucliadb/export_import/tasks.py +4 -4
nucliadb/export_import/utils.py +94 -54
nucliadb/health.py +1 -3
nucliadb/ingest/app.py +15 -11
nucliadb/ingest/consumer/auditing.py +30 -147
nucliadb/ingest/consumer/consumer.py +96 -52
nucliadb/ingest/consumer/materializer.py +10 -12
nucliadb/ingest/consumer/pull.py +12 -27
nucliadb/ingest/consumer/service.py +20 -19
nucliadb/ingest/consumer/shard_creator.py +7 -14
nucliadb/ingest/consumer/utils.py +1 -3
nucliadb/ingest/fields/base.py +139 -188
nucliadb/ingest/fields/conversation.py +18 -5
nucliadb/ingest/fields/exceptions.py +1 -4
nucliadb/ingest/fields/file.py +7 -25
nucliadb/ingest/fields/link.py +11 -16
nucliadb/ingest/fields/text.py +9 -4
nucliadb/ingest/orm/brain.py +255 -262
nucliadb/ingest/orm/broker_message.py +181 -0
nucliadb/ingest/orm/entities.py +36 -51
nucliadb/ingest/orm/exceptions.py +12 -0
nucliadb/ingest/orm/knowledgebox.py +334 -278
nucliadb/ingest/orm/processor/__init__.py +2 -697
nucliadb/ingest/orm/processor/auditing.py +117 -0
nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
nucliadb/ingest/orm/processor/processor.py +752 -0
nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
nucliadb/ingest/orm/resource.py +280 -520
nucliadb/ingest/orm/utils.py +25 -31
nucliadb/ingest/partitions.py +3 -9
nucliadb/ingest/processing.py +76 -81
nucliadb/ingest/py.typed +0 -0
nucliadb/ingest/serialize.py +37 -173
nucliadb/ingest/service/__init__.py +1 -3
nucliadb/ingest/service/writer.py +186 -577
nucliadb/ingest/settings.py +13 -22
nucliadb/ingest/utils.py +3 -6
nucliadb/learning_proxy.py +264 -51
nucliadb/metrics_exporter.py +30 -19
nucliadb/middleware/__init__.py +1 -3
nucliadb/migrator/command.py +1 -3
nucliadb/migrator/datamanager.py +13 -13
nucliadb/migrator/migrator.py +57 -37
nucliadb/migrator/settings.py +2 -1
nucliadb/migrator/utils.py +18 -10
nucliadb/purge/__init__.py +139 -33
nucliadb/purge/orphan_shards.py +7 -13
nucliadb/reader/__init__.py +1 -3
nucliadb/reader/api/models.py +3 -14
nucliadb/reader/api/v1/__init__.py +0 -1
nucliadb/reader/api/v1/download.py +27 -94
nucliadb/reader/api/v1/export_import.py +4 -4
nucliadb/reader/api/v1/knowledgebox.py +13 -13
nucliadb/reader/api/v1/learning_config.py +8 -12
nucliadb/reader/api/v1/resource.py +67 -93
nucliadb/reader/api/v1/services.py +70 -125
nucliadb/reader/app.py +16 -46
nucliadb/reader/lifecycle.py +18 -4
nucliadb/reader/py.typed +0 -0
nucliadb/reader/reader/notifications.py +10 -31
nucliadb/search/__init__.py +1 -3
nucliadb/search/api/v1/__init__.py +2 -2
nucliadb/search/api/v1/ask.py +112 -0
nucliadb/search/api/v1/catalog.py +184 -0
nucliadb/search/api/v1/feedback.py +17 -25
nucliadb/search/api/v1/find.py +41 -41
nucliadb/search/api/v1/knowledgebox.py +90 -62
nucliadb/search/api/v1/predict_proxy.py +2 -2
nucliadb/search/api/v1/resource/ask.py +66 -117
nucliadb/search/api/v1/resource/search.py +51 -72
nucliadb/search/api/v1/router.py +1 -0
nucliadb/search/api/v1/search.py +50 -197
nucliadb/search/api/v1/suggest.py +40 -54
nucliadb/search/api/v1/summarize.py +9 -5
nucliadb/search/api/v1/utils.py +2 -1
nucliadb/search/app.py +16 -48
nucliadb/search/lifecycle.py +10 -3
nucliadb/search/predict.py +176 -188
nucliadb/search/py.typed +0 -0
nucliadb/search/requesters/utils.py +41 -63
nucliadb/search/search/cache.py +149 -20
nucliadb/search/search/chat/ask.py +918 -0
nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
nucliadb/search/search/chat/images.py +41 -17
nucliadb/search/search/chat/prompt.py +851 -282
nucliadb/search/search/chat/query.py +274 -267
nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
nucliadb/search/search/fetch.py +43 -36
nucliadb/search/search/filters.py +9 -15
nucliadb/search/search/find.py +214 -54
nucliadb/search/search/find_merge.py +408 -391
nucliadb/search/search/hydrator.py +191 -0
nucliadb/search/search/merge.py +198 -234
nucliadb/search/search/metrics.py +73 -2
nucliadb/search/search/paragraphs.py +64 -106
nucliadb/search/search/pgcatalog.py +233 -0
nucliadb/search/search/predict_proxy.py +1 -1
nucliadb/search/search/query.py +386 -257
nucliadb/search/search/query_parser/exceptions.py +22 -0
nucliadb/search/search/query_parser/models.py +101 -0
nucliadb/search/search/query_parser/parser.py +183 -0
nucliadb/search/search/rank_fusion.py +204 -0
nucliadb/search/search/rerankers.py +270 -0
nucliadb/search/search/shards.py +4 -38
nucliadb/search/search/summarize.py +14 -18
nucliadb/search/search/utils.py +27 -4
nucliadb/search/settings.py +15 -1
nucliadb/standalone/api_router.py +4 -10
nucliadb/standalone/app.py +17 -14
nucliadb/standalone/auth.py +7 -21
nucliadb/standalone/config.py +9 -12
nucliadb/standalone/introspect.py +5 -5
nucliadb/standalone/lifecycle.py +26 -25
nucliadb/standalone/migrations.py +58 -0
nucliadb/standalone/purge.py +9 -8
nucliadb/standalone/py.typed +0 -0
nucliadb/standalone/run.py +25 -18
nucliadb/standalone/settings.py +10 -14
nucliadb/standalone/versions.py +15 -5
nucliadb/tasks/consumer.py +8 -12
nucliadb/tasks/producer.py +7 -6
nucliadb/tests/config.py +53 -0
nucliadb/train/__init__.py +1 -3
nucliadb/train/api/utils.py +1 -2
nucliadb/train/api/v1/shards.py +2 -2
nucliadb/train/api/v1/trainset.py +4 -6
nucliadb/train/app.py +14 -47
nucliadb/train/generator.py +10 -19
nucliadb/train/generators/field_classifier.py +7 -19
nucliadb/train/generators/field_streaming.py +156 -0
nucliadb/train/generators/image_classifier.py +12 -18
nucliadb/train/generators/paragraph_classifier.py +5 -9
nucliadb/train/generators/paragraph_streaming.py +6 -9
nucliadb/train/generators/question_answer_streaming.py +19 -20
nucliadb/train/generators/sentence_classifier.py +9 -15
nucliadb/train/generators/token_classifier.py +45 -36
nucliadb/train/generators/utils.py +14 -18
nucliadb/train/lifecycle.py +7 -3
nucliadb/train/nodes.py +23 -32
nucliadb/train/py.typed +0 -0
nucliadb/train/servicer.py +13 -21
nucliadb/train/settings.py +2 -6
nucliadb/train/types.py +13 -10
nucliadb/train/upload.py +3 -6
nucliadb/train/uploader.py +20 -25
nucliadb/train/utils.py +1 -1
nucliadb/writer/__init__.py +1 -3
nucliadb/writer/api/constants.py +0 -5
nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
nucliadb/writer/api/v1/export_import.py +102 -49
nucliadb/writer/api/v1/field.py +196 -620
nucliadb/writer/api/v1/knowledgebox.py +221 -71
nucliadb/writer/api/v1/learning_config.py +2 -2
nucliadb/writer/api/v1/resource.py +114 -216
nucliadb/writer/api/v1/services.py +64 -132
nucliadb/writer/api/v1/slug.py +61 -0
nucliadb/writer/api/v1/transaction.py +67 -0
nucliadb/writer/api/v1/upload.py +184 -215
nucliadb/writer/app.py +11 -61
nucliadb/writer/back_pressure.py +62 -43
nucliadb/writer/exceptions.py +0 -4
nucliadb/writer/lifecycle.py +21 -15
nucliadb/writer/py.typed +0 -0
nucliadb/writer/resource/audit.py +2 -1
nucliadb/writer/resource/basic.py +48 -62
nucliadb/writer/resource/field.py +45 -135
nucliadb/writer/resource/origin.py +1 -2
nucliadb/writer/settings.py +14 -5
nucliadb/writer/tus/__init__.py +17 -15
nucliadb/writer/tus/azure.py +111 -0
nucliadb/writer/tus/dm.py +17 -5
nucliadb/writer/tus/exceptions.py +1 -3
nucliadb/writer/tus/gcs.py +56 -84
nucliadb/writer/tus/local.py +21 -37
nucliadb/writer/tus/s3.py +28 -68
nucliadb/writer/tus/storage.py +5 -56
nucliadb/writer/vectorsets.py +125 -0
nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
nucliadb/common/maindb/redis.py +0 -194
nucliadb/common/maindb/tikv.py +0 -412
nucliadb/ingest/fields/layout.py +0 -58
nucliadb/ingest/tests/conftest.py +0 -30
nucliadb/ingest/tests/fixtures.py +0 -771
nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
nucliadb/ingest/tests/unit/test_partitions.py +0 -40
nucliadb/ingest/tests/unit/test_processing.py +0 -171
nucliadb/middleware/transaction.py +0 -117
nucliadb/reader/api/v1/learning_collector.py +0 -63
nucliadb/reader/tests/__init__.py +0 -19
nucliadb/reader/tests/conftest.py +0 -31
nucliadb/reader/tests/fixtures.py +0 -136
nucliadb/reader/tests/test_list_resources.py +0 -75
nucliadb/reader/tests/test_reader_file_download.py +0 -273
nucliadb/reader/tests/test_reader_resource.py +0 -379
nucliadb/reader/tests/test_reader_resource_field.py +0 -219
nucliadb/search/api/v1/chat.py +0 -258
nucliadb/search/api/v1/resource/chat.py +0 -94
nucliadb/search/tests/__init__.py +0 -19
nucliadb/search/tests/conftest.py +0 -33
nucliadb/search/tests/fixtures.py +0 -199
nucliadb/search/tests/node.py +0 -465
nucliadb/search/tests/unit/__init__.py +0 -18
nucliadb/search/tests/unit/api/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
nucliadb/search/tests/unit/search/__init__.py +0 -18
nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
nucliadb/search/tests/unit/search/search/__init__.py +0 -19
nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
nucliadb/search/tests/unit/search/test_fetch.py +0 -108
nucliadb/search/tests/unit/search/test_filters.py +0 -125
nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
nucliadb/search/tests/unit/search/test_query.py +0 -201
nucliadb/search/tests/unit/test_app.py +0 -79
nucliadb/search/tests/unit/test_find_merge.py +0 -112
nucliadb/search/tests/unit/test_merge.py +0 -34
nucliadb/search/tests/unit/test_predict.py +0 -584
nucliadb/standalone/tests/__init__.py +0 -19
nucliadb/standalone/tests/conftest.py +0 -33
nucliadb/standalone/tests/fixtures.py +0 -38
nucliadb/standalone/tests/unit/__init__.py +0 -18
nucliadb/standalone/tests/unit/test_api_router.py +0 -61
nucliadb/standalone/tests/unit/test_auth.py +0 -169
nucliadb/standalone/tests/unit/test_introspect.py +0 -35
nucliadb/standalone/tests/unit/test_versions.py +0 -68
nucliadb/tests/benchmarks/__init__.py +0 -19
nucliadb/tests/benchmarks/test_search.py +0 -99
nucliadb/tests/conftest.py +0 -32
nucliadb/tests/fixtures.py +0 -736
nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
nucliadb/tests/migrations/__init__.py +0 -19
nucliadb/tests/migrations/test_migration_0017.py +0 -80
nucliadb/tests/tikv.py +0 -240
nucliadb/tests/unit/__init__.py +0 -19
nucliadb/tests/unit/common/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
nucliadb/tests/unit/common/maindb/__init__.py +0 -18
nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
nucliadb/tests/unit/common/test_context.py +0 -36
nucliadb/tests/unit/export_import/__init__.py +0 -19
nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
nucliadb/tests/unit/export_import/test_utils.py +0 -294
nucliadb/tests/unit/migrator/__init__.py +0 -19
nucliadb/tests/unit/migrator/test_migrator.py +0 -87
nucliadb/tests/unit/tasks/__init__.py +0 -19
nucliadb/tests/unit/tasks/conftest.py +0 -42
nucliadb/tests/unit/tasks/test_consumer.py +0 -93
nucliadb/tests/unit/tasks/test_producer.py +0 -95
nucliadb/tests/unit/tasks/test_tasks.py +0 -60
nucliadb/tests/unit/test_field_ids.py +0 -49
nucliadb/tests/unit/test_health.py +0 -84
nucliadb/tests/unit/test_kb_slugs.py +0 -54
nucliadb/tests/unit/test_learning_proxy.py +0 -252
nucliadb/tests/unit/test_metrics_exporter.py +0 -77
nucliadb/tests/unit/test_purge.py +0 -138
nucliadb/tests/utils/__init__.py +0 -74
nucliadb/tests/utils/aiohttp_session.py +0 -44
nucliadb/tests/utils/broker_messages/__init__.py +0 -167
nucliadb/tests/utils/broker_messages/fields.py +0 -181
nucliadb/tests/utils/broker_messages/helpers.py +0 -33
nucliadb/tests/utils/entities.py +0 -78
nucliadb/train/api/v1/check.py +0 -60
nucliadb/train/tests/__init__.py +0 -19
nucliadb/train/tests/conftest.py +0 -29
nucliadb/train/tests/fixtures.py +0 -342
nucliadb/train/tests/test_field_classification.py +0 -122
nucliadb/train/tests/test_get_entities.py +0 -80
nucliadb/train/tests/test_get_info.py +0 -51
nucliadb/train/tests/test_get_ontology.py +0 -34
nucliadb/train/tests/test_get_ontology_count.py +0 -63
nucliadb/train/tests/test_image_classification.py +0 -222
nucliadb/train/tests/test_list_fields.py +0 -39
nucliadb/train/tests/test_list_paragraphs.py +0 -73
nucliadb/train/tests/test_list_resources.py +0 -39
nucliadb/train/tests/test_list_sentences.py +0 -71
nucliadb/train/tests/test_paragraph_classification.py +0 -123
nucliadb/train/tests/test_paragraph_streaming.py +0 -118
nucliadb/train/tests/test_question_answer_streaming.py +0 -239
nucliadb/train/tests/test_sentence_classification.py +0 -143
nucliadb/train/tests/test_token_classification.py +0 -136
nucliadb/train/tests/utils.py +0 -108
nucliadb/writer/layouts/__init__.py +0 -51
nucliadb/writer/layouts/v1.py +0 -59
nucliadb/writer/resource/vectors.py +0 -120
nucliadb/writer/tests/__init__.py +0 -19
nucliadb/writer/tests/conftest.py +0 -31
nucliadb/writer/tests/fixtures.py +0 -192
nucliadb/writer/tests/test_fields.py +0 -486
nucliadb/writer/tests/test_files.py +0 -743
nucliadb/writer/tests/test_knowledgebox.py +0 -49
nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
nucliadb/writer/tests/test_resources.py +0 -546
nucliadb/writer/tests/test_service.py +0 -137
nucliadb/writer/tests/test_tus.py +0 -203
nucliadb/writer/tests/utils.py +0 -35
nucliadb/writer/tus/pg.py +0 -125
nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
{nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
/nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
/nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
/nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
/nucliadb/{ingest/tests → tests}/vectors.py +0 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
{nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0

nucliadb/search/search/chat/prompt.py CHANGED Viewed

@@ -17,32 +17,55 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+import asyncio
+import copy
+from collections import deque
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Sequence, Tuple
+from typing import Deque, Dict, List, Optional, Sequence, Tuple, Union, cast
+import yaml
+from pydantic import BaseModel
+from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, FieldId, ParagraphId
+from nucliadb.common.maindb.utils import get_driver
+from nucliadb.common.models_utils import from_proto
 from nucliadb.ingest.fields.base import Field
 from nucliadb.ingest.fields.conversation import Conversation
+from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
-from nucliadb.ingest.orm.resource import KB_REVERSE
-from nucliadb.ingest.orm.resource import Resource as ResourceORM
-from nucliadb.middleware.transaction import get_read_only_transaction
 from nucliadb.search import logger
-from nucliadb.search.search import paragraphs
-from nucliadb.search.search.chat.images import get_page_image, get_paragraph_image
+from nucliadb.search.search import cache
+from nucliadb.search.search.chat.images import (
+    get_file_thumbnail_image,
+    get_page_image,
+    get_paragraph_image,
+)
+from nucliadb.search.search.hydrator import hydrate_field_text, hydrate_resource_text
+from nucliadb.search.search.paragraphs import get_paragraph_text
+from nucliadb_models.metadata import Extra, Origin
 from nucliadb_models.search import (
     SCORE_TYPE,
+    ConversationalStrategy,
+    FieldExtensionStrategy,
     FindParagraph,
+    FullResourceStrategy,
+    HierarchyResourceStrategy,
     ImageRagStrategy,
     ImageRagStrategyName,
-    KnowledgeboxFindResults,
+    MetadataExtensionStrategy,
+    MetadataExtensionType,
+    NeighbouringParagraphsStrategy,
+    PageImageStrategy,
+    ParagraphImageStrategy,
     PromptContext,
     PromptContextImages,
     PromptContextOrder,
     RagStrategy,
     RagStrategyName,
+    TableImageStrategy,
 )
 from nucliadb_protos import resources_pb2
-from nucliadb_utils.asyncio_utils import ConcurrentRunner, run_concurrently
+from nucliadb_utils.asyncio_utils import run_concurrently
 from nucliadb_utils.utilities import get_storage
 MAX_RESOURCE_TASKS = 5
@@ -53,12 +76,20 @@ MAX_RESOURCE_FIELD_TASKS = 4
 # The hope here is it will be enough to get the answer to the question.
 CONVERSATION_MESSAGE_CONTEXT_EXPANSION = 15
+TextBlockId = Union[ParagraphId, FieldId]
+class ParagraphIdNotFoundInExtractedMetadata(Exception):
+    pass
 class CappedPromptContext:
     """
-    Class to keep track of the size of the prompt context and raise an exception if it exceeds the configured limit.
+    Class to keep track of the size (in number of characters) of the prompt context
+    and raise an exception if it exceeds the configured limit.
-    This class will automatically trim data that exceeds the limit when it's being set on the dictionary.
+    This class will automatically trim data that exceeds the limit when it's being
+    set on the dictionary.
     """
     def __init__(self, max_size: Optional[int]):
@@ -68,15 +99,26 @@ class CappedPromptContext:
         self._size = 0
     def __setitem__(self, key: str, value: str) -> None:
+        prev_value_len = len(self.output.get(key, ""))
         if self.max_size is None:
-            self.output[key] = value
+            # Unbounded size context
+            to_add = value
         else:
-            existing_len = len(self.output.get(key, ""))
-            self._size -= existing_len
-            size_available = self.max_size - self._size
-            if size_available > 0:
-                self.output[key] = value[:size_available]
-                self._size += len(self.output[key])
+            # Make sure we don't exceed the max size
+            size_available = max(self.max_size - self._size + prev_value_len, 0)
+            to_add = value[:size_available]
+        self.output[key] = to_add
+        self._size = self._size - prev_value_len + len(to_add)
+    def __getitem__(self, key: str) -> str:
+        return self.output.__getitem__(key)
+    def __delitem__(self, key: str) -> None:
+        value = self.output.pop(key, "")
+        self._size -= len(value)
+    def text_block_ids(self) -> list[str]:
+        return list(self.output.keys())
     @property
     def size(self) -> int:
@@ -91,15 +133,15 @@ async def get_next_conversation_messages(
     num_messages: int,
     message_type: Optional[resources_pb2.Message.MessageType.ValueType] = None,
     msg_to: Optional[str] = None,
-):
+) -> List[resources_pb2.Message]:
     output = []
     cmetadata = await field_obj.get_metadata()
     for current_page in range(page, cmetadata.pages + 1):
         conv = await field_obj.db_get_value(current_page)
         for message in conv.messages[start_idx:]:
-            if message_type is not None and message.type != message_type:
+            if message_type is not None and message.type != message_type:  # pragma: no cover
                 continue
-            if msg_to is not None and msg_to not in message.to:
+            if msg_to is not None and msg_to not in message.to:  # pragma: no cover
                 continue
             output.append(message)
             if len(output) >= num_messages:
@@ -122,16 +164,21 @@ async def find_conversation_message(
 async def get_expanded_conversation_messages(
-    *, kb: KnowledgeBoxORM, rid: str, field_id: str, mident: str
+    *,
+    kb: KnowledgeBoxORM,
+    rid: str,
+    field_id: str,
+    mident: str,
+    max_messages: int = CONVERSATION_MESSAGE_CONTEXT_EXPANSION,
 ) -> list[resources_pb2.Message]:
     resource = await kb.get(rid)
-    if resource is None:
+    if resource is None:  # pragma: no cover
         return []
-    field_obj = await resource.get_field(field_id, KB_REVERSE["c"], load=True)
+    field_obj: Conversation = await resource.get_field(field_id, FIELD_TYPE_STR_TO_PB["c"], load=True)  # type: ignore
     found_message, found_page, found_idx = await find_conversation_message(
         field_obj=field_obj, mident=mident
     )
-    if found_message is None:
+    if found_message is None:  # pragma: no cover
         return []
     elif found_message.type == resources_pb2.Message.MessageType.QUESTION:
         # only try to get answer if it was a question
@@ -147,14 +194,14 @@ async def get_expanded_conversation_messages(
             field_obj=field_obj,
             page=found_page,
             start_idx=found_idx + 1,
-            num_messages=CONVERSATION_MESSAGE_CONTEXT_EXPANSION,
+            num_messages=max_messages,
         )
 async def default_prompt_context(
     context: CappedPromptContext,
     kbid: str,
-    results: KnowledgeboxFindResults,
+    ordered_paragraphs: list[FindParagraph],
 ) -> None:
     """
     - Updates context (which is an ordered dict of text_block_id -> context_text).
@@ -166,128 +213,253 @@ async def default_prompt_context(
     - Using an dict prevents from duplicates pulled in through conversation expansion.
     """
     # Sort retrieved paragraphs by decreasing order (most relevant first)
-    ordered_paras = get_ordered_paragraphs(results)
-    txn = await get_read_only_transaction()
-    storage = await get_storage()
-    kb = KnowledgeBoxORM(txn, storage, kbid)
-    for paragraph in ordered_paras:
-        context[paragraph.id] = _clean_paragraph_text(paragraph)
-        # If the paragraph is a conversation and it matches semantically, we assume we
-        # have matched with the question, therefore try to include the answer to the
-        # context by pulling the next few messages of the conversation field
-        rid, field_type, field_id, mident = paragraph.id.split("/")[:4]
-        if field_type == "c" and paragraph.score_type in (
-            SCORE_TYPE.VECTOR,
-            SCORE_TYPE.BOTH,
-        ):
-            expanded_msgs = await get_expanded_conversation_messages(
-                kb=kb, rid=rid, field_id=field_id, mident=mident
-            )
-            for msg in expanded_msgs:
-                text = msg.content.text.strip()
-                pid = f"{rid}/{field_type}/{field_id}/{msg.ident}/0-{len(msg.content.text) + 1}"
-                context[pid] = text
-async def get_field_extracted_text(field: Field) -> Optional[tuple[Field, str]]:
-    extracted_text_pb = await field.get_extracted_text(force=True)
-    if extracted_text_pb is None:
-        return None
-    return field, extracted_text_pb.text
-async def get_resource_field_extracted_text(
-    kb_obj: KnowledgeBoxORM,
-    resource_uuid,
-    field_id: str,
-) -> Optional[tuple[Field, str]]:
-    resource = await kb_obj.get(resource_uuid)
-    if resource is None:
-        return None
-    try:
-        field_type, field_key = field_id.strip("/").split("/")
-    except ValueError:
-        logger.error(f"Invalid field id: {field_id}. Skipping getting extracted text.")
-        return None
-    field = await resource.get_field(field_key, KB_REVERSE[field_type], load=False)
-    if field is None:
-        return None
-    result = await get_field_extracted_text(field)
-    if result is None:
-        return None
-    _, extracted_text = result
-    return field, extracted_text
-async def get_resource_extracted_texts(
-    kbid: str,
-    resource_uuid: str,
-) -> list[tuple[Field, str]]:
-    txn = await get_read_only_transaction()
-    storage = await get_storage()
-    kb = KnowledgeBoxORM(txn, storage, kbid)
-    resource = ResourceORM(
-        txn=txn,
-        storage=storage,
-        kb=kb,
-        uuid=resource_uuid,
-    )
-    # Schedule the extraction of the text of each field in the resource
-    runner = ConcurrentRunner(max_tasks=MAX_RESOURCE_FIELD_TASKS)
-    for field_type, field_key in await resource.get_fields(force=True):
-        field = await resource.get_field(field_key, field_type, load=False)
-        runner.schedule(get_field_extracted_text(field))
-    # Wait for the results
-    results = await runner.wait()
-    return [result for result in results if result is not None]
+    async with get_driver().transaction(read_only=True) as txn:
+        storage = await get_storage()
+        kb = KnowledgeBoxORM(txn, storage, kbid)
+        for paragraph in ordered_paragraphs:
+            context[paragraph.id] = _clean_paragraph_text(paragraph)
+            # If the paragraph is a conversation and it matches semantically, we assume we
+            # have matched with the question, therefore try to include the answer to the
+            # context by pulling the next few messages of the conversation field
+            rid, field_type, field_id, mident = paragraph.id.split("/")[:4]
+            if field_type == "c" and paragraph.score_type in (
+                SCORE_TYPE.VECTOR,
+                SCORE_TYPE.BOTH,
+            ):
+                expanded_msgs = await get_expanded_conversation_messages(
+                    kb=kb, rid=rid, field_id=field_id, mident=mident
+                )
+                for msg in expanded_msgs:
+                    text = msg.content.text.strip()
+                    pid = f"{rid}/{field_type}/{field_id}/{msg.ident}/0-{len(msg.content.text) + 1}"
+                    context[pid] = text
 async def full_resource_prompt_context(
     context: CappedPromptContext,
     kbid: str,
-    results: KnowledgeboxFindResults,
-    number_of_full_resources: Optional[int] = None,
+    ordered_paragraphs: list[FindParagraph],
+    resource: Optional[str],
+    strategy: FullResourceStrategy,
 ) -> None:
     """
     Algorithm steps:
         - Collect the list of resources in the results (in order of relevance).
         - For each resource, collect the extracted text from all its fields and craft the context.
-    """
-    # Collect the list of resources in the results (in order of relevance).
-    ordered_paras = get_ordered_paragraphs(results)
-    ordered_resources = []
-    for paragraph in ordered_paras:
-        resource_uuid = paragraph.id.split("/")[0]
-        if resource_uuid not in ordered_resources:
-            ordered_resources.append(resource_uuid)
+    Arguments:
+        context: The context to be updated.
+        kbid: The knowledge box id.
+        ordered_paragraphs: The results of the retrieval (find) operation.
+        resource: The resource to be included in the context. This is used only when chatting with a specific resource with no retrieval.
+        strategy: strategy instance containing, for example, the number of full resources to include in the context.
+    """  # noqa: E501
+    if resource is not None:
+        # The user has specified a resource to be included in the context.
+        ordered_resources = [resource]
+    else:
+        # Collect the list of resources in the results (in order of relevance).
+        ordered_resources = []
+        for paragraph in ordered_paragraphs:
+            resource_uuid = parse_text_block_id(paragraph.id).rid
+            if resource_uuid not in ordered_resources:
+                skip = False
+                if strategy.apply_to is not None:
+                    # decide whether the resource should be extended or not
+                    for label in strategy.apply_to.exclude:
+                        skip = skip or (label in (paragraph.labels or []))
+                if not skip:
+                    ordered_resources.append(resource_uuid)
     # For each resource, collect the extracted text from all its fields.
-    resource_extracted_texts = await run_concurrently(
+    resources_extracted_texts = await run_concurrently(
         [
-            get_resource_extracted_texts(kbid, resource_uuid)
-            for resource_uuid in ordered_resources[:number_of_full_resources]
+            hydrate_resource_text(kbid, resource_uuid, max_concurrent_tasks=MAX_RESOURCE_FIELD_TASKS)
+            for resource_uuid in ordered_resources[: strategy.count]
         ],
         max_concurrent=MAX_RESOURCE_TASKS,
     )
-    for extracted_texts in resource_extracted_texts:
-        if extracted_texts is None:
+    added_fields = set()
+    for resource_extracted_texts in resources_extracted_texts:
+        if resource_extracted_texts is None:
             continue
-        for field, extracted_text in extracted_texts:
+        for field, extracted_text in resource_extracted_texts:
+            # First off, remove the text block ids from paragraphs that belong to
+            # the same field, as otherwise the context will be duplicated.
+            for tb_id in context.text_block_ids():
+                if tb_id.startswith(field.full()):
+                    del context[tb_id]
             # Add the extracted text of each field to the context.
-            context[field.resource_unique_id] = extracted_text
+            context[field.full()] = extracted_text
+            added_fields.add(field.full())
+    if strategy.include_remaining_text_blocks:
+        for paragraph in ordered_paragraphs:
+            pid = cast(ParagraphId, parse_text_block_id(paragraph.id))
+            if pid.field_id.full() not in added_fields:
+                context[paragraph.id] = _clean_paragraph_text(paragraph)
-async def composed_prompt_context(
+async def extend_prompt_context_with_metadata(
     context: CappedPromptContext,
     kbid: str,
-    results: KnowledgeboxFindResults,
-    extend_with_fields: list[str],
+    strategy: MetadataExtensionStrategy,
+) -> None:
+    text_block_ids: list[TextBlockId] = []
+    for text_block_id in context.text_block_ids():
+        try:
+            text_block_ids.append(parse_text_block_id(text_block_id))
+        except ValueError:  # pragma: no cover
+            # Some text block ids are not paragraphs nor fields, so they are skipped
+            # (e.g. USER_CONTEXT_0, when the user provides extra context)
+            continue
+    if len(text_block_ids) == 0:  # pragma: no cover
+        return
+    if MetadataExtensionType.ORIGIN in strategy.types:
+        await extend_prompt_context_with_origin_metadata(context, kbid, text_block_ids)
+    if MetadataExtensionType.CLASSIFICATION_LABELS in strategy.types:
+        await extend_prompt_context_with_classification_labels(context, kbid, text_block_ids)
+    if MetadataExtensionType.NERS in strategy.types:
+        await extend_prompt_context_with_ner(context, kbid, text_block_ids)
+    if MetadataExtensionType.EXTRA_METADATA in strategy.types:
+        await extend_prompt_context_with_extra_metadata(context, kbid, text_block_ids)
+def parse_text_block_id(text_block_id: str) -> TextBlockId:
+    try:
+        # Typically, the text block id is a paragraph id
+        return ParagraphId.from_string(text_block_id)
+    except ValueError:
+        # When we're doing `full_resource` or `hierarchy` strategies, the text block id
+        # is a field id
+        return FieldId.from_string(text_block_id)
+async def extend_prompt_context_with_origin_metadata(context, kbid, text_block_ids: list[TextBlockId]):
+    async def _get_origin(kbid: str, rid: str) -> tuple[str, Optional[Origin]]:
+        origin = None
+        resource = await cache.get_resource(kbid, rid)
+        if resource is not None:
+            pb_origin = await resource.get_origin()
+            if pb_origin is not None:
+                origin = from_proto.origin(pb_origin)
+        return rid, origin
+    rids = {tb_id.rid for tb_id in text_block_ids}
+    origins = await run_concurrently([_get_origin(kbid, rid) for rid in rids])
+    rid_to_origin = {rid: origin for rid, origin in origins if origin is not None}
+    for tb_id in text_block_ids:
+        origin = rid_to_origin.get(tb_id.rid)
+        if origin is not None and tb_id.full() in context.output:
+            context[tb_id.full()] += f"\n\nDOCUMENT METADATA AT ORIGIN:\n{to_yaml(origin)}"
+async def extend_prompt_context_with_classification_labels(
+    context, kbid, text_block_ids: list[TextBlockId]
+):
+    async def _get_labels(kbid: str, _id: TextBlockId) -> tuple[TextBlockId, list[tuple[str, str]]]:
+        fid = _id if isinstance(_id, FieldId) else _id.field_id
+        labels = set()
+        resource = await cache.get_resource(kbid, fid.rid)
+        if resource is not None:
+            pb_basic = await resource.get_basic()
+            if pb_basic is not None:
+                # Add the classification labels of the resource
+                for classif in pb_basic.usermetadata.classifications:
+                    labels.add((classif.labelset, classif.label))
+                # Add the classifications labels of the field
+                for fc in pb_basic.computedmetadata.field_classifications:
+                    if fc.field.field == fid.key and fc.field.field_type == fid.pb_type:
+                        for classif in fc.classifications:
+                            if classif.cancelled_by_user:  # pragma: no cover
+                                continue
+                            labels.add((classif.labelset, classif.label))
+        return _id, list(labels)
+    classif_labels = await run_concurrently([_get_labels(kbid, tb_id) for tb_id in text_block_ids])
+    tb_id_to_labels = {tb_id: labels for tb_id, labels in classif_labels if len(labels) > 0}
+    for tb_id in text_block_ids:
+        labels = tb_id_to_labels.get(tb_id)
+        if labels is not None and tb_id.full() in context.output:
+            labels_text = "DOCUMENT CLASSIFICATION LABELS:"
+            for labelset, label in labels:
+                labels_text += f"\n - {label} ({labelset})"
+            context[tb_id.full()] += "\n\n" + labels_text
+async def extend_prompt_context_with_ner(context, kbid, text_block_ids: list[TextBlockId]):
+    async def _get_ners(kbid: str, _id: TextBlockId) -> tuple[TextBlockId, dict[str, set[str]]]:
+        fid = _id if isinstance(_id, FieldId) else _id.field_id
+        ners: dict[str, set[str]] = {}
+        resource = await cache.get_resource(kbid, fid.rid)
+        if resource is not None:
+            field = await resource.get_field(fid.key, fid.pb_type, load=False)
+            fcm = await field.get_field_metadata()
+            if fcm is not None:
+                # Data Augmentation + Processor entities
+                for (
+                    data_aumgentation_task_id,
+                    entities_wrapper,
+                ) in fcm.metadata.entities.items():
+                    for entity in entities_wrapper.entities:
+                        ners.setdefault(entity.label, set()).add(entity.text)
+                # Legacy processor entities
+                # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+                for token, family in fcm.metadata.ner.items():
+                    ners.setdefault(family, set()).add(token)
+        return _id, ners
+    nerss = await run_concurrently([_get_ners(kbid, tb_id) for tb_id in text_block_ids])
+    tb_id_to_ners = {tb_id: ners for tb_id, ners in nerss if len(ners) > 0}
+    for tb_id in text_block_ids:
+        ners = tb_id_to_ners.get(tb_id)
+        if ners is not None and tb_id.full() in context.output:
+            ners_text = "DOCUMENT NAMED ENTITIES (NERs):"
+            for family, tokens in ners.items():
+                ners_text += f"\n - {family}:"
+                for token in sorted(list(tokens)):
+                    ners_text += f"\n   - {token}"
+            context[tb_id.full()] += "\n\n" + ners_text
+async def extend_prompt_context_with_extra_metadata(context, kbid, text_block_ids: list[TextBlockId]):
+    async def _get_extra(kbid: str, rid: str) -> tuple[str, Optional[Extra]]:
+        extra = None
+        resource = await cache.get_resource(kbid, rid)
+        if resource is not None:
+            pb_extra = await resource.get_extra()
+            if pb_extra is not None:
+                extra = from_proto.extra(pb_extra)
+        return rid, extra
+    rids = {tb_id.rid for tb_id in text_block_ids}
+    extras = await run_concurrently([_get_extra(kbid, rid) for rid in rids])
+    rid_to_extra = {rid: extra for rid, extra in extras if extra is not None}
+    for tb_id in text_block_ids:
+        extra = rid_to_extra.get(tb_id.rid)
+        if extra is not None and tb_id.full() in context.output:
+            context[tb_id.full()] += f"\n\nDOCUMENT EXTRA METADATA:\n{to_yaml(extra)}"
+def to_yaml(obj: BaseModel) -> str:
+    return yaml.dump(
+        obj.model_dump(exclude_none=True, exclude_defaults=True, exclude_unset=True),
+        default_flow_style=False,
+        indent=2,
+        sort_keys=True,
+    )
+async def field_extension_prompt_context(
+    context: CappedPromptContext,
+    kbid: str,
+    ordered_paragraphs: list[FindParagraph],
+    strategy: FieldExtensionStrategy,
 ) -> None:
     """
     Algorithm steps:
@@ -296,35 +468,402 @@ async def composed_prompt_context(
         - Add the extracted text of each field to the beginning of the context.
         - Add the extracted text of each paragraph to the end of the context.
     """
-    # Collect the list of resources in the results (in order of relevance).
-    ordered_paras = get_ordered_paragraphs(results)
     ordered_resources = []
-    for paragraph in ordered_paras:
-        resource_uuid = paragraph.id.split("/")[0]
+    for paragraph in ordered_paragraphs:
+        resource_uuid = ParagraphId.from_string(paragraph.id).rid
         if resource_uuid not in ordered_resources:
             ordered_resources.append(resource_uuid)
     # Fetch the extracted texts of the specified fields for each resource
-    txn = await get_read_only_transaction()
-    kb_obj = KnowledgeBoxORM(txn, await get_storage(), kbid)
-    tasks = [
-        get_resource_field_extracted_text(kb_obj, resource_uuid, field_id)
-        for resource_uuid in ordered_resources
-        for field_id in extend_with_fields
-    ]
+    extend_fields = strategy.fields
+    extend_field_ids = []
+    for resource_uuid in ordered_resources:
+        for field_id in extend_fields:
+            try:
+                fid = FieldId.from_string(f"{resource_uuid}/{field_id.strip('/')}")
+                extend_field_ids.append(fid)
+            except ValueError:  # pragma: no cover
+                # Invalid field id, skipping
+                continue
+    tasks = [hydrate_field_text(kbid, fid) for fid in extend_field_ids]
     field_extracted_texts = await run_concurrently(tasks)
     for result in field_extracted_texts:
-        if result is None:
+        if result is None:  # pragma: no cover
             continue
-        # Add the extracted text of each field to the beginning of the context.
         field, extracted_text = result
-        context[field.resource_unique_id] = extracted_text
+        # First off, remove the text block ids from paragraphs that belong to
+        # the same field, as otherwise the context will be duplicated.
+        for tb_id in context.text_block_ids():
+            if tb_id.startswith(field.full()):
+                del context[tb_id]
+        # Add the extracted text of each field to the beginning of the context.
+        context[field.full()] = extracted_text
     # Add the extracted text of each paragraph to the end of the context.
-    for paragraph in ordered_paras:
+    for paragraph in ordered_paragraphs:
+        context[paragraph.id] = _clean_paragraph_text(paragraph)
+async def get_paragraph_text_with_neighbours(
+    kbid: str,
+    pid: ParagraphId,
+    field_paragraphs: list[ParagraphId],
+    before: int = 0,
+    after: int = 0,
+) -> tuple[ParagraphId, str]:
+    """
+    This function will get the paragraph text of the paragraph with the neighbouring paragraphs included.
+    Parameters:
+        kbid: The knowledge box id.
+        pid: The matching paragraph id.
+        field_paragraphs: The list of paragraph ids of the field.
+        before: The number of paragraphs to include before the matching paragraph.
+        after: The number of paragraphs to include after the matching paragraph.
+    """
+    async def _get_paragraph_text(
+        kbid: str,
+        pid: ParagraphId,
+    ) -> tuple[ParagraphId, str]:
+        return pid, await get_paragraph_text(
+            kbid=kbid,
+            paragraph_id=pid,
+            log_on_missing_field=True,
+        )
+    ops = []
+    try:
+        for paragraph_index in get_neighbouring_paragraph_indexes(
+            field_paragraphs=field_paragraphs,
+            matching_paragraph=pid,
+            before=before,
+            after=after,
+        ):
+            neighbour_pid = field_paragraphs[paragraph_index]
+            ops.append(
+                asyncio.create_task(
+                    _get_paragraph_text(
+                        kbid=kbid,
+                        pid=neighbour_pid,
+                    )
+                )
+            )
+    except ParagraphIdNotFoundInExtractedMetadata:
+        logger.warning(
+            "Could not find matching paragraph in extracted metadata. This is odd and needs to be investigated.",
+            extra={
+                "kbid": kbid,
+                "matching_paragraph": pid.full(),
+                "field_paragraphs": [p.full() for p in field_paragraphs],
+            },
+        )
+        # If we could not find the matching paragraph in the extracted metadata, we can't retrieve
+        # the neighbouring paragraphs and we simply fetch the text of the matching paragraph.
+        ops.append(
+            asyncio.create_task(
+                _get_paragraph_text(
+                    kbid=kbid,
+                    pid=pid,
+                )
+            )
+        )
+    results = []
+    if len(ops) > 0:
+        results = await asyncio.gather(*ops)
+    # Sort the results by the paragraph start
+    results.sort(key=lambda x: x[0].paragraph_start)
+    paragraph_texts = []
+    for _, text in results:
+        if text != "":
+            paragraph_texts.append(text)
+    return pid, "\n\n".join(paragraph_texts)
+async def get_field_paragraphs_list(
+    kbid: str,
+    field: FieldId,
+    paragraphs: list[ParagraphId],
+) -> None:
+    """
+    Modifies the paragraphs list by adding the paragraph ids of the field, sorted by position.
+    """
+    resource = await cache.get_resource(kbid, field.rid)
+    if resource is None:  # pragma: no cover
+        return
+    field_obj: Field = await resource.get_field(key=field.key, type=field.pb_type, load=False)
+    field_metadata: Optional[resources_pb2.FieldComputedMetadata] = await field_obj.get_field_metadata(
+        force=True
+    )
+    if field_metadata is None:  # pragma: no cover
+        return
+    for paragraph in field_metadata.metadata.paragraphs:
+        paragraphs.append(
+            ParagraphId(
+                field_id=field,
+                paragraph_start=paragraph.start,
+                paragraph_end=paragraph.end,
+            )
+        )
+async def neighbouring_paragraphs_prompt_context(
+    context: CappedPromptContext,
+    kbid: str,
+    ordered_text_blocks: list[FindParagraph],
+    strategy: NeighbouringParagraphsStrategy,
+) -> None:
+    """
+    This function will get the paragraph texts and then craft a context with the neighbouring paragraphs of the
+    paragraphs in the ordered_text_blocks list. The number of paragraphs to include before and after each paragraph
+    is taken from strategy.before and strategy.after.
+    """
+    # First, get the sorted list of paragraphs for each matching field
+    # so we can know the indexes of the neighbouring paragraphs
+    unique_fields = {
+        ParagraphId.from_string(text_block.id).field_id for text_block in ordered_text_blocks
+    }
+    paragraphs_by_field: dict[FieldId, list[ParagraphId]] = {}
+    field_ops = []
+    for field_id in unique_fields:
+        plist = paragraphs_by_field.setdefault(field_id, [])
+        field_ops.append(
+            asyncio.create_task(get_field_paragraphs_list(kbid=kbid, field=field_id, paragraphs=plist))
+        )
+    if field_ops:
+        await asyncio.gather(*field_ops)
+    # Now, get the paragraph texts with the neighbouring paragraphs
+    paragraph_ops = []
+    for text_block in ordered_text_blocks:
+        pid = ParagraphId.from_string(text_block.id)
+        paragraph_ops.append(
+            asyncio.create_task(
+                get_paragraph_text_with_neighbours(
+                    kbid=kbid,
+                    pid=pid,
+                    before=strategy.before,
+                    after=strategy.after,
+                    field_paragraphs=paragraphs_by_field.get(pid.field_id, []),
+                )
+            )
+        )
+    if not paragraph_ops:  # pragma: no cover
+        return
+    results: list[tuple[ParagraphId, str]] = await asyncio.gather(*paragraph_ops)
+    # Add the paragraph texts to the context
+    for pid, text in results:
+        if text != "":
+            context[pid.full()] = text
+async def conversation_prompt_context(
+    context: CappedPromptContext,
+    kbid: str,
+    ordered_paragraphs: list[FindParagraph],
+    conversational_strategy: ConversationalStrategy,
+    visual_llm: bool,
+):
+    analyzed_fields: List[str] = []
+    async with get_driver().transaction(read_only=True) as txn:
+        storage = await get_storage()
+        kb = KnowledgeBoxORM(txn, storage, kbid)
+        for paragraph in ordered_paragraphs:
+            context[paragraph.id] = _clean_paragraph_text(paragraph)
+            # If the paragraph is a conversation and it matches semantically, we assume we
+            # have matched with the question, therefore try to include the answer to the
+            # context by pulling the next few messages of the conversation field
+            rid, field_type, field_id, mident = paragraph.id.split("/")[:4]
+            if field_type == "c" and paragraph.score_type in (
+                SCORE_TYPE.VECTOR,
+                SCORE_TYPE.BOTH,
+                SCORE_TYPE.BM25,
+            ):
+                field_unique_id = "-".join([rid, field_type, field_id])
+                if field_unique_id in analyzed_fields:
+                    continue
+                resource = await kb.get(rid)
+                if resource is None:  # pragma: no cover
+                    continue
+                field_obj: Conversation = await resource.get_field(
+                    field_id, FIELD_TYPE_STR_TO_PB["c"], load=True
+                )  # type: ignore
+                cmetadata = await field_obj.get_metadata()
+                attachments: List[resources_pb2.FieldRef] = []
+                if conversational_strategy.full:
+                    extracted_text = await field_obj.get_extracted_text()
+                    for current_page in range(1, cmetadata.pages + 1):
+                        conv = await field_obj.db_get_value(current_page)
+                        for message in conv.messages:
+                            ident = message.ident
+                            if extracted_text is not None:
+                                text = extracted_text.split_text.get(ident, message.content.text.strip())
+                            else:
+                                text = message.content.text.strip()
+                            pid = f"{rid}/{field_type}/{field_id}/{ident}/0-{len(text) + 1}"
+                            context[pid] = text
+                            attachments.extend(message.content.attachments_fields)
+                else:
+                    # Add first message
+                    extracted_text = await field_obj.get_extracted_text()
+                    first_page = await field_obj.db_get_value()
+                    if len(first_page.messages) > 0:
+                        message = first_page.messages[0]
+                        ident = message.ident
+                        if extracted_text is not None:
+                            text = extracted_text.split_text.get(ident, message.content.text.strip())
+                        else:
+                            text = message.content.text.strip()
+                        pid = f"{rid}/{field_type}/{field_id}/{ident}/0-{len(text) + 1}"
+                        context[pid] = text
+                        attachments.extend(message.content.attachments_fields)
+                    messages: Deque[resources_pb2.Message] = deque(
+                        maxlen=conversational_strategy.max_messages
+                    )
+                    pending = -1
+                    for page in range(1, cmetadata.pages + 1):
+                        # Collect the messages within the window requested by the user around the matching paragraph
+                        conv = await field_obj.db_get_value(page)
+                        for message in conv.messages:
+                            messages.append(message)
+                            if pending > 0:
+                                pending -= 1
+                            if message.ident == mident:
+                                pending = (conversational_strategy.max_messages - 1) // 2
+                            if pending == 0:
+                                break
+                        if pending == 0:
+                            break
+                    for message in messages:
+                        text = message.content.text.strip()
+                        pid = f"{rid}/{field_type}/{field_id}/{message.ident}/0-{len(message.content.text) + 1}"
+                        context[pid] = text
+                        attachments.extend(message.content.attachments_fields)
+                if conversational_strategy.attachments_text:
+                    # Add the extracted text of each attachment to the context
+                    for attachment in attachments:
+                        field: File = await resource.get_field(
+                            attachment.field_id, attachment.field_type, load=True
+                        )  # type: ignore
+                        extracted_text = await field.get_extracted_text()
+                        if extracted_text is not None:
+                            pid = f"{rid}/{field_type}/{attachment.field_id}/0-{len(extracted_text.text) + 1}"
+                            context[pid] = f"Attachment {attachment.field_id}: {extracted_text.text}\n\n"
+                if conversational_strategy.attachments_images and visual_llm:
+                    for attachment in attachments:
+                        file_field: File = await resource.get_field(
+                            attachment.field_id, attachment.field_type, load=True
+                        )  # type: ignore
+                        image = await get_file_thumbnail_image(file_field)
+                        if image is not None:
+                            pid = f"{rid}/f/{attachment.field_id}/0-0"
+                            context.images[pid] = image
+                analyzed_fields.append(field_unique_id)
+async def hierarchy_prompt_context(
+    context: CappedPromptContext,
+    kbid: str,
+    ordered_paragraphs: list[FindParagraph],
+    strategy: HierarchyResourceStrategy,
+) -> None:
+    """
+    This function will get the paragraph texts (possibly with extra characters, if extra_characters > 0) and then
+    craft a context with all paragraphs of the same resource grouped together. Moreover, on each group of paragraphs,
+    it includes the resource title and summary so that the LLM can have a better understanding of the context.
+    """
+    paragraphs_extra_characters = max(strategy.count, 0)
+    # Make a copy of the ordered paragraphs to avoid modifying the original list, which is returned
+    # in the response to the user
+    ordered_paragraphs_copy = copy.deepcopy(ordered_paragraphs)
+    resources: Dict[str, ExtraCharsParagraph] = {}
+    # Iterate paragraphs to get extended text
+    for paragraph in ordered_paragraphs_copy:
+        paragraph_id = ParagraphId.from_string(paragraph.id)
+        extended_paragraph_text = paragraph.text
+        if paragraphs_extra_characters > 0:
+            extended_paragraph_text = await get_paragraph_text(
+                kbid=kbid,
+                paragraph_id=paragraph_id,
+                log_on_missing_field=True,
+            )
+        rid = paragraph_id.rid
+        if rid not in resources:
+            # Get the title and the summary of the resource
+            title_text = await get_paragraph_text(
+                kbid=kbid,
+                paragraph_id=ParagraphId(
+                    field_id=FieldId(
+                        rid=rid,
+                        type="a",
+                        key="title",
+                    ),
+                    paragraph_start=0,
+                    paragraph_end=500,
+                ),
+                log_on_missing_field=False,
+            )
+            summary_text = await get_paragraph_text(
+                kbid=kbid,
+                paragraph_id=ParagraphId(
+                    field_id=FieldId(
+                        rid=rid,
+                        type="a",
+                        key="summary",
+                    ),
+                    paragraph_start=0,
+                    paragraph_end=1000,
+                ),
+                log_on_missing_field=False,
+            )
+            resources[rid] = ExtraCharsParagraph(
+                title=title_text,
+                summary=summary_text,
+                paragraphs=[(paragraph, extended_paragraph_text)],
+            )
+        else:
+            resources[rid].paragraphs.append((paragraph, extended_paragraph_text))
+    # Modify the first paragraph of each resource to include the title and summary of the resource, as well as the
+    # extended paragraph text of all the paragraphs in the resource.
+    for values in resources.values():
+        title_text = values.title
+        summary_text = values.summary
+        first_paragraph = None
+        text_with_hierarchy = ""
+        for paragraph, extended_paragraph_text in values.paragraphs:
+            if first_paragraph is None:
+                first_paragraph = paragraph
+            text_with_hierarchy += "\n EXTRACTED BLOCK: \n " + extended_paragraph_text + " \n\n "
+            # All paragraphs of the resource are cleared except the first one, which will be the
+            # one containing the whole hierarchy information
+            paragraph.text = ""
+        if first_paragraph is not None:
+            # The first paragraph is the only one holding the hierarchy information
+            first_paragraph.text = f"DOCUMENT: {title_text} \n SUMMARY: {summary_text} \n RESOURCE CONTENT: {text_with_hierarchy}"
+    # Now that the paragraphs have been modified, we can add them to the context
+    for paragraph in ordered_paragraphs_copy:
+        if paragraph.text == "":
+            # Skip paragraphs that were cleared in the hierarchy expansion
+            continue
         context[paragraph.id] = _clean_paragraph_text(paragraph)
+    return
 class PromptContextBuilder:
@@ -335,19 +874,21 @@ class PromptContextBuilder:
     def __init__(
         self,
         kbid: str,
-        find_results: KnowledgeboxFindResults,
+        ordered_paragraphs: list[FindParagraph],
+        resource: Optional[str] = None,
         user_context: Optional[list[str]] = None,
         strategies: Optional[Sequence[RagStrategy]] = None,
         image_strategies: Optional[Sequence[ImageRagStrategy]] = None,
-        max_context_size: Optional[int] = None,
+        max_context_characters: Optional[int] = None,
         visual_llm: bool = False,
     ):
         self.kbid = kbid
-        self.find_results = find_results
+        self.ordered_paragraphs = ordered_paragraphs
+        self.resource = resource
         self.user_context = user_context
         self.strategies = strategies
         self.image_strategies = image_strategies
-        self.max_context_size = max_context_size
+        self.max_context_characters = max_context_characters
         self.visual_llm = visual_llm
     def prepend_user_context(self, context: CappedPromptContext):
@@ -359,95 +900,178 @@ class PromptContextBuilder:
     async def build(
         self,
     ) -> tuple[PromptContext, PromptContextOrder, PromptContextImages]:
-        ccontext = CappedPromptContext(max_size=self.max_context_size)
+        ccontext = CappedPromptContext(max_size=self.max_context_characters)
         self.prepend_user_context(ccontext)
         await self._build_context(ccontext)
         if self.visual_llm:
             await self._build_context_images(ccontext)
         context = ccontext.output
         context_images = ccontext.images
-        context_order = {
-            text_block_id: order for order, text_block_id in enumerate(context.keys())
-        }
+        context_order = {text_block_id: order for order, text_block_id in enumerate(context.keys())}
         return context, context_order, context_images
     async def _build_context_images(self, context: CappedPromptContext) -> None:
-        ordered_paras = get_ordered_paragraphs(self.find_results)
-        flatten_strategies = []
-        page_count = 5
-        gather_pages = False
-        gather_tables = False
-        if self.image_strategies is not None:
-            for strategy in self.image_strategies:
-                flatten_strategies.append(strategy.name)
-                if strategy.name == ImageRagStrategyName.PAGE_IMAGE:
-                    gather_pages = True
-                    if strategy.count is not None:  # type: ignore
-                        page_count = strategy.count  # type: ignore
-                if strategy.name == ImageRagStrategyName.TABLES:
-                    gather_tables = True
-        for paragraph in ordered_paras:
-            if paragraph.page_with_visual and paragraph.position:
-                if (
-                    gather_pages
-                    and paragraph.position.page_number
-                    and len(context.images) < page_count
-                ):
-                    field = "/".join(paragraph.id.split("/")[:3])
-                    page = paragraph.position.page_number
-                    page_id = f"{field}/{page}"
-                    if page_id not in context.images:
-                        context.images[page_id] = await get_page_image(
-                            self.kbid, paragraph.id, page
-                        )
+        if self.image_strategies is None or len(self.image_strategies) == 0:
+            # Nothing to do
+            return
+        page_image_strategy: Optional[PageImageStrategy] = None
+        max_page_images = 5
+        table_image_strategy: Optional[TableImageStrategy] = None
+        paragraph_image_strategy: Optional[ParagraphImageStrategy] = None
+        for strategy in self.image_strategies:
+            if strategy.name == ImageRagStrategyName.PAGE_IMAGE:
+                if page_image_strategy is None:
+                    page_image_strategy = cast(PageImageStrategy, strategy)
+                    if page_image_strategy.count is not None:
+                        max_page_images = page_image_strategy.count
+            elif strategy.name == ImageRagStrategyName.TABLES:
+                if table_image_strategy is None:
+                    table_image_strategy = cast(TableImageStrategy, strategy)
+            elif strategy.name == ImageRagStrategyName.PARAGRAPH_IMAGE:
+                if paragraph_image_strategy is None:
+                    paragraph_image_strategy = cast(ParagraphImageStrategy, strategy)
+            else:  # pragma: no cover
+                logger.warning(
+                    "Unknown image strategy",
+                    extra={"strategy": strategy.name, "kbid": self.kbid},
+                )
+        page_images_added = 0
+        for paragraph in self.ordered_paragraphs:
+            pid = ParagraphId.from_string(paragraph.id)
+            paragraph_page_number = get_paragraph_page_number(paragraph)
             if (
-                gather_tables
-                and paragraph.is_a_table
-                and paragraph.reference
-                and paragraph.reference != ""
+                page_image_strategy is not None
+                and page_images_added < max_page_images
+                and paragraph_page_number is not None
             ):
-                image = paragraph.reference
-                context.images[paragraph.id] = await get_paragraph_image(
-                    self.kbid, paragraph.id, image
-                )
+                # page_image_id: rid/f/myfield/0
+                page_image_id = "/".join([pid.field_id.full(), str(paragraph_page_number)])
+                if page_image_id not in context.images:
+                    image = await get_page_image(self.kbid, pid, paragraph_page_number)
+                    if image is not None:
+                        context.images[page_image_id] = image
+                        page_images_added += 1
+                    else:
+                        logger.warning(
+                            f"Could not retrieve image for paragraph from storage",
+                            extra={
+                                "kbid": self.kbid,
+                                "paragraph": pid.full(),
+                                "page_number": paragraph_page_number,
+                            },
+                        )
+            add_table = table_image_strategy is not None and paragraph.is_a_table
+            add_paragraph = paragraph_image_strategy is not None and not paragraph.is_a_table
+            if (add_table or add_paragraph) and (
+                paragraph.reference is not None and paragraph.reference != ""
+            ):
+                pimage = await get_paragraph_image(self.kbid, pid, paragraph.reference)
+                if pimage is not None:
+                    context.images[paragraph.id] = pimage
+                else:
+                    logger.warning(
+                        f"Could not retrieve image for paragraph from storage",
+                        extra={
+                            "kbid": self.kbid,
+                            "paragraph": pid.full(),
+                            "reference": paragraph.reference,
+                        },
+                    )
     async def _build_context(self, context: CappedPromptContext) -> None:
         if self.strategies is None or len(self.strategies) == 0:
-            await default_prompt_context(context, self.kbid, self.find_results)
+            # When no strategy is specified, use the default one
+            await default_prompt_context(context, self.kbid, self.ordered_paragraphs)
             return
-        number_of_full_resources = 0
-        distance = 0
-        extend_with_fields = []
+        else:
+            # Add the paragraphs to the context and then apply the strategies
+            for paragraph in self.ordered_paragraphs:
+                context[paragraph.id] = _clean_paragraph_text(paragraph)
+        full_resource: Optional[FullResourceStrategy] = None
+        hierarchy: Optional[HierarchyResourceStrategy] = None
+        neighbouring_paragraphs: Optional[NeighbouringParagraphsStrategy] = None
+        field_extension: Optional[FieldExtensionStrategy] = None
+        metadata_extension: Optional[MetadataExtensionStrategy] = None
+        conversational_strategy: Optional[ConversationalStrategy] = None
         for strategy in self.strategies:
             if strategy.name == RagStrategyName.FIELD_EXTENSION:
-                extend_with_fields.extend(strategy.fields)  # type: ignore
+                field_extension = cast(FieldExtensionStrategy, strategy)
+            elif strategy.name == RagStrategyName.CONVERSATION:
+                conversational_strategy = cast(ConversationalStrategy, strategy)
             elif strategy.name == RagStrategyName.FULL_RESOURCE:
-                number_of_full_resources = strategy.count or self.find_results.total  # type: ignore
+                full_resource = cast(FullResourceStrategy, strategy)
+                if self.resource:  # pragma: no cover
+                    # When the retrieval is scoped to a specific resource
+                    # the full resource strategy only includes that resource
+                    full_resource.count = 1
             elif strategy.name == RagStrategyName.HIERARCHY:
-                distance = strategy.count  # type: ignore
+                hierarchy = cast(HierarchyResourceStrategy, strategy)
+            elif strategy.name == RagStrategyName.NEIGHBOURING_PARAGRAPHS:
+                neighbouring_paragraphs = cast(NeighbouringParagraphsStrategy, strategy)
+            elif strategy.name == RagStrategyName.METADATA_EXTENSION:
+                metadata_extension = cast(MetadataExtensionStrategy, strategy)
+            elif strategy.name != RagStrategyName.PREQUERIES:  # pragma: no cover
+                # Prequeries are not handled here
+                logger.warning(
+                    "Unknown rag strategy",
+                    extra={"strategy": strategy.name, "kbid": self.kbid},
+                )
-        if number_of_full_resources:
+        if full_resource:
+            # When full resource is enabled, only metadata extension is allowed.
             await full_resource_prompt_context(
-                context, self.kbid, self.find_results, number_of_full_resources
+                context,
+                self.kbid,
+                self.ordered_paragraphs,
+                self.resource,
+                full_resource,
             )
+            if metadata_extension:
+                await extend_prompt_context_with_metadata(context, self.kbid, metadata_extension)
             return
-        if distance > 0:
-            await get_extra_chars(self.kbid, self.find_results, distance)
-            await default_prompt_context(context, self.kbid, self.find_results)
-            return
+        if hierarchy:
+            await hierarchy_prompt_context(
+                context,
+                self.kbid,
+                self.ordered_paragraphs,
+                hierarchy,
+            )
+        if neighbouring_paragraphs:
+            await neighbouring_paragraphs_prompt_context(
+                context,
+                self.kbid,
+                self.ordered_paragraphs,
+                neighbouring_paragraphs,
+            )
+        if field_extension:
+            await field_extension_prompt_context(
+                context,
+                self.kbid,
+                self.ordered_paragraphs,
+                field_extension,
+            )
+        if conversational_strategy:
+            await conversation_prompt_context(
+                context,
+                self.kbid,
+                self.ordered_paragraphs,
+                conversational_strategy,
+                self.visual_llm,
+            )
+        if metadata_extension:
+            await extend_prompt_context_with_metadata(context, self.kbid, metadata_extension)
-        await composed_prompt_context(
-            context,
-            self.kbid,
-            self.find_results,
-            extend_with_fields=extend_with_fields,
-        )
-        return
+def get_paragraph_page_number(paragraph: FindParagraph) -> Optional[int]:
+    if not paragraph.page_with_visual:
+        return None
+    if paragraph.position is None:
+        return None
+    return paragraph.position.page_number
 @dataclass
@@ -457,67 +1081,6 @@ class ExtraCharsParagraph:
     paragraphs: List[Tuple[FindParagraph, str]]
-async def get_extra_chars(
-    kbid: str, find_results: KnowledgeboxFindResults, distance: int
-):
-    etcache = paragraphs.ExtractedTextCache()
-    resources: Dict[str, ExtraCharsParagraph] = {}
-    for paragraph in get_ordered_paragraphs(find_results):
-        rid, field_type, field = paragraph.id.split("/")[:3]
-        field_path = "/".join([rid, field_type, field])
-        position = paragraph.id.split("/")[-1]
-        start, end = position.split("-")
-        int_start = int(start)
-        int_end = int(end) + distance
-        new_text = await paragraphs.get_paragraph_text(
-            kbid=kbid,
-            rid=rid,
-            field=field_path,
-            start=int_start,
-            end=int_end,
-            extracted_text_cache=etcache,
-        )
-        if rid not in resources:
-            title_text = await paragraphs.get_paragraph_text(
-                kbid=kbid,
-                rid=rid,
-                field="/a/title",
-                start=0,
-                end=500,
-                extracted_text_cache=etcache,
-            )
-            summary_text = await paragraphs.get_paragraph_text(
-                kbid=kbid,
-                rid=rid,
-                field="/a/summary",
-                start=0,
-                end=1000,
-                extracted_text_cache=etcache,
-            )
-            resources[rid] = ExtraCharsParagraph(
-                title=title_text,
-                summary=summary_text,
-                paragraphs=[(paragraph, new_text)],
-            )
-        else:
-            resources[rid].paragraphs.append((paragraph, new_text))  # type: ignore
-    for key, values in resources.items():
-        title_text = values.title
-        summary_text = values.summary
-        first_paragraph = None
-        text = ""
-        for paragraph, text in values.paragraphs:
-            if first_paragraph is None:
-                first_paragraph = paragraph
-            text += "EXTRACTED BLOCK: \n " + text + " \n\n "
-            paragraph.text = ""
-        if first_paragraph is not None:
-            first_paragraph.text = f"DOCUMENT: {title_text} \n SUMMARY: {summary_text} \n RESOURCE CONTENT: {text}"
 def _clean_paragraph_text(paragraph: FindParagraph) -> str:
     text = paragraph.text.strip()
     # Do not send highlight marks on prompt context
@@ -525,17 +1088,23 @@ def _clean_paragraph_text(paragraph: FindParagraph) -> str:
     return text
-def get_ordered_paragraphs(results: KnowledgeboxFindResults) -> list[FindParagraph]:
+def get_neighbouring_paragraph_indexes(
+    field_paragraphs: list[ParagraphId],
+    matching_paragraph: ParagraphId,
+    before: int,
+    after: int,
+) -> list[int]:
     """
-    Returns the list of paragraphs in the results, ordered by relevance.
+    Returns the indexes of the neighbouring paragraphs to fetch (including the matching paragraph).
     """
-    return sorted(
-        [
-            paragraph
-            for resource in results.resources.values()
-            for field in resource.fields.values()
-            for paragraph in field.paragraphs.values()
-        ],
-        key=lambda paragraph: paragraph.order,
-        reverse=False,
-    )
+    assert before >= 0
+    assert after >= 0
+    try:
+        matching_index = field_paragraphs.index(matching_paragraph)
+    except ValueError:
+        raise ParagraphIdNotFoundInExtractedMetadata(
+            f"Matching paragraph {matching_paragraph.full()} not found in extracted metadata"
+        )
+    start_index = max(0, matching_index - before)
+    end_index = min(len(field_paragraphs), matching_index + after + 1)
+    return list(range(start_index, end_index))

nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl