nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/search/predict.py
CHANGED
@@ -19,31 +19,37 @@
|
|
19
19
|
#
|
20
20
|
import json
|
21
21
|
import os
|
22
|
+
import random
|
22
23
|
from enum import Enum
|
23
|
-
from typing import Any, AsyncIterator,
|
24
|
+
from typing import Any, AsyncIterator, Optional
|
24
25
|
from unittest.mock import AsyncMock, Mock
|
25
26
|
|
26
27
|
import aiohttp
|
27
28
|
import backoff
|
28
|
-
from
|
29
|
-
from pydantic import
|
29
|
+
from nuclia_models.predict.generative_responses import GenerativeChunk
|
30
|
+
from pydantic import ValidationError
|
30
31
|
|
31
|
-
from nucliadb.
|
32
|
+
from nucliadb.common import datamanagers
|
32
33
|
from nucliadb.search import logger
|
33
|
-
from
|
34
|
-
|
35
|
-
FeedbackRequest,
|
34
|
+
from nucliadb.tests.vectors import Q, Qm2023
|
35
|
+
from nucliadb_models.internal.predict import (
|
36
36
|
Ner,
|
37
37
|
QueryInfo,
|
38
|
-
|
38
|
+
RerankModel,
|
39
|
+
RerankResponse,
|
39
40
|
SentenceSearch,
|
41
|
+
TokenSearch,
|
42
|
+
)
|
43
|
+
from nucliadb_models.search import (
|
44
|
+
ChatModel,
|
45
|
+
RephraseModel,
|
40
46
|
SummarizedResource,
|
41
47
|
SummarizedResponse,
|
42
48
|
SummarizeModel,
|
43
|
-
TokenSearch,
|
44
49
|
)
|
50
|
+
from nucliadb_protos.utils_pb2 import RelationNode
|
45
51
|
from nucliadb_telemetry import errors, metrics
|
46
|
-
from nucliadb_utils import
|
52
|
+
from nucliadb_utils.const import Features
|
47
53
|
from nucliadb_utils.exceptions import LimitsExceededError
|
48
54
|
from nucliadb_utils.settings import nuclia_settings
|
49
55
|
from nucliadb_utils.utilities import Utility, has_feature, set_utility
|
@@ -73,13 +79,12 @@ class RephraseMissingContextError(Exception):
|
|
73
79
|
|
74
80
|
DUMMY_RELATION_NODE = [
|
75
81
|
RelationNode(value="Ferran", ntype=RelationNode.NodeType.ENTITY, subtype="PERSON"),
|
76
|
-
RelationNode(
|
77
|
-
value="Joan Antoni", ntype=RelationNode.NodeType.ENTITY, subtype="PERSON"
|
78
|
-
),
|
82
|
+
RelationNode(value="Joan Antoni", ntype=RelationNode.NodeType.ENTITY, subtype="PERSON"),
|
79
83
|
]
|
80
84
|
|
81
85
|
DUMMY_REPHRASE_QUERY = "This is a rephrased query"
|
82
86
|
DUMMY_LEARNING_ID = "00"
|
87
|
+
DUMMY_LEARNING_MODEL = "chatgpt"
|
83
88
|
|
84
89
|
|
85
90
|
PUBLIC_PREDICT = "/api/v1/predict"
|
@@ -92,8 +97,10 @@ SUMMARIZE = "/summarize"
|
|
92
97
|
CHAT = "/chat"
|
93
98
|
REPHRASE = "/rephrase"
|
94
99
|
FEEDBACK = "/feedback"
|
100
|
+
RERANK = "/rerank"
|
95
101
|
|
96
102
|
NUCLIA_LEARNING_ID_HEADER = "NUCLIA-LEARNING-ID"
|
103
|
+
NUCLIA_LEARNING_MODEL_HEADER = "NUCLIA-LEARNING-MODEL"
|
97
104
|
|
98
105
|
|
99
106
|
predict_observer = metrics.Observer(
|
@@ -123,41 +130,6 @@ class AnswerStatusCode(str, Enum):
|
|
123
130
|
}[self]
|
124
131
|
|
125
132
|
|
126
|
-
class TextGenerativeResponse(BaseModel):
|
127
|
-
type: Literal["text"] = "text"
|
128
|
-
text: str
|
129
|
-
|
130
|
-
|
131
|
-
class MetaGenerativeResponse(BaseModel):
|
132
|
-
type: Literal["meta"] = "meta"
|
133
|
-
input_tokens: int
|
134
|
-
output_tokens: int
|
135
|
-
timings: dict[str, float]
|
136
|
-
|
137
|
-
|
138
|
-
class CitationsGenerativeResponse(BaseModel):
|
139
|
-
type: Literal["citations"] = "citations"
|
140
|
-
citations: dict[str, Any]
|
141
|
-
|
142
|
-
|
143
|
-
class StatusGenerativeResponse(BaseModel):
|
144
|
-
type: Literal["status"] = "status"
|
145
|
-
code: str
|
146
|
-
details: Optional[str] = None
|
147
|
-
|
148
|
-
|
149
|
-
GenerativeResponse = Union[
|
150
|
-
TextGenerativeResponse,
|
151
|
-
MetaGenerativeResponse,
|
152
|
-
CitationsGenerativeResponse,
|
153
|
-
StatusGenerativeResponse,
|
154
|
-
]
|
155
|
-
|
156
|
-
|
157
|
-
class GenerativeChunk(BaseModel):
|
158
|
-
chunk: GenerativeResponse = Field(..., discriminator="type")
|
159
|
-
|
160
|
-
|
161
133
|
async def start_predict_engine():
|
162
134
|
if nuclia_settings.dummy_predict:
|
163
135
|
predict_util = DummyPredictEngine()
|
@@ -180,9 +152,7 @@ def convert_relations(data: dict[str, list[dict[str, str]]]) -> list[RelationNod
|
|
180
152
|
for token in data["tokens"]:
|
181
153
|
text = token["text"]
|
182
154
|
klass = token["ner"]
|
183
|
-
result.append(
|
184
|
-
RelationNode(value=text, ntype=RelationNode.NodeType.ENTITY, subtype=klass)
|
185
|
-
)
|
155
|
+
result.append(RelationNode(value=text, ntype=RelationNode.NodeType.ENTITY, subtype=klass))
|
186
156
|
return result
|
187
157
|
|
188
158
|
|
@@ -215,9 +185,7 @@ class PredictEngine:
|
|
215
185
|
await self.session.close()
|
216
186
|
|
217
187
|
def check_nua_key_is_configured_for_onprem(self):
|
218
|
-
if self.onprem and (
|
219
|
-
self.nuclia_service_account is None and self.local_predict is False
|
220
|
-
):
|
188
|
+
if self.onprem and (self.nuclia_service_account is None and self.local_predict is False):
|
221
189
|
raise NUAKeyMissingError()
|
222
190
|
|
223
191
|
def get_predict_url(self, endpoint: str, kbid: str) -> str:
|
@@ -229,7 +197,7 @@ class PredictEngine:
|
|
229
197
|
# /api/v1/predict/rephrase/{kbid}
|
230
198
|
return f"{self.public_url}{PUBLIC_PREDICT}{endpoint}/{kbid}"
|
231
199
|
else:
|
232
|
-
if has_feature(
|
200
|
+
if has_feature(Features.VERSIONED_PRIVATE_PREDICT):
|
233
201
|
return f"{self.cluster_url}{VERSIONED_PRIVATE_PREDICT}{endpoint}"
|
234
202
|
else:
|
235
203
|
return f"{self.cluster_url}{PRIVATE_PREDICT}{endpoint}"
|
@@ -243,16 +211,13 @@ class PredictEngine:
|
|
243
211
|
else:
|
244
212
|
return {"X-STF-KBID": kbid}
|
245
213
|
|
246
|
-
async def check_response(
|
247
|
-
self, resp: aiohttp.ClientResponse, expected_status: int = 200
|
248
|
-
) -> None:
|
214
|
+
async def check_response(self, resp: aiohttp.ClientResponse, expected_status: int = 200) -> None:
|
249
215
|
if resp.status == expected_status:
|
250
216
|
return
|
251
217
|
|
252
218
|
if resp.status == 402:
|
253
219
|
data = await resp.json()
|
254
220
|
raise LimitsExceededError(402, data["detail"])
|
255
|
-
|
256
221
|
try:
|
257
222
|
data = await resp.json()
|
258
223
|
try:
|
@@ -264,7 +229,10 @@ class PredictEngine:
|
|
264
229
|
aiohttp.client_exceptions.ContentTypeError,
|
265
230
|
):
|
266
231
|
detail = await resp.text()
|
267
|
-
|
232
|
+
if str(resp.status).startswith("5"):
|
233
|
+
logger.error(f"Predict API error at {resp.url}: {detail}")
|
234
|
+
else:
|
235
|
+
logger.info(f"Predict API error at {resp.url}: {detail}")
|
268
236
|
raise ProxiedPredictAPIError(status=resp.status, detail=detail)
|
269
237
|
|
270
238
|
@backoff.on_exception(
|
@@ -277,36 +245,6 @@ class PredictEngine:
|
|
277
245
|
func = getattr(self.session, method.lower())
|
278
246
|
return await func(**request_args)
|
279
247
|
|
280
|
-
@predict_observer.wrap({"type": "feedback"})
|
281
|
-
async def send_feedback(
|
282
|
-
self,
|
283
|
-
kbid: str,
|
284
|
-
item: FeedbackRequest,
|
285
|
-
x_nucliadb_user: str,
|
286
|
-
x_ndb_client: str,
|
287
|
-
x_forwarded_for: str,
|
288
|
-
):
|
289
|
-
try:
|
290
|
-
self.check_nua_key_is_configured_for_onprem()
|
291
|
-
except NUAKeyMissingError:
|
292
|
-
logger.warning(
|
293
|
-
"Nuclia Service account is not defined so could not send the feedback"
|
294
|
-
)
|
295
|
-
return
|
296
|
-
|
297
|
-
data = item.dict()
|
298
|
-
data["user_id"] = x_nucliadb_user
|
299
|
-
data["client"] = x_ndb_client
|
300
|
-
data["forwarded"] = x_forwarded_for
|
301
|
-
|
302
|
-
resp = await self.make_request(
|
303
|
-
"POST",
|
304
|
-
url=self.get_predict_url(FEEDBACK, kbid),
|
305
|
-
json=data,
|
306
|
-
headers=self.get_predict_headers(kbid),
|
307
|
-
)
|
308
|
-
await self.check_response(resp, expected_status=204)
|
309
|
-
|
310
248
|
@predict_observer.wrap({"type": "rephrase"})
|
311
249
|
async def rephrase_query(self, kbid: str, item: RephraseModel) -> str:
|
312
250
|
try:
|
@@ -319,38 +257,16 @@ class PredictEngine:
|
|
319
257
|
resp = await self.make_request(
|
320
258
|
"POST",
|
321
259
|
url=self.get_predict_url(REPHRASE, kbid),
|
322
|
-
json=item.
|
260
|
+
json=item.model_dump(),
|
323
261
|
headers=self.get_predict_headers(kbid),
|
324
262
|
)
|
325
263
|
await self.check_response(resp, expected_status=200)
|
326
264
|
return await _parse_rephrase_response(resp)
|
327
265
|
|
328
|
-
@predict_observer.wrap({"type": "chat"})
|
329
|
-
async def chat_query(
|
330
|
-
self, kbid: str, item: ChatModel
|
331
|
-
) -> tuple[str, AsyncIterator[bytes]]:
|
332
|
-
try:
|
333
|
-
self.check_nua_key_is_configured_for_onprem()
|
334
|
-
except NUAKeyMissingError:
|
335
|
-
error = "Nuclia Service account is not defined so the chat operation could not be performed"
|
336
|
-
logger.warning(error)
|
337
|
-
raise SendToPredictError(error)
|
338
|
-
|
339
|
-
resp = await self.make_request(
|
340
|
-
"POST",
|
341
|
-
url=self.get_predict_url(CHAT, kbid),
|
342
|
-
json=item.dict(),
|
343
|
-
headers=self.get_predict_headers(kbid),
|
344
|
-
timeout=None,
|
345
|
-
)
|
346
|
-
await self.check_response(resp, expected_status=200)
|
347
|
-
ident = resp.headers.get(NUCLIA_LEARNING_ID_HEADER)
|
348
|
-
return ident, get_answer_generator(resp)
|
349
|
-
|
350
266
|
@predict_observer.wrap({"type": "chat_ndjson"})
|
351
267
|
async def chat_query_ndjson(
|
352
268
|
self, kbid: str, item: ChatModel
|
353
|
-
) -> tuple[str, AsyncIterator[GenerativeChunk]]:
|
269
|
+
) -> tuple[str, str, AsyncIterator[GenerativeChunk]]:
|
354
270
|
"""
|
355
271
|
Chat query using the new stream format
|
356
272
|
Format specs: https://github.com/ndjson/ndjson-spec
|
@@ -369,35 +285,55 @@ class PredictEngine:
|
|
369
285
|
resp = await self.make_request(
|
370
286
|
"POST",
|
371
287
|
url=self.get_predict_url(CHAT, kbid),
|
372
|
-
json=item.
|
288
|
+
json=item.model_dump(),
|
373
289
|
headers=headers,
|
374
290
|
timeout=None,
|
375
291
|
)
|
376
292
|
await self.check_response(resp, expected_status=200)
|
377
293
|
ident = resp.headers.get(NUCLIA_LEARNING_ID_HEADER)
|
378
|
-
|
294
|
+
model = resp.headers.get(NUCLIA_LEARNING_MODEL_HEADER)
|
295
|
+
return ident, model, get_chat_ndjson_generator(resp)
|
379
296
|
|
380
297
|
@predict_observer.wrap({"type": "query"})
|
381
298
|
async def query(
|
382
299
|
self,
|
383
300
|
kbid: str,
|
384
301
|
sentence: str,
|
302
|
+
semantic_model: Optional[str] = None,
|
385
303
|
generative_model: Optional[str] = None,
|
386
|
-
rephrase:
|
304
|
+
rephrase: bool = False,
|
305
|
+
rephrase_prompt: Optional[str] = None,
|
387
306
|
) -> QueryInfo:
|
307
|
+
"""
|
308
|
+
Query endpoint: returns information to be used by NucliaDB at retrieval time, for instance:
|
309
|
+
- The embeddings
|
310
|
+
- The entities
|
311
|
+
- The stop words
|
312
|
+
- The semantic threshold
|
313
|
+
- etc.
|
314
|
+
|
315
|
+
:param kbid: KnowledgeBox ID
|
316
|
+
:param sentence: The query sentence
|
317
|
+
:param semantic_model: The semantic model to use to generate the embeddings
|
318
|
+
:param generative_model: The generative model that will be used to generate the answer
|
319
|
+
:param rephrase: If the query should be rephrased before calculating the embeddings for a better retrieval
|
320
|
+
:param rephrase_prompt: Custom prompt to use for rephrasing
|
321
|
+
"""
|
388
322
|
try:
|
389
323
|
self.check_nua_key_is_configured_for_onprem()
|
390
324
|
except NUAKeyMissingError:
|
391
|
-
error =
|
392
|
-
"Nuclia Service account is not defined so could not ask query endpoint"
|
393
|
-
)
|
325
|
+
error = "Nuclia Service account is not defined so could not ask query endpoint"
|
394
326
|
logger.warning(error)
|
395
327
|
raise SendToPredictError(error)
|
396
328
|
|
397
|
-
params = {
|
329
|
+
params: dict[str, Any] = {
|
398
330
|
"text": sentence,
|
399
331
|
"rephrase": str(rephrase),
|
400
332
|
}
|
333
|
+
if rephrase_prompt is not None:
|
334
|
+
params["rephrase_prompt"] = rephrase_prompt
|
335
|
+
if semantic_model is not None:
|
336
|
+
params["semantic_models"] = [semantic_model]
|
401
337
|
if generative_model is not None:
|
402
338
|
params["generative_model"] = generative_model
|
403
339
|
|
@@ -442,27 +378,41 @@ class PredictEngine:
|
|
442
378
|
resp = await self.make_request(
|
443
379
|
"POST",
|
444
380
|
url=self.get_predict_url(SUMMARIZE, kbid),
|
445
|
-
json=item.
|
381
|
+
json=item.model_dump(),
|
446
382
|
headers=self.get_predict_headers(kbid),
|
447
383
|
timeout=None,
|
448
384
|
)
|
449
385
|
await self.check_response(resp, expected_status=200)
|
450
386
|
data = await resp.json()
|
451
|
-
return SummarizedResponse.
|
387
|
+
return SummarizedResponse.model_validate(data)
|
388
|
+
|
389
|
+
@predict_observer.wrap({"type": "rerank"})
|
390
|
+
async def rerank(self, kbid: str, item: RerankModel) -> RerankResponse:
|
391
|
+
try:
|
392
|
+
self.check_nua_key_is_configured_for_onprem()
|
393
|
+
except NUAKeyMissingError:
|
394
|
+
error = "Nuclia Service account is not defined. Rerank operation could not be performed"
|
395
|
+
logger.warning(error)
|
396
|
+
raise SendToPredictError(error)
|
397
|
+
resp = await self.make_request(
|
398
|
+
"POST",
|
399
|
+
url=self.get_predict_url(RERANK, kbid),
|
400
|
+
json=item.model_dump(),
|
401
|
+
headers=self.get_predict_headers(kbid),
|
402
|
+
)
|
403
|
+
await self.check_response(resp, expected_status=200)
|
404
|
+
data = await resp.json()
|
405
|
+
return RerankResponse.model_validate(data)
|
452
406
|
|
453
407
|
|
454
408
|
class DummyPredictEngine(PredictEngine):
|
409
|
+
default_semantic_threshold = 0.7
|
410
|
+
|
455
411
|
def __init__(self):
|
456
412
|
self.onprem = True
|
457
413
|
self.cluster_url = "http://localhost:8000"
|
458
414
|
self.public_url = "http://localhost:8000"
|
459
415
|
self.calls = []
|
460
|
-
self.generated_answer = [
|
461
|
-
b"valid ",
|
462
|
-
b"answer ",
|
463
|
-
b" to",
|
464
|
-
AnswerStatusCode.SUCCESS.encode(),
|
465
|
-
]
|
466
416
|
self.ndjson_answer = [
|
467
417
|
b'{"chunk": {"type": "text", "text": "valid "}}\n',
|
468
418
|
b'{"chunk": {"type": "text", "text": "answer "}}\n',
|
@@ -486,79 +436,72 @@ class DummyPredictEngine(PredictEngine):
|
|
486
436
|
response.headers = {NUCLIA_LEARNING_ID_HEADER: DUMMY_LEARNING_ID}
|
487
437
|
return response
|
488
438
|
|
489
|
-
async def send_feedback(
|
490
|
-
self,
|
491
|
-
kbid: str,
|
492
|
-
item: FeedbackRequest,
|
493
|
-
x_nucliadb_user: str,
|
494
|
-
x_ndb_client: str,
|
495
|
-
x_forwarded_for: str,
|
496
|
-
):
|
497
|
-
self.calls.append(("send_feedback", item))
|
498
|
-
return
|
499
|
-
|
500
439
|
async def rephrase_query(self, kbid: str, item: RephraseModel) -> str:
|
501
440
|
self.calls.append(("rephrase_query", item))
|
502
441
|
return DUMMY_REPHRASE_QUERY
|
503
442
|
|
504
|
-
async def chat_query(
|
505
|
-
self, kbid: str, item: ChatModel
|
506
|
-
) -> tuple[str, AsyncIterator[bytes]]:
|
507
|
-
self.calls.append(("chat_query", item))
|
508
|
-
|
509
|
-
async def generate():
|
510
|
-
for i in self.generated_answer:
|
511
|
-
yield i
|
512
|
-
|
513
|
-
return (DUMMY_LEARNING_ID, generate())
|
514
|
-
|
515
443
|
async def chat_query_ndjson(
|
516
444
|
self, kbid: str, item: ChatModel
|
517
|
-
) -> tuple[str, AsyncIterator[
|
445
|
+
) -> tuple[str, str, AsyncIterator[GenerativeChunk]]:
|
518
446
|
self.calls.append(("chat_query_ndjson", item))
|
519
447
|
|
520
448
|
async def generate():
|
521
449
|
for item in self.ndjson_answer:
|
522
|
-
yield GenerativeChunk.
|
450
|
+
yield GenerativeChunk.model_validate_json(item)
|
523
451
|
|
524
|
-
return (DUMMY_LEARNING_ID, generate())
|
452
|
+
return (DUMMY_LEARNING_ID, DUMMY_LEARNING_MODEL, generate())
|
525
453
|
|
526
454
|
async def query(
|
527
455
|
self,
|
528
456
|
kbid: str,
|
529
457
|
sentence: str,
|
458
|
+
semantic_model: Optional[str] = None,
|
530
459
|
generative_model: Optional[str] = None,
|
531
|
-
rephrase:
|
460
|
+
rephrase: bool = False,
|
461
|
+
rephrase_prompt: Optional[str] = None,
|
532
462
|
) -> QueryInfo:
|
533
463
|
self.calls.append(("query", sentence))
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
return QueryInfo(
|
538
|
-
language="en",
|
539
|
-
stop_words=[],
|
540
|
-
semantic_threshold=0.7,
|
541
|
-
visual_llm=True,
|
542
|
-
max_context=self.max_context,
|
543
|
-
entities=TokenSearch(
|
544
|
-
tokens=[Ner(text="text", ner="PERSON", start=0, end=2)], time=0.0
|
545
|
-
),
|
546
|
-
sentence=SentenceSearch(data=Qm2023, time=0.0),
|
547
|
-
query=sentence,
|
548
|
-
)
|
464
|
+
|
465
|
+
if os.environ.get("TEST_SENTENCE_ENCODER") == "multilingual-2023-02-21": # pragma: no cover
|
466
|
+
base_vector = Qm2023
|
549
467
|
else:
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
468
|
+
base_vector = Q
|
469
|
+
|
470
|
+
# populate data with existing vectorsets
|
471
|
+
async with datamanagers.with_ro_transaction() as txn:
|
472
|
+
semantic_thresholds = {}
|
473
|
+
vectors = {}
|
474
|
+
timings = {}
|
475
|
+
async for vectorset_id, config in datamanagers.vectorsets.iter(txn, kbid=kbid):
|
476
|
+
semantic_thresholds[vectorset_id] = self.default_semantic_threshold
|
477
|
+
vectorset_dimension = config.vectorset_index_config.vector_dimension
|
478
|
+
if vectorset_dimension > len(base_vector):
|
479
|
+
padding = vectorset_dimension - len(base_vector)
|
480
|
+
vectors[vectorset_id] = base_vector + [random.random()] * padding
|
481
|
+
else:
|
482
|
+
vectors[vectorset_id] = base_vector[:vectorset_dimension]
|
483
|
+
|
484
|
+
timings[vectorset_id] = 0.010
|
485
|
+
|
486
|
+
# and fake data with the passed one too
|
487
|
+
model = semantic_model or "<PREDICT-DEFAULT-SEMANTIC-MODEL>"
|
488
|
+
semantic_thresholds[model] = self.default_semantic_threshold
|
489
|
+
vectors[model] = base_vector
|
490
|
+
timings[model] = 0.0
|
491
|
+
|
492
|
+
return QueryInfo(
|
493
|
+
language="en",
|
494
|
+
stop_words=[],
|
495
|
+
semantic_thresholds=semantic_thresholds,
|
496
|
+
visual_llm=True,
|
497
|
+
max_context=self.max_context,
|
498
|
+
entities=TokenSearch(tokens=[Ner(text="text", ner="PERSON", start=0, end=2)], time=0.0),
|
499
|
+
sentence=SentenceSearch(
|
500
|
+
vectors=vectors,
|
501
|
+
timings=timings,
|
502
|
+
),
|
503
|
+
query=sentence,
|
504
|
+
)
|
562
505
|
|
563
506
|
async def detect_entities(self, kbid: str, sentence: str) -> list[RelationNode]:
|
564
507
|
self.calls.append(("detect_entities", sentence))
|
@@ -577,9 +520,16 @@ class DummyPredictEngine(PredictEngine):
|
|
577
520
|
rsummary = []
|
578
521
|
for field_id, field_text in item.resources[rid].fields.items():
|
579
522
|
rsummary.append(f"{field_id}: {field_text}")
|
580
|
-
response.resources[rid] = SummarizedResource(
|
581
|
-
|
582
|
-
|
523
|
+
response.resources[rid] = SummarizedResource(summary="\n\n".join(rsummary), tokens=10)
|
524
|
+
return response
|
525
|
+
|
526
|
+
async def rerank(self, kbid: str, item: RerankModel) -> RerankResponse:
|
527
|
+
self.calls.append(("rerank", (kbid, item)))
|
528
|
+
# as we don't have information about the retrieval scores, return a
|
529
|
+
# random score given by the dict iteration
|
530
|
+
response = RerankResponse(
|
531
|
+
context_scores={paragraph_id: i for i, paragraph_id in enumerate(item.context.keys())}
|
532
|
+
)
|
583
533
|
return response
|
584
534
|
|
585
535
|
|
@@ -604,11 +554,10 @@ def get_answer_generator(response: aiohttp.ClientResponse):
|
|
604
554
|
def get_chat_ndjson_generator(
|
605
555
|
response: aiohttp.ClientResponse,
|
606
556
|
) -> AsyncIterator[GenerativeChunk]:
|
607
|
-
|
608
557
|
async def _parse_generative_chunks(gen):
|
609
558
|
async for chunk in gen:
|
610
559
|
try:
|
611
|
-
yield GenerativeChunk.
|
560
|
+
yield GenerativeChunk.model_validate_json(chunk.strip())
|
612
561
|
except ValidationError as ex:
|
613
562
|
errors.capture_exception(ex)
|
614
563
|
logger.error(f"Invalid chunk received: {chunk}")
|
nucliadb/search/py.typed
ADDED
File without changes
|