nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,270 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
import logging
|
22
|
+
from abc import ABC, abstractmethod, abstractproperty
|
23
|
+
from dataclasses import dataclass
|
24
|
+
from typing import Optional
|
25
|
+
|
26
|
+
from nucliadb.search.predict import ProxiedPredictAPIError, SendToPredictError
|
27
|
+
from nucliadb.search.search.query_parser import models as parser_models
|
28
|
+
from nucliadb.search.utilities import get_predict
|
29
|
+
from nucliadb_models.internal.predict import RerankModel
|
30
|
+
from nucliadb_models.search import (
|
31
|
+
SCORE_TYPE,
|
32
|
+
KnowledgeboxFindResults,
|
33
|
+
)
|
34
|
+
from nucliadb_telemetry.metrics import Observer
|
35
|
+
|
36
|
+
logger = logging.getLogger(__name__)
|
37
|
+
|
38
|
+
reranker_observer = Observer("reranker", labels={"type": ""})
|
39
|
+
|
40
|
+
|
41
|
+
@dataclass
|
42
|
+
class RerankableItem:
|
43
|
+
id: str
|
44
|
+
score: float
|
45
|
+
score_type: SCORE_TYPE
|
46
|
+
content: str
|
47
|
+
|
48
|
+
|
49
|
+
@dataclass
|
50
|
+
class RankedItem:
|
51
|
+
id: str
|
52
|
+
score: float
|
53
|
+
score_type: SCORE_TYPE
|
54
|
+
|
55
|
+
|
56
|
+
@dataclass
|
57
|
+
class RerankingOptions:
|
58
|
+
kbid: str
|
59
|
+
|
60
|
+
# Query used to retrieve the results to be reranked. Smart rerankers will use it
|
61
|
+
query: str
|
62
|
+
|
63
|
+
|
64
|
+
class Reranker(ABC):
|
65
|
+
@abstractproperty
|
66
|
+
def window(self) -> Optional[int]:
|
67
|
+
"""Number of elements the reranker requests. `None` means no specific
|
68
|
+
window is enforced."""
|
69
|
+
...
|
70
|
+
|
71
|
+
@property
|
72
|
+
def needs_extra_results(self) -> bool:
|
73
|
+
return self.window is not None
|
74
|
+
|
75
|
+
async def rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
|
76
|
+
"""Given a query and a set of resources, rerank elements and return the
|
77
|
+
list of reranked items sorted by decreasing score. The list will contain
|
78
|
+
at most `window` elements.
|
79
|
+
|
80
|
+
"""
|
81
|
+
# Enforce reranker window and drop the rest
|
82
|
+
# XXX: other search engines allow a mix of reranked and not reranked
|
83
|
+
# results, there's no technical reason we can't do it
|
84
|
+
items = items[: self.window]
|
85
|
+
reranked = await self._rerank(items, options)
|
86
|
+
return reranked
|
87
|
+
|
88
|
+
@abstractmethod
|
89
|
+
async def _rerank(
|
90
|
+
self, items: list[RerankableItem], options: RerankingOptions
|
91
|
+
) -> list[RankedItem]: ...
|
92
|
+
|
93
|
+
|
94
|
+
class NoopReranker(Reranker):
|
95
|
+
"""No-operation reranker. Given a list of items to rerank, it does nothing
|
96
|
+
with them and returns the items in the same order. It can be used to not alter
|
97
|
+
the previous ordering.
|
98
|
+
|
99
|
+
"""
|
100
|
+
|
101
|
+
@property
|
102
|
+
def window(self) -> Optional[int]:
|
103
|
+
return None
|
104
|
+
|
105
|
+
@reranker_observer.wrap({"type": "noop"})
|
106
|
+
async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
|
107
|
+
return [
|
108
|
+
RankedItem(
|
109
|
+
id=item.id,
|
110
|
+
score=item.score,
|
111
|
+
score_type=item.score_type,
|
112
|
+
)
|
113
|
+
for item in items
|
114
|
+
]
|
115
|
+
|
116
|
+
|
117
|
+
class PredictReranker(Reranker):
|
118
|
+
"""Rerank using a reranking model.
|
119
|
+
|
120
|
+
It uses Predict API to rerank elements using a model trained for this purpose.
|
121
|
+
|
122
|
+
"""
|
123
|
+
|
124
|
+
def __init__(self, window: int):
|
125
|
+
self._window = window
|
126
|
+
|
127
|
+
@property
|
128
|
+
def window(self) -> int:
|
129
|
+
return self._window
|
130
|
+
|
131
|
+
@reranker_observer.wrap({"type": "predict"})
|
132
|
+
async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
|
133
|
+
if len(items) == 0:
|
134
|
+
return []
|
135
|
+
|
136
|
+
predict = get_predict()
|
137
|
+
|
138
|
+
# Conversion to format expected by predict. At the same time,
|
139
|
+
# deduplicates paragraphs found in different indices
|
140
|
+
context = {item.id: item.content for item in items}
|
141
|
+
request = RerankModel(
|
142
|
+
question=options.query,
|
143
|
+
user_id="", # TODO
|
144
|
+
context=context,
|
145
|
+
)
|
146
|
+
try:
|
147
|
+
response = await predict.rerank(options.kbid, request)
|
148
|
+
except (SendToPredictError, ProxiedPredictAPIError):
|
149
|
+
# predict failed, we can't rerank
|
150
|
+
reranked = [
|
151
|
+
RankedItem(
|
152
|
+
id=item.id,
|
153
|
+
score=item.score,
|
154
|
+
score_type=item.score_type,
|
155
|
+
)
|
156
|
+
for item in items
|
157
|
+
]
|
158
|
+
else:
|
159
|
+
reranked = [
|
160
|
+
RankedItem(
|
161
|
+
id=id,
|
162
|
+
score=score,
|
163
|
+
score_type=SCORE_TYPE.RERANKER,
|
164
|
+
)
|
165
|
+
for id, score in response.context_scores.items()
|
166
|
+
]
|
167
|
+
sort_by_score(reranked)
|
168
|
+
best = reranked
|
169
|
+
return best
|
170
|
+
|
171
|
+
|
172
|
+
class MultiMatchBoosterReranker(Reranker):
|
173
|
+
"""This reranker gives more value to items that come from different indices"""
|
174
|
+
|
175
|
+
@property
|
176
|
+
def window(self) -> Optional[int]:
|
177
|
+
return None
|
178
|
+
|
179
|
+
@reranker_observer.wrap({"type": "multi_match_booster"})
|
180
|
+
async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
|
181
|
+
"""Given a list of rerankable items, boost matches that appear multiple
|
182
|
+
times. The returned list can be smaller than the initial, as repeated
|
183
|
+
matches are deduplicated.
|
184
|
+
"""
|
185
|
+
reranked_by_id = {}
|
186
|
+
for item in items:
|
187
|
+
if item.id not in reranked_by_id:
|
188
|
+
reranked_by_id[item.id] = RankedItem(
|
189
|
+
id=item.id,
|
190
|
+
score=item.score,
|
191
|
+
score_type=item.score_type,
|
192
|
+
)
|
193
|
+
else:
|
194
|
+
# it's a multiple match, boost the score
|
195
|
+
if reranked_by_id[item.id].score < item.score:
|
196
|
+
# previous implementation noted that we are using vector
|
197
|
+
# score x2 when we find a multiple match. However, this may
|
198
|
+
# not be true, as the same paragraph could come in any
|
199
|
+
# position in the rank fusioned result list
|
200
|
+
reranked_by_id[item.id].score = item.score * 2
|
201
|
+
|
202
|
+
reranked_by_id[item.id].score_type = SCORE_TYPE.BOTH
|
203
|
+
|
204
|
+
reranked = list(reranked_by_id.values())
|
205
|
+
sort_by_score(reranked)
|
206
|
+
return reranked
|
207
|
+
|
208
|
+
|
209
|
+
def get_reranker(reranker: parser_models.Reranker) -> Reranker:
|
210
|
+
algorithm: Reranker
|
211
|
+
|
212
|
+
if isinstance(reranker, parser_models.NoopReranker):
|
213
|
+
algorithm = NoopReranker()
|
214
|
+
|
215
|
+
elif isinstance(reranker, parser_models.MultiMatchBoosterReranker):
|
216
|
+
algorithm = MultiMatchBoosterReranker()
|
217
|
+
|
218
|
+
elif isinstance(reranker, parser_models.PredictReranker):
|
219
|
+
algorithm = PredictReranker(reranker.window)
|
220
|
+
|
221
|
+
else:
|
222
|
+
logger.warning(f"Unknown reranker requested: {reranker}. Using default instead")
|
223
|
+
algorithm = MultiMatchBoosterReranker()
|
224
|
+
|
225
|
+
return algorithm
|
226
|
+
|
227
|
+
|
228
|
+
def sort_by_score(items: list[RankedItem]):
|
229
|
+
"""Sort `items` in place by decreasing score"""
|
230
|
+
items.sort(key=lambda item: item.score, reverse=True)
|
231
|
+
|
232
|
+
|
233
|
+
def apply_reranking(results: KnowledgeboxFindResults, reranked: list[RankedItem]):
|
234
|
+
"""Given a list of reranked items, update the find results payload.
|
235
|
+
|
236
|
+
*ATTENTION* we assume `reranked` is an ordered list of decreasing relevance
|
237
|
+
and contains *only* the items relevant for this response. Any paragraph not
|
238
|
+
found in `reranked` will be removed from the `results`
|
239
|
+
|
240
|
+
"""
|
241
|
+
inverted_results = {}
|
242
|
+
for rid, resource in results.resources.items():
|
243
|
+
for field_id, field in resource.fields.items():
|
244
|
+
for paragraph_id, paragraph in field.paragraphs.items():
|
245
|
+
inverted_results[paragraph_id] = (
|
246
|
+
paragraph,
|
247
|
+
(field_id, field),
|
248
|
+
(rid, resource),
|
249
|
+
)
|
250
|
+
|
251
|
+
# update results and best matches according to new scores
|
252
|
+
results.best_matches.clear()
|
253
|
+
for order, item in enumerate(reranked):
|
254
|
+
paragraph_id = item.id
|
255
|
+
paragraph = inverted_results[paragraph_id][0]
|
256
|
+
paragraph.score = item.score
|
257
|
+
paragraph.score_type = item.score_type
|
258
|
+
paragraph.order = order
|
259
|
+
results.best_matches.append(paragraph_id)
|
260
|
+
|
261
|
+
# prune unneeded results (not appearing in `reranked`)
|
262
|
+
extra = set(inverted_results.keys()) - set(results.best_matches)
|
263
|
+
for paragraph_id in extra:
|
264
|
+
_, (field_id, field), (rid, resource) = inverted_results[paragraph_id]
|
265
|
+
field.paragraphs.pop(paragraph_id)
|
266
|
+
if len(field.paragraphs) == 0:
|
267
|
+
resource.fields.pop(field_id)
|
268
|
+
|
269
|
+
if len(resource.fields) == 0:
|
270
|
+
results.resources.pop(rid)
|
nucliadb/search/search/shards.py
CHANGED
@@ -19,20 +19,15 @@
|
|
19
19
|
#
|
20
20
|
import asyncio
|
21
21
|
|
22
|
+
from nucliadb.common.cluster.base import AbstractIndexNode
|
22
23
|
from nucliadb_protos.nodereader_pb2 import (
|
23
24
|
GetShardRequest,
|
24
|
-
ParagraphSearchRequest,
|
25
|
-
ParagraphSearchResponse,
|
26
|
-
RelationSearchRequest,
|
27
|
-
RelationSearchResponse,
|
28
25
|
SearchRequest,
|
29
26
|
SearchResponse,
|
30
27
|
SuggestRequest,
|
31
28
|
SuggestResponse,
|
32
29
|
)
|
33
30
|
from nucliadb_protos.noderesources_pb2 import Shard
|
34
|
-
|
35
|
-
from nucliadb.common.cluster.base import AbstractIndexNode
|
36
31
|
from nucliadb_telemetry import metrics
|
37
32
|
|
38
33
|
node_observer = metrics.Observer(
|
@@ -44,9 +39,7 @@ node_observer = metrics.Observer(
|
|
44
39
|
)
|
45
40
|
|
46
41
|
|
47
|
-
async def query_shard(
|
48
|
-
node: AbstractIndexNode, shard: str, query: SearchRequest
|
49
|
-
) -> SearchResponse:
|
42
|
+
async def query_shard(node: AbstractIndexNode, shard: str, query: SearchRequest) -> SearchResponse:
|
50
43
|
req = SearchRequest()
|
51
44
|
req.CopyFrom(query)
|
52
45
|
req.shard = shard
|
@@ -61,31 +54,9 @@ async def get_shard(node: AbstractIndexNode, shard_id: str) -> Shard:
|
|
61
54
|
return await node.reader.GetShard(req) # type: ignore
|
62
55
|
|
63
56
|
|
64
|
-
async def
|
65
|
-
node: AbstractIndexNode, shard: str, query: ParagraphSearchRequest
|
66
|
-
) -> ParagraphSearchResponse:
|
67
|
-
req = ParagraphSearchRequest()
|
68
|
-
req.CopyFrom(query)
|
69
|
-
req.id = shard
|
70
|
-
with node_observer({"type": "paragraph_search", "node_id": node.id}):
|
71
|
-
return await node.reader.ParagraphSearch(req) # type: ignore
|
72
|
-
|
73
|
-
|
74
|
-
async def suggest_shard(
|
75
|
-
node: AbstractIndexNode, shard: str, query: SuggestRequest
|
76
|
-
) -> SuggestResponse:
|
57
|
+
async def suggest_shard(node: AbstractIndexNode, shard: str, query: SuggestRequest) -> SuggestResponse:
|
77
58
|
req = SuggestRequest()
|
78
59
|
req.CopyFrom(query)
|
79
60
|
req.shard = shard
|
80
61
|
with node_observer({"type": "suggest", "node_id": node.id}):
|
81
62
|
return await node.reader.Suggest(req) # type: ignore
|
82
|
-
|
83
|
-
|
84
|
-
async def relations_shard(
|
85
|
-
node: AbstractIndexNode, shard: str, query: RelationSearchRequest
|
86
|
-
) -> RelationSearchResponse:
|
87
|
-
req = RelationSearchRequest()
|
88
|
-
req.CopyFrom(query)
|
89
|
-
req.shard_id = shard
|
90
|
-
with node_observer({"type": "relation_search", "node_id": node.id}):
|
91
|
-
return await node.reader.RelationSearch(req) # type: ignore
|
@@ -20,8 +20,6 @@
|
|
20
20
|
import asyncio
|
21
21
|
from typing import Optional
|
22
22
|
|
23
|
-
from nucliadb_protos.utils_pb2 import ExtractedText
|
24
|
-
|
25
23
|
from nucliadb.common import datamanagers
|
26
24
|
from nucliadb.common.maindb.utils import get_driver
|
27
25
|
from nucliadb.ingest.fields.base import Field
|
@@ -35,6 +33,7 @@ from nucliadb_models.search import (
|
|
35
33
|
SummarizeRequest,
|
36
34
|
SummarizeResourceModel,
|
37
35
|
)
|
36
|
+
from nucliadb_protos.utils_pb2 import ExtractedText
|
38
37
|
from nucliadb_utils.utilities import get_storage
|
39
38
|
|
40
39
|
ExtractedTexts = list[tuple[str, str, Optional[ExtractedText]]]
|
@@ -52,15 +51,11 @@ async def summarize(kbid: str, request: SummarizeRequest) -> SummarizedResponse:
|
|
52
51
|
predict_request.user_prompt = request.user_prompt
|
53
52
|
predict_request.summary_kind = request.summary_kind
|
54
53
|
|
55
|
-
for uuid_or_slug, field_id, extracted_text in await get_extracted_texts(
|
56
|
-
kbid, request.resources
|
57
|
-
):
|
54
|
+
for uuid_or_slug, field_id, extracted_text in await get_extracted_texts(kbid, request.resources):
|
58
55
|
if extracted_text is None:
|
59
56
|
continue
|
60
57
|
|
61
|
-
fields = predict_request.resources.setdefault(
|
62
|
-
uuid_or_slug, SummarizeResourceModel()
|
63
|
-
).fields
|
58
|
+
fields = predict_request.resources.setdefault(uuid_or_slug, SummarizeResourceModel()).fields
|
64
59
|
fields[field_id] = extracted_text.text
|
65
60
|
|
66
61
|
if len(predict_request.resources) == 0:
|
@@ -70,9 +65,7 @@ async def summarize(kbid: str, request: SummarizeRequest) -> SummarizedResponse:
|
|
70
65
|
return await predict.summarize(kbid, predict_request)
|
71
66
|
|
72
67
|
|
73
|
-
async def get_extracted_texts(
|
74
|
-
kbid: str, resource_uuids_or_slugs: list[str]
|
75
|
-
) -> ExtractedTexts:
|
68
|
+
async def get_extracted_texts(kbid: str, resource_uuids_or_slugs: list[str]) -> ExtractedTexts:
|
76
69
|
results: ExtractedTexts = []
|
77
70
|
|
78
71
|
driver = get_driver()
|
@@ -82,7 +75,7 @@ async def get_extracted_texts(
|
|
82
75
|
tasks = []
|
83
76
|
|
84
77
|
# Schedule getting extracted text for each field of each resource
|
85
|
-
async with driver.transaction() as txn:
|
78
|
+
async with driver.transaction(read_only=True) as txn:
|
86
79
|
if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
|
87
80
|
raise datamanagers.exceptions.KnowledgeBoxNotFound(kbid)
|
88
81
|
|
@@ -90,16 +83,12 @@ async def get_extracted_texts(
|
|
90
83
|
for uuid_or_slug in set(resource_uuids_or_slugs):
|
91
84
|
uuid = await get_resource_uuid(kb_orm, uuid_or_slug)
|
92
85
|
if uuid is None:
|
93
|
-
logger.warning(
|
94
|
-
f"Resource {uuid_or_slug} not found in KB", extra={"kbid": kbid}
|
95
|
-
)
|
86
|
+
logger.warning(f"Resource {uuid_or_slug} not found in KB", extra={"kbid": kbid})
|
96
87
|
continue
|
97
88
|
resource_orm = Resource(txn=txn, storage=storage, kb=kb_orm, uuid=uuid)
|
98
89
|
fields = await resource_orm.get_fields(force=True)
|
99
90
|
for _, field in fields.items():
|
100
|
-
task = asyncio.create_task(
|
101
|
-
get_extracted_text(uuid_or_slug, field, max_tasks)
|
102
|
-
)
|
91
|
+
task = asyncio.create_task(get_extracted_text(uuid_or_slug, field, max_tasks))
|
103
92
|
tasks.append(task)
|
104
93
|
|
105
94
|
if len(tasks) == 0:
|
nucliadb/search/search/utils.py
CHANGED
@@ -17,9 +17,26 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
+
import logging
|
20
21
|
from typing import Optional, Union
|
21
22
|
|
23
|
+
from pydantic import BaseModel
|
24
|
+
|
25
|
+
from nucliadb.common.datamanagers.atomic import kb
|
22
26
|
from nucliadb_models.search import BaseSearchRequest, MinScore
|
27
|
+
from nucliadb_utils import const
|
28
|
+
from nucliadb_utils.utilities import has_feature
|
29
|
+
|
30
|
+
logger = logging.getLogger(__name__)
|
31
|
+
|
32
|
+
|
33
|
+
async def filter_hidden_resources(kbid: str, show_hidden: bool) -> Optional[bool]:
|
34
|
+
kb_config = await kb.get_config(kbid=kbid)
|
35
|
+
hidden_enabled = kb_config and kb_config.hidden_resources_enabled
|
36
|
+
if hidden_enabled and not show_hidden:
|
37
|
+
return False
|
38
|
+
else:
|
39
|
+
return None # None = No filtering, show all resources
|
23
40
|
|
24
41
|
|
25
42
|
def is_empty_query(request: BaseSearchRequest) -> bool:
|
@@ -36,7 +53,7 @@ def is_exact_match_only_query(request: BaseSearchRequest) -> bool:
|
|
36
53
|
'foo "something" else' -> False
|
37
54
|
"""
|
38
55
|
query = request.query.strip()
|
39
|
-
return len(query) > 0 and query
|
56
|
+
return len(query) > 0 and query.startswith('"') and query.endswith('"')
|
40
57
|
|
41
58
|
|
42
59
|
def should_disable_vector_search(request: BaseSearchRequest) -> bool:
|
@@ -58,9 +75,7 @@ def min_score_from_query_params(
|
|
58
75
|
deprecated_min_score: Optional[float],
|
59
76
|
) -> MinScore:
|
60
77
|
# Keep backward compatibility with the deprecated min_score parameter
|
61
|
-
semantic =
|
62
|
-
deprecated_min_score if min_score_semantic is None else min_score_semantic
|
63
|
-
)
|
78
|
+
semantic = deprecated_min_score if min_score_semantic is None else min_score_semantic
|
64
79
|
return MinScore(bm25=min_score_bm25, semantic=semantic)
|
65
80
|
|
66
81
|
|
@@ -72,3 +87,11 @@ def min_score_from_payload(min_score: Optional[Union[float, MinScore]]) -> MinSc
|
|
72
87
|
elif isinstance(min_score, float):
|
73
88
|
return MinScore(bm25=0, semantic=min_score)
|
74
89
|
return min_score
|
90
|
+
|
91
|
+
|
92
|
+
def maybe_log_request_payload(kbid: str, endpoint: str, item: BaseModel):
|
93
|
+
if has_feature(const.Features.LOG_REQUEST_PAYLOADS, context={"kbid": kbid}, default=False):
|
94
|
+
logger.info(
|
95
|
+
"Request payload",
|
96
|
+
extra={"kbid": kbid, "endpoint": endpoint, "payload": item.model_dump_json()},
|
97
|
+
)
|
nucliadb/search/settings.py
CHANGED
@@ -18,6 +18,8 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
+
from typing import Optional
|
22
|
+
|
21
23
|
from pydantic import Field
|
22
24
|
|
23
25
|
from nucliadb.ingest.settings import DriverSettings
|
@@ -28,8 +30,20 @@ class Settings(DriverSettings):
|
|
28
30
|
slow_find_log_threshold: float = Field(
|
29
31
|
default=3.0,
|
30
32
|
title="Slow query log threshold",
|
31
|
-
description="The threshold in seconds for logging slow queries",
|
33
|
+
description="The threshold in seconds for logging slow find queries",
|
34
|
+
)
|
35
|
+
|
36
|
+
slow_node_query_log_threshold: float = Field(
|
37
|
+
default=2.0,
|
38
|
+
title="Slow node query log threshold",
|
39
|
+
description="The threshold in seconds for logging slow node queries",
|
40
|
+
)
|
41
|
+
prequeries_max_parallel: int = Field(
|
42
|
+
default=2,
|
43
|
+
title="Prequeries max parallel",
|
44
|
+
description="The maximum number of prequeries to run in parallel per /ask request",
|
32
45
|
)
|
46
|
+
nidx_address: Optional[str] = Field(default=None)
|
33
47
|
|
34
48
|
|
35
49
|
settings = Settings()
|
@@ -98,9 +98,7 @@ def get_temp_access_token(request: Request):
|
|
98
98
|
logger.warning(
|
99
99
|
"Dynamically generating JWK key. Please set JWK_KEY env variable to avoid this message."
|
100
100
|
)
|
101
|
-
settings.jwk_key = orjson.dumps(
|
102
|
-
jwk.JWK.generate(kty="oct", size=256, kid="dyn")
|
103
|
-
).decode("utf-8")
|
101
|
+
settings.jwk_key = orjson.dumps(jwk.JWK.generate(kty="oct", size=256, kid="dyn")).decode("utf-8")
|
104
102
|
jwetoken.add_recipient(jwk.JWK(**orjson.loads(settings.jwk_key)))
|
105
103
|
token = jwetoken.serialize(compact=True)
|
106
104
|
return JSONResponse({"token": token})
|
@@ -154,16 +152,14 @@ def introspect_endpoint(request: Request) -> StreamingResponse:
|
|
154
152
|
return StreamingResponse(
|
155
153
|
content=introspect.stream_tar(request.app),
|
156
154
|
status_code=200,
|
157
|
-
headers={
|
158
|
-
"Content-Disposition": f"attachment; filename=introspect_{introspect_id}.tar.gz"
|
159
|
-
},
|
155
|
+
headers={"Content-Disposition": f"attachment; filename=introspect_{introspect_id}.tar.gz"},
|
160
156
|
media_type="application/octet-stream",
|
161
157
|
)
|
162
158
|
|
163
159
|
|
164
160
|
@standalone_api_router.get("/pull/position")
|
165
161
|
async def pull_status(request: Request) -> JSONResponse:
|
166
|
-
async with datamanagers.
|
162
|
+
async with datamanagers.with_ro_transaction() as txn:
|
167
163
|
# standalone assumes 1 partition
|
168
164
|
current_offset = await datamanagers.processing.get_pull_offset(
|
169
165
|
txn, pull_type_id=processing.get_nua_api_id(), partition="1"
|
@@ -180,9 +176,7 @@ class UpdatePullPosition(pydantic.BaseModel):
|
|
180
176
|
|
181
177
|
|
182
178
|
@standalone_api_router.patch("/pull/position")
|
183
|
-
async def update_pull_position(
|
184
|
-
request: Request, item: UpdatePullPosition
|
185
|
-
) -> JSONResponse:
|
179
|
+
async def update_pull_position(request: Request, item: UpdatePullPosition) -> JSONResponse:
|
186
180
|
async with datamanagers.with_transaction() as txn:
|
187
181
|
# standalone assumes 1 partition
|
188
182
|
await datamanagers.processing.set_pull_offset(
|
nucliadb/standalone/app.py
CHANGED
@@ -20,7 +20,6 @@
|
|
20
20
|
import logging
|
21
21
|
import os
|
22
22
|
|
23
|
-
import nucliadb_admin_assets # type: ignore
|
24
23
|
from fastapi import FastAPI
|
25
24
|
from fastapi.responses import RedirectResponse
|
26
25
|
from fastapi.staticfiles import StaticFiles
|
@@ -31,13 +30,12 @@ from starlette.requests import ClientDisconnect
|
|
31
30
|
from starlette.responses import HTMLResponse
|
32
31
|
from starlette.routing import Mount
|
33
32
|
|
34
|
-
|
33
|
+
import nucliadb_admin_assets # type: ignore
|
35
34
|
from nucliadb.middleware import ProcessTimeHeaderMiddleware
|
36
|
-
from nucliadb.middleware.transaction import ReadOnlyTransactionMiddleware
|
37
35
|
from nucliadb.reader import API_PREFIX
|
38
36
|
from nucliadb.reader.api.v1.router import api as api_reader_v1
|
39
37
|
from nucliadb.search.api.v1.router import api as api_search_v1
|
40
|
-
from nucliadb.standalone.lifecycle import
|
38
|
+
from nucliadb.standalone.lifecycle import lifespan
|
41
39
|
from nucliadb.train.api.v1.router import api as api_train_v1
|
42
40
|
from nucliadb.writer.api.v1.router import api as api_writer_v1
|
43
41
|
from nucliadb_telemetry.fastapi import metrics_endpoint
|
@@ -45,9 +43,11 @@ from nucliadb_telemetry.fastapi.utils import (
|
|
45
43
|
client_disconnect_handler,
|
46
44
|
global_exception_handler,
|
47
45
|
)
|
46
|
+
from nucliadb_utils.audit.stream import AuditMiddleware
|
48
47
|
from nucliadb_utils.fastapi.openapi import extend_openapi
|
49
48
|
from nucliadb_utils.fastapi.versioning import VersionedFastAPI
|
50
49
|
from nucliadb_utils.settings import http_settings, running_settings
|
50
|
+
from nucliadb_utils.utilities import get_audit
|
51
51
|
|
52
52
|
from .api_router import standalone_api_router
|
53
53
|
from .auth import get_auth_backend
|
@@ -71,7 +71,7 @@ HOMEPAGE_HTML = """
|
|
71
71
|
<h2>Quick Links</h2>
|
72
72
|
<ul>
|
73
73
|
<li><a href="/admin">Admin UI</a></li>
|
74
|
-
<li><a href="https://docs.nuclia.dev/docs/
|
74
|
+
<li><a href="https://docs.nuclia.dev/docs/management/nucliadb/deploy/basics">NucliaDB Deployment Documentation</a></li>
|
75
75
|
<li><a href="https://docs.nuclia.dev/docs/api">API Reference</a></li>
|
76
76
|
<li><a href="/api/v1/docs">API Explorer</a></li>
|
77
77
|
<li><a href="/metrics">Metrics</a></li>
|
@@ -94,7 +94,7 @@ def application_factory(settings: Settings) -> FastAPI:
|
|
94
94
|
AuthenticationMiddleware,
|
95
95
|
backend=get_auth_backend(settings),
|
96
96
|
),
|
97
|
-
Middleware(
|
97
|
+
Middleware(AuditMiddleware, audit_utility_getter=get_audit),
|
98
98
|
]
|
99
99
|
if running_settings.debug:
|
100
100
|
middleware.append(Middleware(ProcessTimeHeaderMiddleware))
|
@@ -102,8 +102,7 @@ def application_factory(settings: Settings) -> FastAPI:
|
|
102
102
|
fastapi_settings = dict(
|
103
103
|
debug=running_settings.debug,
|
104
104
|
middleware=middleware,
|
105
|
-
|
106
|
-
on_shutdown=[finalize],
|
105
|
+
lifespan=lifespan,
|
107
106
|
exception_handlers={
|
108
107
|
Exception: global_exception_handler,
|
109
108
|
ClientDisconnect: client_disconnect_handler,
|
@@ -140,9 +139,7 @@ def application_factory(settings: Settings) -> FastAPI:
|
|
140
139
|
# mount admin app assets
|
141
140
|
application.mount(
|
142
141
|
"/admin",
|
143
|
-
StaticFiles(
|
144
|
-
directory=os.path.dirname(nucliadb_admin_assets.__file__), html=True
|
145
|
-
),
|
142
|
+
StaticFiles(directory=os.path.dirname(nucliadb_admin_assets.__file__), html=True),
|
146
143
|
name="static",
|
147
144
|
)
|
148
145
|
# redirect /contributor -> /admin
|
@@ -158,7 +155,4 @@ def application_factory(settings: Settings) -> FastAPI:
|
|
158
155
|
if isinstance(route, Mount):
|
159
156
|
route.app.settings = settings # type: ignore
|
160
157
|
|
161
|
-
# Inject application context into the fastapi app's state
|
162
|
-
set_app_context(application)
|
163
|
-
|
164
158
|
return application
|