nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -0,0 +1,184 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from time import time
|
21
|
+
from typing import Optional, Union
|
22
|
+
|
23
|
+
from fastapi import Request, Response
|
24
|
+
from fastapi_versioning import version
|
25
|
+
|
26
|
+
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
27
|
+
from nucliadb.common.maindb.pg import PGDriver
|
28
|
+
from nucliadb.common.maindb.utils import get_driver
|
29
|
+
from nucliadb.models.responses import HTTPClientError
|
30
|
+
from nucliadb.search import logger
|
31
|
+
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
32
|
+
from nucliadb.search.api.v1.utils import fastapi_query
|
33
|
+
from nucliadb.search.search import cache
|
34
|
+
from nucliadb.search.search.exceptions import InvalidQueryError
|
35
|
+
from nucliadb.search.search.merge import fetch_resources
|
36
|
+
from nucliadb.search.search.pgcatalog import pgcatalog_search
|
37
|
+
from nucliadb.search.search.query_parser.parser import parse_catalog
|
38
|
+
from nucliadb.search.search.utils import (
|
39
|
+
maybe_log_request_payload,
|
40
|
+
)
|
41
|
+
from nucliadb_models.common import FieldTypeName
|
42
|
+
from nucliadb_models.metadata import ResourceProcessingStatus
|
43
|
+
from nucliadb_models.resource import NucliaDBRoles
|
44
|
+
from nucliadb_models.search import (
|
45
|
+
CatalogRequest,
|
46
|
+
CatalogResponse,
|
47
|
+
KnowledgeboxSearchResults,
|
48
|
+
ResourceProperties,
|
49
|
+
SearchParamDefaults,
|
50
|
+
SortField,
|
51
|
+
SortOptions,
|
52
|
+
SortOrder,
|
53
|
+
)
|
54
|
+
from nucliadb_models.utils import DateTime
|
55
|
+
from nucliadb_utils.authentication import requires
|
56
|
+
from nucliadb_utils.exceptions import LimitsExceededError
|
57
|
+
|
58
|
+
|
59
|
+
@api.get(
|
60
|
+
f"/{KB_PREFIX}/{{kbid}}/catalog",
|
61
|
+
status_code=200,
|
62
|
+
summary="List resources of a Knowledge Box",
|
63
|
+
description="List resources of a Knowledge Box",
|
64
|
+
response_model=KnowledgeboxSearchResults,
|
65
|
+
response_model_exclude_unset=True,
|
66
|
+
tags=["Search"],
|
67
|
+
)
|
68
|
+
@requires(NucliaDBRoles.READER)
|
69
|
+
@version(1)
|
70
|
+
async def catalog_get(
|
71
|
+
request: Request,
|
72
|
+
response: Response,
|
73
|
+
kbid: str,
|
74
|
+
query: str = fastapi_query(SearchParamDefaults.query),
|
75
|
+
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
76
|
+
faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
|
77
|
+
sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
|
78
|
+
sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
|
79
|
+
sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
|
80
|
+
page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
|
81
|
+
page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
|
82
|
+
shards: list[str] = fastapi_query(SearchParamDefaults.shards, deprecated=True),
|
83
|
+
with_status: Optional[ResourceProcessingStatus] = fastapi_query(
|
84
|
+
SearchParamDefaults.with_status, deprecated="Use filters instead"
|
85
|
+
),
|
86
|
+
debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
|
87
|
+
range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
|
88
|
+
range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
|
89
|
+
range_modification_start: Optional[DateTime] = fastapi_query(
|
90
|
+
SearchParamDefaults.range_modification_start
|
91
|
+
),
|
92
|
+
range_modification_end: Optional[DateTime] = fastapi_query(
|
93
|
+
SearchParamDefaults.range_modification_end
|
94
|
+
),
|
95
|
+
hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
|
96
|
+
) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
|
97
|
+
item = CatalogRequest(
|
98
|
+
query=query,
|
99
|
+
filters=filters,
|
100
|
+
faceted=faceted,
|
101
|
+
page_number=page_number,
|
102
|
+
page_size=page_size,
|
103
|
+
shards=shards,
|
104
|
+
debug=debug,
|
105
|
+
with_status=with_status,
|
106
|
+
range_creation_start=range_creation_start,
|
107
|
+
range_creation_end=range_creation_end,
|
108
|
+
range_modification_start=range_modification_start,
|
109
|
+
range_modification_end=range_modification_end,
|
110
|
+
hidden=hidden,
|
111
|
+
)
|
112
|
+
if sort_field:
|
113
|
+
item.sort = SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
|
114
|
+
return await catalog(kbid, item)
|
115
|
+
|
116
|
+
|
117
|
+
@api.post(
|
118
|
+
f"/{KB_PREFIX}/{{kbid}}/catalog",
|
119
|
+
status_code=200,
|
120
|
+
summary="List resources of a Knowledge Box",
|
121
|
+
description="List resources of a Knowledge Box",
|
122
|
+
response_model=KnowledgeboxSearchResults,
|
123
|
+
response_model_exclude_unset=True,
|
124
|
+
tags=["Search"],
|
125
|
+
)
|
126
|
+
@requires(NucliaDBRoles.READER)
|
127
|
+
@version(1)
|
128
|
+
async def catalog_post(
|
129
|
+
request: Request,
|
130
|
+
kbid: str,
|
131
|
+
item: CatalogRequest,
|
132
|
+
) -> Union[CatalogResponse, HTTPClientError]:
|
133
|
+
return await catalog(kbid, item)
|
134
|
+
|
135
|
+
|
136
|
+
async def catalog(
|
137
|
+
kbid: str,
|
138
|
+
item: CatalogRequest,
|
139
|
+
):
|
140
|
+
"""
|
141
|
+
Catalog endpoint is a simplified version of the search endpoint, it only
|
142
|
+
returns bm25 results on titles and it does not support vector search.
|
143
|
+
It is useful for listing resources in a knowledge box.
|
144
|
+
"""
|
145
|
+
if not pgcatalog_enabled(): # pragma: no cover
|
146
|
+
return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
|
147
|
+
|
148
|
+
maybe_log_request_payload(kbid, "/catalog", item)
|
149
|
+
start_time = time()
|
150
|
+
try:
|
151
|
+
with cache.request_caches():
|
152
|
+
query_parser = parse_catalog(kbid, item)
|
153
|
+
|
154
|
+
catalog_results = CatalogResponse()
|
155
|
+
catalog_results.fulltext = await pgcatalog_search(query_parser)
|
156
|
+
catalog_results.resources = await fetch_resources(
|
157
|
+
resources=[r.rid for r in catalog_results.fulltext.results],
|
158
|
+
kbid=kbid,
|
159
|
+
show=[ResourceProperties.BASIC, ResourceProperties.ERRORS],
|
160
|
+
field_type_filter=list(FieldTypeName),
|
161
|
+
extracted=[],
|
162
|
+
)
|
163
|
+
return catalog_results
|
164
|
+
except InvalidQueryError as exc:
|
165
|
+
return HTTPClientError(status_code=412, detail=str(exc))
|
166
|
+
except KnowledgeBoxNotFound:
|
167
|
+
return HTTPClientError(status_code=404, detail="Knowledge Box not found")
|
168
|
+
except LimitsExceededError as exc:
|
169
|
+
return HTTPClientError(status_code=exc.status_code, detail=exc.detail)
|
170
|
+
finally:
|
171
|
+
duration = time() - start_time
|
172
|
+
if duration > 2: # pragma: no cover
|
173
|
+
logger.warning(
|
174
|
+
"Slow catalog request",
|
175
|
+
extra={
|
176
|
+
"kbid": kbid,
|
177
|
+
"duration": duration,
|
178
|
+
"query": item.model_dump_json(),
|
179
|
+
},
|
180
|
+
)
|
181
|
+
|
182
|
+
|
183
|
+
def pgcatalog_enabled():
|
184
|
+
return isinstance(get_driver(), PGDriver)
|
@@ -18,18 +18,18 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
-
|
22
21
|
from fastapi import Header, Request, Response
|
23
22
|
from fastapi_versioning import version
|
24
23
|
|
24
|
+
from nucliadb.common.models_utils import to_proto
|
25
25
|
from nucliadb.models.responses import HTTPClientError
|
26
|
-
from nucliadb.search import logger
|
26
|
+
from nucliadb.search import logger
|
27
27
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
28
|
-
from nucliadb.search.utilities import get_predict
|
29
28
|
from nucliadb_models.resource import NucliaDBRoles
|
30
29
|
from nucliadb_models.search import FeedbackRequest, NucliaDBClientType
|
31
30
|
from nucliadb_telemetry import errors
|
32
31
|
from nucliadb_utils.authentication import requires
|
32
|
+
from nucliadb_utils.utilities import get_audit
|
33
33
|
|
34
34
|
|
35
35
|
@api.post(
|
@@ -51,28 +51,20 @@ async def send_feedback_endpoint(
|
|
51
51
|
x_forwarded_for: str = Header(""),
|
52
52
|
):
|
53
53
|
try:
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
54
|
+
audit = get_audit()
|
55
|
+
if audit is not None:
|
56
|
+
audit.feedback(
|
57
|
+
kbid=kbid,
|
58
|
+
user=x_nucliadb_user,
|
59
|
+
client_type=to_proto.client_type(x_ndb_client),
|
60
|
+
origin=x_forwarded_for,
|
61
|
+
learning_id=item.ident,
|
62
|
+
good=item.good,
|
63
|
+
task=to_proto.feedback_task(item.task),
|
64
|
+
feedback=item.feedback,
|
65
|
+
text_block_id=item.text_block_id,
|
66
|
+
)
|
62
67
|
except Exception as ex:
|
63
68
|
errors.capture_exception(ex)
|
64
69
|
logger.exception("Unexpected error sending feedback", extra={"kbid": kbid})
|
65
70
|
return HTTPClientError(status_code=500, detail=f"Internal server error")
|
66
|
-
|
67
|
-
|
68
|
-
async def send_feedback(
|
69
|
-
kbid: str,
|
70
|
-
item: FeedbackRequest,
|
71
|
-
x_nucliadb_user: str,
|
72
|
-
x_ndb_client: NucliaDBClientType,
|
73
|
-
x_forwarded_for: str,
|
74
|
-
):
|
75
|
-
predict = get_predict()
|
76
|
-
await predict.send_feedback(
|
77
|
-
kbid, item, x_nucliadb_user, x_ndb_client, x_forwarded_for
|
78
|
-
)
|
nucliadb/search/api/v1/find.py
CHANGED
@@ -18,7 +18,6 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import json
|
21
|
-
from datetime import datetime
|
22
21
|
from typing import Optional, Union
|
23
22
|
|
24
23
|
from fastapi import Body, Header, Query, Request, Response
|
@@ -31,20 +30,25 @@ from nucliadb.models.responses import HTTPClientError
|
|
31
30
|
from nucliadb.search import predict
|
32
31
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
33
32
|
from nucliadb.search.api.v1.utils import fastapi_query
|
33
|
+
from nucliadb.search.search import cache
|
34
34
|
from nucliadb.search.search.exceptions import InvalidQueryError
|
35
35
|
from nucliadb.search.search.find import find
|
36
|
-
from nucliadb.search.search.utils import min_score_from_query_params
|
36
|
+
from nucliadb.search.search.utils import maybe_log_request_payload, min_score_from_query_params
|
37
37
|
from nucliadb_models.common import FieldTypeName
|
38
38
|
from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
|
39
39
|
from nucliadb_models.search import (
|
40
40
|
FindRequest,
|
41
41
|
KnowledgeboxFindResults,
|
42
42
|
NucliaDBClientType,
|
43
|
+
RankFusionName,
|
44
|
+
Reranker,
|
45
|
+
RerankerName,
|
43
46
|
ResourceProperties,
|
44
47
|
SearchOptions,
|
45
48
|
SearchParamDefaults,
|
46
49
|
)
|
47
50
|
from nucliadb_models.security import RequestSecurity
|
51
|
+
from nucliadb_models.utils import DateTime
|
48
52
|
from nucliadb_utils.authentication import requires
|
49
53
|
from nucliadb_utils.exceptions import LimitsExceededError
|
50
54
|
|
@@ -54,7 +58,7 @@ FIND_EXAMPLES = {
|
|
54
58
|
description="Perform a hybrid search that will return text and semantic results matching the query",
|
55
59
|
value={
|
56
60
|
"query": "How can I be an effective product manager?",
|
57
|
-
"features": [SearchOptions.
|
61
|
+
"features": [SearchOptions.KEYWORD, SearchOptions.SEMANTIC],
|
58
62
|
},
|
59
63
|
)
|
60
64
|
}
|
@@ -78,39 +82,35 @@ async def find_knowledgebox(
|
|
78
82
|
query: str = fastapi_query(SearchParamDefaults.query),
|
79
83
|
fields: list[str] = fastapi_query(SearchParamDefaults.fields),
|
80
84
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
81
|
-
|
82
|
-
page_size: int = fastapi_query(SearchParamDefaults.page_size),
|
85
|
+
top_k: Optional[int] = fastapi_query(SearchParamDefaults.top_k),
|
83
86
|
min_score: Optional[float] = Query(
|
84
87
|
default=None,
|
85
|
-
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/
|
88
|
+
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
|
86
89
|
deprecated=True,
|
87
90
|
),
|
88
91
|
min_score_semantic: Optional[float] = Query(
|
89
92
|
default=None,
|
90
|
-
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/
|
93
|
+
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
|
91
94
|
),
|
92
95
|
min_score_bm25: float = Query(
|
93
96
|
default=0,
|
94
97
|
description="Minimum bm25 score to filter paragraph and document index results",
|
95
98
|
ge=0,
|
96
99
|
),
|
97
|
-
|
98
|
-
|
99
|
-
),
|
100
|
-
|
101
|
-
SearchParamDefaults.range_creation_end
|
102
|
-
),
|
103
|
-
range_modification_start: Optional[datetime] = fastapi_query(
|
100
|
+
vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
|
101
|
+
range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
|
102
|
+
range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
|
103
|
+
range_modification_start: Optional[DateTime] = fastapi_query(
|
104
104
|
SearchParamDefaults.range_modification_start
|
105
105
|
),
|
106
|
-
range_modification_end: Optional[
|
106
|
+
range_modification_end: Optional[DateTime] = fastapi_query(
|
107
107
|
SearchParamDefaults.range_modification_end
|
108
108
|
),
|
109
109
|
features: list[SearchOptions] = fastapi_query(
|
110
110
|
SearchParamDefaults.search_features,
|
111
111
|
default=[
|
112
|
-
SearchOptions.
|
113
|
-
SearchOptions.
|
112
|
+
SearchOptions.KEYWORD,
|
113
|
+
SearchOptions.SEMANTIC,
|
114
114
|
],
|
115
115
|
),
|
116
116
|
debug: bool = fastapi_query(SearchParamDefaults.debug),
|
@@ -119,13 +119,14 @@ async def find_knowledgebox(
|
|
119
119
|
field_type_filter: list[FieldTypeName] = fastapi_query(
|
120
120
|
SearchParamDefaults.field_type_filter, alias="field_type"
|
121
121
|
),
|
122
|
-
extracted: list[ExtractedDataTypeName] = fastapi_query(
|
123
|
-
SearchParamDefaults.extracted
|
124
|
-
),
|
122
|
+
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
125
123
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
126
124
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
127
125
|
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
128
126
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
127
|
+
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
128
|
+
rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
|
129
|
+
reranker: Union[RerankerName, Reranker] = fastapi_query(SearchParamDefaults.reranker),
|
129
130
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
130
131
|
x_nucliadb_user: str = Header(""),
|
131
132
|
x_forwarded_for: str = Header(""),
|
@@ -138,11 +139,9 @@ async def find_knowledgebox(
|
|
138
139
|
query=query,
|
139
140
|
fields=fields,
|
140
141
|
filters=filters,
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
min_score_bm25, min_score_semantic, min_score
|
145
|
-
),
|
142
|
+
top_k=top_k, # type: ignore
|
143
|
+
min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
|
144
|
+
vectorset=vectorset,
|
146
145
|
range_creation_end=range_creation_end,
|
147
146
|
range_creation_start=range_creation_start,
|
148
147
|
range_modification_end=range_modification_end,
|
@@ -157,14 +156,15 @@ async def find_knowledgebox(
|
|
157
156
|
with_synonyms=with_synonyms,
|
158
157
|
autofilter=autofilter,
|
159
158
|
security=security,
|
159
|
+
show_hidden=show_hidden,
|
160
|
+
rank_fusion=rank_fusion,
|
161
|
+
reranker=reranker,
|
160
162
|
)
|
161
163
|
except ValidationError as exc:
|
162
164
|
detail = json.loads(exc.json())
|
163
165
|
return HTTPClientError(status_code=422, detail=detail)
|
164
166
|
|
165
|
-
return await _find_endpoint(
|
166
|
-
response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
|
167
|
-
)
|
167
|
+
return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
|
168
168
|
|
169
169
|
|
170
170
|
@api.post(
|
@@ -187,9 +187,7 @@ async def find_post_knowledgebox(
|
|
187
187
|
x_nucliadb_user: str = Header(""),
|
188
188
|
x_forwarded_for: str = Header(""),
|
189
189
|
) -> Union[KnowledgeboxFindResults, HTTPClientError]:
|
190
|
-
return await _find_endpoint(
|
191
|
-
response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
|
192
|
-
)
|
190
|
+
return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
|
193
191
|
|
194
192
|
|
195
193
|
async def _find_endpoint(
|
@@ -201,11 +199,13 @@ async def _find_endpoint(
|
|
201
199
|
x_forwarded_for: str,
|
202
200
|
) -> Union[KnowledgeboxFindResults, HTTPClientError]:
|
203
201
|
try:
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
202
|
+
maybe_log_request_payload(kbid, "/find", item)
|
203
|
+
with cache.request_caches():
|
204
|
+
results, incomplete, _ = await find(
|
205
|
+
kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
|
206
|
+
)
|
207
|
+
response.status_code = 206 if incomplete else 200
|
208
|
+
return results
|
209
209
|
except KnowledgeBoxNotFound:
|
210
210
|
return HTTPClientError(status_code=404, detail="Knowledge Box not found")
|
211
211
|
except LimitsExceededError as exc:
|
@@ -24,29 +24,35 @@ from fastapi import HTTPException, Request
|
|
24
24
|
from fastapi_versioning import version
|
25
25
|
from grpc import StatusCode as GrpcStatusCode
|
26
26
|
from grpc.aio import AioRpcError
|
27
|
-
from nucliadb_protos.noderesources_pb2 import Shard
|
28
|
-
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
29
|
-
from nucliadb_protos.writer_pb2 import Shards
|
30
27
|
|
31
28
|
from nucliadb.common import datamanagers
|
32
29
|
from nucliadb.common.cluster.exceptions import ShardsNotFound
|
33
30
|
from nucliadb.common.cluster.manager import choose_node
|
34
31
|
from nucliadb.common.cluster.utils import get_shard_manager
|
32
|
+
from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
|
33
|
+
from nucliadb.common.counters import IndexCounts
|
34
|
+
from nucliadb.common.external_index_providers.manager import get_external_index_manager
|
35
|
+
from nucliadb.common.models_utils import from_proto
|
35
36
|
from nucliadb.search import logger
|
36
37
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
37
38
|
from nucliadb.search.api.v1.utils import fastapi_query
|
38
39
|
from nucliadb.search.search.shards import get_shard
|
39
40
|
from nucliadb.search.settings import settings
|
41
|
+
from nucliadb_models.internal.shards import KnowledgeboxShards
|
40
42
|
from nucliadb_models.resource import NucliaDBRoles
|
41
43
|
from nucliadb_models.search import (
|
42
44
|
KnowledgeboxCounters,
|
43
|
-
KnowledgeboxShards,
|
44
45
|
SearchParamDefaults,
|
45
46
|
)
|
47
|
+
from nucliadb_protos.noderesources_pb2 import Shard
|
48
|
+
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
49
|
+
from nucliadb_protos.writer_pb2 import Shards
|
46
50
|
from nucliadb_telemetry import errors
|
51
|
+
from nucliadb_utils import const
|
47
52
|
from nucliadb_utils.authentication import requires, requires_one
|
53
|
+
from nucliadb_utils.utilities import has_feature
|
48
54
|
|
49
|
-
|
55
|
+
# Paragraph-count threshold under which a knowledgebox is treated as "small".
# _kb_counters compares the indexed paragraph count against it and, for small
# KBs, calls get_resources_count with force_calculate=True so the resource
# count is recalculated instead of read from the stored value.
MAX_PARAGRAPHS_FOR_SMALL_KB = 250_000
|
50
56
|
|
51
57
|
|
52
58
|
@api.get(
|
@@ -68,7 +74,7 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
|
|
68
74
|
status_code=404,
|
69
75
|
detail="The knowledgebox or its shards configuration is missing",
|
70
76
|
)
|
71
|
-
return
|
77
|
+
return from_proto.kb_shards(shards)
|
72
78
|
|
73
79
|
|
74
80
|
@api.get(
|
@@ -86,21 +92,83 @@ async def knowledgebox_counters(
|
|
86
92
|
kbid: str,
|
87
93
|
debug: bool = fastapi_query(SearchParamDefaults.debug),
|
88
94
|
) -> KnowledgeboxCounters:
|
89
|
-
shard_manager = get_shard_manager()
|
90
|
-
|
91
95
|
try:
|
92
|
-
|
96
|
+
return await _kb_counters(kbid, debug=debug)
|
93
97
|
except ShardsNotFound:
|
94
98
|
raise HTTPException(
|
95
99
|
status_code=404,
|
96
100
|
detail="The knowledgebox or its shards configuration is missing",
|
97
101
|
)
|
98
102
|
|
103
|
+
|
104
|
+
async def _kb_counters(
    kbid: str,
    debug: bool = False,
) -> KnowledgeboxCounters:
    """
    Build the counters of a knowledgebox.

    Resources count is calculated from maindb and cached (see get_resources_count).
    Paragraphs and Sentences count is calculated from the index node cluster or
    the external index provider.
    Field count is calculated from the index node cluster; when the knowledgebox
    uses an external index provider there is no fields count available, so the
    resources count is reported as the fields count too.
    Index size is estimated from the paragraphs count.

    Args:
        kbid: Identifier of the knowledgebox.
        debug: When true and the counts come from the index node cluster, the
            list of queried shards is attached to the returned counters.

    Returns:
        The populated KnowledgeboxCounters.
    """
    counters = KnowledgeboxCounters(
        resources=0,
        paragraphs=0,
        fields=0,
        sentences=0,
        index_size=0,
    )
    # Only the index node branch knows which shards were queried. Bind the name
    # up front so the debug check below never reads an unbound local when the
    # knowledgebox is backed by an external index provider.
    queried_shards: Optional[list[str]] = None
    external_index_manager = await get_external_index_manager(kbid)
    if external_index_manager is not None:
        index_counts = await external_index_manager.get_index_counts()
        counters.paragraphs = index_counts.paragraphs
        counters.sentences = index_counts.sentences
        is_small_kb = index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
        resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
        # TODO: Find a way to query the fields count from the external index provider or use the catalog
        counters.resources = counters.fields = resource_count
    else:
        node_index_counts, queried_shards = await get_node_index_counts(kbid)
        counters.fields = node_index_counts.fields
        counters.paragraphs = node_index_counts.paragraphs
        counters.sentences = node_index_counts.sentences
        is_small_kb = node_index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
        resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
        counters.resources = resource_count
    # The index size is an estimate derived from the paragraph count.
    counters.index_size = counters.paragraphs * AVG_PARAGRAPH_SIZE_BYTES
    if debug and queried_shards is not None:
        counters.shards = queried_shards
    return counters
|
142
|
+
|
143
|
+
|
144
|
+
async def get_resources_count(kbid: str, force_calculate: bool = False) -> int:
    """
    Return the number of resources of a knowledgebox.

    The lookup runs inside a `datamanagers.with_ro_transaction()` transaction.

    Args:
        kbid: Identifier of the knowledgebox.
        force_calculate: When true, skip the stored value and always
            recalculate the count (used by callers for small knowledgeboxes).

    Returns:
        The resource count: the stored one when available, otherwise a freshly
        calculated one.
    """
    async with datamanagers.with_ro_transaction() as txn:
        if not force_calculate:
            stored_count = await datamanagers.resources.get_number_of_resources(txn, kbid=kbid)
            if stored_count != -1:
                return stored_count
            # WARNING: standalone, this value will never be cached
            # (-1 is handled as "no stored value": fall through and recalculate).
        # For small kbs, this is faster and more up to date
        return await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
|
157
|
+
|
158
|
+
|
159
|
+
async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
|
160
|
+
"""
|
161
|
+
Get the index counts for a knowledgebox that has an index in the index node cluster.
|
162
|
+
"""
|
163
|
+
shard_manager = get_shard_manager()
|
164
|
+
shard_groups: list[PBShardObject] = await shard_manager.get_shards_by_kbid(kbid)
|
99
165
|
ops = []
|
100
166
|
queried_shards = []
|
101
167
|
for shard_object in shard_groups:
|
102
168
|
try:
|
103
|
-
node, shard_id = choose_node(
|
169
|
+
node, shard_id = choose_node(
|
170
|
+
shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
|
171
|
+
)
|
104
172
|
except KeyError:
|
105
173
|
raise HTTPException(
|
106
174
|
status_code=500,
|
@@ -121,7 +189,7 @@ async def knowledgebox_counters(
|
|
121
189
|
)
|
122
190
|
|
123
191
|
try:
|
124
|
-
results: Optional[list[Shard]] = await asyncio.wait_for(
|
192
|
+
results: Optional[list[Shard]] = await asyncio.wait_for(
|
125
193
|
asyncio.gather(*ops, return_exceptions=True), # type: ignore
|
126
194
|
timeout=settings.search_timeout,
|
127
195
|
)
|
@@ -138,56 +206,17 @@ async def knowledgebox_counters(
|
|
138
206
|
if results is None:
|
139
207
|
raise HTTPException(status_code=503, detail=f"No shards found")
|
140
208
|
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
209
|
+
counts = IndexCounts(
|
210
|
+
fields=0,
|
211
|
+
paragraphs=0,
|
212
|
+
sentences=0,
|
213
|
+
)
|
145
214
|
for shard in results:
|
146
215
|
if isinstance(shard, Exception):
|
147
216
|
logger.error("Error getting shard info", exc_info=shard)
|
148
217
|
errors.capture_exception(shard)
|
149
|
-
raise HTTPException(
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
paragraph_count += shard.paragraphs
|
155
|
-
sentence_count += shard.sentences
|
156
|
-
|
157
|
-
async with datamanagers.with_transaction() as txn:
|
158
|
-
try:
|
159
|
-
if len(shard_groups) <= 1:
|
160
|
-
# for smaller kbs, this is faster and more up to date
|
161
|
-
resource_count = (
|
162
|
-
await datamanagers.resources.calculate_number_of_resources(
|
163
|
-
txn, kbid=kbid
|
164
|
-
)
|
165
|
-
)
|
166
|
-
else:
|
167
|
-
resource_count = await datamanagers.resources.get_number_of_resources(
|
168
|
-
txn, kbid=kbid
|
169
|
-
)
|
170
|
-
if resource_count == -1:
|
171
|
-
# WARNING: standalone, this value will never be cached
|
172
|
-
resource_count = (
|
173
|
-
await datamanagers.resources.calculate_number_of_resources(
|
174
|
-
txn, kbid=kbid
|
175
|
-
)
|
176
|
-
)
|
177
|
-
except Exception as exc:
|
178
|
-
errors.capture_exception(exc)
|
179
|
-
raise HTTPException(
|
180
|
-
status_code=500, detail="Couldn't retrieve counters right now"
|
181
|
-
)
|
182
|
-
|
183
|
-
counters = KnowledgeboxCounters(
|
184
|
-
resources=resource_count,
|
185
|
-
paragraphs=paragraph_count,
|
186
|
-
fields=field_count,
|
187
|
-
sentences=sentence_count,
|
188
|
-
index_size=paragraph_count * AVG_PARAGRAPH_SIZE_BYTES,
|
189
|
-
)
|
190
|
-
|
191
|
-
if debug:
|
192
|
-
counters.shards = queried_shards
|
193
|
-
return counters
|
218
|
+
raise HTTPException(status_code=500, detail=f"Error while geting shard data")
|
219
|
+
counts.fields += shard.fields
|
220
|
+
counts.paragraphs += shard.paragraphs
|
221
|
+
counts.sentences += shard.sentences
|
222
|
+
return counts, queried_shards
|
@@ -40,8 +40,6 @@ from ..ask import create_ask_response
|
|
40
40
|
description="Ask questions to a resource",
|
41
41
|
tags=["Search"],
|
42
42
|
response_model=SyncAskResponse,
|
43
|
-
# Add this to OpenAPI schema when endpoint is not in beta anymore
|
44
|
-
include_in_schema=False,
|
45
43
|
)
|
46
44
|
@requires(NucliaDBRoles.READER)
|
47
45
|
@version(1)
|
@@ -77,8 +75,6 @@ async def resource_ask_endpoint_by_uuid(
|
|
77
75
|
description="Ask questions to a resource",
|
78
76
|
tags=["Search"],
|
79
77
|
response_model=SyncAskResponse,
|
80
|
-
# Add this to OpenAPI schema when endpoint is not in beta anymore
|
81
|
-
include_in_schema=False,
|
82
78
|
)
|
83
79
|
@requires(NucliaDBRoles.READER)
|
84
80
|
@version(1)
|
@@ -111,7 +107,5 @@ async def resource_ask_endpoint_by_slug(
|
|
111
107
|
|
112
108
|
|
113
109
|
async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
|
114
|
-
async with datamanagers.
|
115
|
-
return await datamanagers.resources.get_resource_uuid_from_slug(
|
116
|
-
txn, kbid=kbid, slug=slug
|
117
|
-
)
|
110
|
+
async with datamanagers.with_ro_transaction() as txn:
|
111
|
+
return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)
|