nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/search/api/v1/find.py
CHANGED
@@ -18,33 +18,37 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import json
|
21
|
-
from datetime import datetime
|
22
21
|
from typing import Optional, Union
|
23
22
|
|
24
23
|
from fastapi import Body, Header, Query, Request, Response
|
25
24
|
from fastapi.openapi.models import Example
|
26
25
|
from fastapi_versioning import version
|
27
|
-
from pydantic
|
26
|
+
from pydantic import ValidationError
|
28
27
|
|
29
28
|
from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
|
30
29
|
from nucliadb.models.responses import HTTPClientError
|
31
30
|
from nucliadb.search import predict
|
32
31
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
33
32
|
from nucliadb.search.api.v1.utils import fastapi_query
|
33
|
+
from nucliadb.search.search import cache
|
34
34
|
from nucliadb.search.search.exceptions import InvalidQueryError
|
35
35
|
from nucliadb.search.search.find import find
|
36
|
-
from nucliadb.search.search.utils import min_score_from_query_params
|
36
|
+
from nucliadb.search.search.utils import maybe_log_request_payload, min_score_from_query_params
|
37
37
|
from nucliadb_models.common import FieldTypeName
|
38
38
|
from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
|
39
39
|
from nucliadb_models.search import (
|
40
40
|
FindRequest,
|
41
41
|
KnowledgeboxFindResults,
|
42
42
|
NucliaDBClientType,
|
43
|
+
RankFusionName,
|
44
|
+
Reranker,
|
45
|
+
RerankerName,
|
43
46
|
ResourceProperties,
|
44
47
|
SearchOptions,
|
45
48
|
SearchParamDefaults,
|
46
49
|
)
|
47
50
|
from nucliadb_models.security import RequestSecurity
|
51
|
+
from nucliadb_models.utils import DateTime
|
48
52
|
from nucliadb_utils.authentication import requires
|
49
53
|
from nucliadb_utils.exceptions import LimitsExceededError
|
50
54
|
|
@@ -54,7 +58,7 @@ FIND_EXAMPLES = {
|
|
54
58
|
description="Perform a hybrid search that will return text and semantic results matching the query",
|
55
59
|
value={
|
56
60
|
"query": "How can I be an effective product manager?",
|
57
|
-
"features": [SearchOptions.
|
61
|
+
"features": [SearchOptions.KEYWORD, SearchOptions.SEMANTIC],
|
58
62
|
},
|
59
63
|
)
|
60
64
|
}
|
@@ -63,7 +67,7 @@ FIND_EXAMPLES = {
|
|
63
67
|
@api.get(
|
64
68
|
f"/{KB_PREFIX}/{{kbid}}/find",
|
65
69
|
status_code=200,
|
66
|
-
|
70
|
+
summary="Find Knowledge Box",
|
67
71
|
description="Find on a Knowledge Box",
|
68
72
|
response_model=KnowledgeboxFindResults,
|
69
73
|
response_model_exclude_unset=True,
|
@@ -78,39 +82,35 @@ async def find_knowledgebox(
|
|
78
82
|
query: str = fastapi_query(SearchParamDefaults.query),
|
79
83
|
fields: list[str] = fastapi_query(SearchParamDefaults.fields),
|
80
84
|
filters: list[str] = fastapi_query(SearchParamDefaults.filters),
|
81
|
-
|
82
|
-
page_size: int = fastapi_query(SearchParamDefaults.page_size),
|
85
|
+
top_k: Optional[int] = fastapi_query(SearchParamDefaults.top_k),
|
83
86
|
min_score: Optional[float] = Query(
|
84
87
|
default=None,
|
85
|
-
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/
|
88
|
+
description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
|
86
89
|
deprecated=True,
|
87
90
|
),
|
88
91
|
min_score_semantic: Optional[float] = Query(
|
89
92
|
default=None,
|
90
|
-
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/
|
93
|
+
description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
|
91
94
|
),
|
92
95
|
min_score_bm25: float = Query(
|
93
96
|
default=0,
|
94
97
|
description="Minimum bm25 score to filter paragraph and document index results",
|
95
98
|
ge=0,
|
96
99
|
),
|
97
|
-
|
98
|
-
|
99
|
-
),
|
100
|
-
|
101
|
-
SearchParamDefaults.range_creation_end
|
102
|
-
),
|
103
|
-
range_modification_start: Optional[datetime] = fastapi_query(
|
100
|
+
vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
|
101
|
+
range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
|
102
|
+
range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
|
103
|
+
range_modification_start: Optional[DateTime] = fastapi_query(
|
104
104
|
SearchParamDefaults.range_modification_start
|
105
105
|
),
|
106
|
-
range_modification_end: Optional[
|
106
|
+
range_modification_end: Optional[DateTime] = fastapi_query(
|
107
107
|
SearchParamDefaults.range_modification_end
|
108
108
|
),
|
109
109
|
features: list[SearchOptions] = fastapi_query(
|
110
110
|
SearchParamDefaults.search_features,
|
111
111
|
default=[
|
112
|
-
SearchOptions.
|
113
|
-
SearchOptions.
|
112
|
+
SearchOptions.KEYWORD,
|
113
|
+
SearchOptions.SEMANTIC,
|
114
114
|
],
|
115
115
|
),
|
116
116
|
debug: bool = fastapi_query(SearchParamDefaults.debug),
|
@@ -119,13 +119,14 @@ async def find_knowledgebox(
|
|
119
119
|
field_type_filter: list[FieldTypeName] = fastapi_query(
|
120
120
|
SearchParamDefaults.field_type_filter, alias="field_type"
|
121
121
|
),
|
122
|
-
extracted: list[ExtractedDataTypeName] = fastapi_query(
|
123
|
-
SearchParamDefaults.extracted
|
124
|
-
),
|
122
|
+
extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
|
125
123
|
with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
|
126
124
|
with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
|
127
125
|
autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
|
128
126
|
security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
|
127
|
+
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
|
128
|
+
rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
|
129
|
+
reranker: Union[RerankerName, Reranker] = fastapi_query(SearchParamDefaults.reranker),
|
129
130
|
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
130
131
|
x_nucliadb_user: str = Header(""),
|
131
132
|
x_forwarded_for: str = Header(""),
|
@@ -138,11 +139,9 @@ async def find_knowledgebox(
|
|
138
139
|
query=query,
|
139
140
|
fields=fields,
|
140
141
|
filters=filters,
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
min_score_bm25, min_score_semantic, min_score
|
145
|
-
),
|
142
|
+
top_k=top_k, # type: ignore
|
143
|
+
min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
|
144
|
+
vectorset=vectorset,
|
146
145
|
range_creation_end=range_creation_end,
|
147
146
|
range_creation_start=range_creation_start,
|
148
147
|
range_modification_end=range_modification_end,
|
@@ -157,20 +156,21 @@ async def find_knowledgebox(
|
|
157
156
|
with_synonyms=with_synonyms,
|
158
157
|
autofilter=autofilter,
|
159
158
|
security=security,
|
159
|
+
show_hidden=show_hidden,
|
160
|
+
rank_fusion=rank_fusion,
|
161
|
+
reranker=reranker,
|
160
162
|
)
|
161
163
|
except ValidationError as exc:
|
162
164
|
detail = json.loads(exc.json())
|
163
165
|
return HTTPClientError(status_code=422, detail=detail)
|
164
166
|
|
165
|
-
return await _find_endpoint(
|
166
|
-
response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
|
167
|
-
)
|
167
|
+
return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
|
168
168
|
|
169
169
|
|
170
170
|
@api.post(
|
171
171
|
f"/{KB_PREFIX}/{{kbid}}/find",
|
172
172
|
status_code=200,
|
173
|
-
|
173
|
+
summary="Find Knowledge Box",
|
174
174
|
description="Find on a Knowledge Box",
|
175
175
|
response_model=KnowledgeboxFindResults,
|
176
176
|
response_model_exclude_unset=True,
|
@@ -187,9 +187,7 @@ async def find_post_knowledgebox(
|
|
187
187
|
x_nucliadb_user: str = Header(""),
|
188
188
|
x_forwarded_for: str = Header(""),
|
189
189
|
) -> Union[KnowledgeboxFindResults, HTTPClientError]:
|
190
|
-
return await _find_endpoint(
|
191
|
-
response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
|
192
|
-
)
|
190
|
+
return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
|
193
191
|
|
194
192
|
|
195
193
|
async def _find_endpoint(
|
@@ -201,11 +199,13 @@ async def _find_endpoint(
|
|
201
199
|
x_forwarded_for: str,
|
202
200
|
) -> Union[KnowledgeboxFindResults, HTTPClientError]:
|
203
201
|
try:
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
202
|
+
maybe_log_request_payload(kbid, "/find", item)
|
203
|
+
with cache.request_caches():
|
204
|
+
results, incomplete, _ = await find(
|
205
|
+
kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
|
206
|
+
)
|
207
|
+
response.status_code = 206 if incomplete else 200
|
208
|
+
return results
|
209
209
|
except KnowledgeBoxNotFound:
|
210
210
|
return HTTPClientError(status_code=404, detail="Knowledge Box not found")
|
211
211
|
except LimitsExceededError as exc:
|
@@ -214,6 +214,6 @@ async def _find_endpoint(
|
|
214
214
|
return HTTPClientError(status_code=412, detail=str(exc))
|
215
215
|
except predict.ProxiedPredictAPIError as err:
|
216
216
|
return HTTPClientError(
|
217
|
-
status_code=
|
218
|
-
detail=
|
217
|
+
status_code=err.status,
|
218
|
+
detail=err.detail,
|
219
219
|
)
|
@@ -24,29 +24,35 @@ from fastapi import HTTPException, Request
|
|
24
24
|
from fastapi_versioning import version
|
25
25
|
from grpc import StatusCode as GrpcStatusCode
|
26
26
|
from grpc.aio import AioRpcError
|
27
|
-
from nucliadb_protos.noderesources_pb2 import Shard
|
28
|
-
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
29
|
-
from nucliadb_protos.writer_pb2 import Shards
|
30
27
|
|
31
28
|
from nucliadb.common import datamanagers
|
32
29
|
from nucliadb.common.cluster.exceptions import ShardsNotFound
|
33
30
|
from nucliadb.common.cluster.manager import choose_node
|
34
31
|
from nucliadb.common.cluster.utils import get_shard_manager
|
32
|
+
from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
|
33
|
+
from nucliadb.common.counters import IndexCounts
|
34
|
+
from nucliadb.common.external_index_providers.manager import get_external_index_manager
|
35
|
+
from nucliadb.common.models_utils import from_proto
|
35
36
|
from nucliadb.search import logger
|
36
37
|
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
37
38
|
from nucliadb.search.api.v1.utils import fastapi_query
|
38
39
|
from nucliadb.search.search.shards import get_shard
|
39
40
|
from nucliadb.search.settings import settings
|
41
|
+
from nucliadb_models.internal.shards import KnowledgeboxShards
|
40
42
|
from nucliadb_models.resource import NucliaDBRoles
|
41
43
|
from nucliadb_models.search import (
|
42
44
|
KnowledgeboxCounters,
|
43
|
-
KnowledgeboxShards,
|
44
45
|
SearchParamDefaults,
|
45
46
|
)
|
47
|
+
from nucliadb_protos.noderesources_pb2 import Shard
|
48
|
+
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
49
|
+
from nucliadb_protos.writer_pb2 import Shards
|
46
50
|
from nucliadb_telemetry import errors
|
51
|
+
from nucliadb_utils import const
|
47
52
|
from nucliadb_utils.authentication import requires, requires_one
|
53
|
+
from nucliadb_utils.utilities import has_feature
|
48
54
|
|
49
|
-
|
55
|
+
MAX_PARAGRAPHS_FOR_SMALL_KB = 250_000
|
50
56
|
|
51
57
|
|
52
58
|
@api.get(
|
@@ -68,7 +74,7 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
|
|
68
74
|
status_code=404,
|
69
75
|
detail="The knowledgebox or its shards configuration is missing",
|
70
76
|
)
|
71
|
-
return
|
77
|
+
return from_proto.kb_shards(shards)
|
72
78
|
|
73
79
|
|
74
80
|
@api.get(
|
@@ -84,24 +90,85 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
|
|
84
90
|
async def knowledgebox_counters(
|
85
91
|
request: Request,
|
86
92
|
kbid: str,
|
87
|
-
vectorset: str = fastapi_query(SearchParamDefaults.vectorset),
|
88
93
|
debug: bool = fastapi_query(SearchParamDefaults.debug),
|
89
94
|
) -> KnowledgeboxCounters:
|
90
|
-
shard_manager = get_shard_manager()
|
91
|
-
|
92
95
|
try:
|
93
|
-
|
96
|
+
return await _kb_counters(kbid, debug=debug)
|
94
97
|
except ShardsNotFound:
|
95
98
|
raise HTTPException(
|
96
99
|
status_code=404,
|
97
100
|
detail="The knowledgebox or its shards configuration is missing",
|
98
101
|
)
|
99
102
|
|
103
|
+
|
104
|
+
async def _kb_counters(
|
105
|
+
kbid: str,
|
106
|
+
debug: bool = False,
|
107
|
+
) -> KnowledgeboxCounters:
|
108
|
+
"""
|
109
|
+
Resources count is calculated from maindb and cached
|
110
|
+
Field count is calculated from the index node cluster
|
111
|
+
Paragraphs and Sentences count is calculated from the index node cluster or the external index provider.
|
112
|
+
Index size is estimated from the paragraphs count.
|
113
|
+
"""
|
114
|
+
counters = KnowledgeboxCounters(
|
115
|
+
resources=0,
|
116
|
+
paragraphs=0,
|
117
|
+
fields=0,
|
118
|
+
sentences=0,
|
119
|
+
index_size=0,
|
120
|
+
)
|
121
|
+
external_index_manager = await get_external_index_manager(kbid)
|
122
|
+
if external_index_manager is not None:
|
123
|
+
index_counts = await external_index_manager.get_index_counts()
|
124
|
+
counters.paragraphs = index_counts.paragraphs
|
125
|
+
counters.sentences = index_counts.sentences
|
126
|
+
is_small_kb = index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
|
127
|
+
resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
|
128
|
+
# TODO: Find a way to query the fields count from the external index provider or use the catalog
|
129
|
+
counters.resources = counters.fields = resource_count
|
130
|
+
else:
|
131
|
+
node_index_counts, queried_shards = await get_node_index_counts(kbid)
|
132
|
+
counters.fields = node_index_counts.fields
|
133
|
+
counters.paragraphs = node_index_counts.paragraphs
|
134
|
+
counters.sentences = node_index_counts.sentences
|
135
|
+
is_small_kb = node_index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
|
136
|
+
resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
|
137
|
+
counters.resources = resource_count
|
138
|
+
counters.index_size = counters.paragraphs * AVG_PARAGRAPH_SIZE_BYTES
|
139
|
+
if debug and queried_shards is not None:
|
140
|
+
counters.shards = queried_shards
|
141
|
+
return counters
|
142
|
+
|
143
|
+
|
144
|
+
async def get_resources_count(kbid: str, force_calculate: bool = False) -> int:
|
145
|
+
async with datamanagers.with_ro_transaction() as txn:
|
146
|
+
if force_calculate:
|
147
|
+
# For small kbs, this is faster and more up to date
|
148
|
+
resource_count = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
|
149
|
+
else:
|
150
|
+
resource_count = await datamanagers.resources.get_number_of_resources(txn, kbid=kbid)
|
151
|
+
if resource_count == -1:
|
152
|
+
# WARNING: standalone, this value will never be cached
|
153
|
+
resource_count = await datamanagers.resources.calculate_number_of_resources(
|
154
|
+
txn, kbid=kbid
|
155
|
+
)
|
156
|
+
return resource_count
|
157
|
+
|
158
|
+
|
159
|
+
async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
|
160
|
+
"""
|
161
|
+
Get the index counts for a knowledgebox that has an index in the index node cluster.
|
162
|
+
"""
|
163
|
+
shard_manager = get_shard_manager()
|
164
|
+
shard_groups: list[PBShardObject] = await shard_manager.get_shards_by_kbid(kbid)
|
100
165
|
ops = []
|
101
166
|
queried_shards = []
|
102
167
|
for shard_object in shard_groups:
|
103
168
|
try:
|
104
|
-
node, shard_id = choose_node(
|
169
|
+
node, shard_id = choose_node(
|
170
|
+
shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
|
171
|
+
)
|
105
172
|
except KeyError:
|
106
173
|
raise HTTPException(
|
107
174
|
status_code=500,
|
@@ -111,7 +178,7 @@ async def knowledgebox_counters(
|
|
111
178
|
if shard_id is not None:
|
112
179
|
# At least one node is alive for this shard group
|
113
180
|
# let's add it ot the query list if has a valid value
|
114
|
-
ops.append(get_shard(node, shard_id
|
181
|
+
ops.append(get_shard(node, shard_id))
|
115
182
|
queried_shards.append(shard_id)
|
116
183
|
|
117
184
|
if not ops:
|
@@ -122,7 +189,7 @@ async def knowledgebox_counters(
|
|
122
189
|
)
|
123
190
|
|
124
191
|
try:
|
125
|
-
results: Optional[list[Shard]] = await asyncio.wait_for(
|
192
|
+
results: Optional[list[Shard]] = await asyncio.wait_for(
|
126
193
|
asyncio.gather(*ops, return_exceptions=True), # type: ignore
|
127
194
|
timeout=settings.search_timeout,
|
128
195
|
)
|
@@ -139,56 +206,17 @@ async def knowledgebox_counters(
|
|
139
206
|
if results is None:
|
140
207
|
raise HTTPException(status_code=503, detail=f"No shards found")
|
141
208
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
209
|
+
counts = IndexCounts(
|
210
|
+
fields=0,
|
211
|
+
paragraphs=0,
|
212
|
+
sentences=0,
|
213
|
+
)
|
146
214
|
for shard in results:
|
147
215
|
if isinstance(shard, Exception):
|
148
216
|
logger.error("Error getting shard info", exc_info=shard)
|
149
217
|
errors.capture_exception(shard)
|
150
|
-
raise HTTPException(
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
paragraph_count += shard.paragraphs
|
156
|
-
sentence_count += shard.sentences
|
157
|
-
|
158
|
-
async with datamanagers.with_transaction() as txn:
|
159
|
-
try:
|
160
|
-
if len(shard_groups) <= 1:
|
161
|
-
# for smaller kbs, this is faster and more up to date
|
162
|
-
resource_count = (
|
163
|
-
await datamanagers.resources.calculate_number_of_resources(
|
164
|
-
txn, kbid=kbid
|
165
|
-
)
|
166
|
-
)
|
167
|
-
else:
|
168
|
-
resource_count = await datamanagers.resources.get_number_of_resources(
|
169
|
-
txn, kbid=kbid
|
170
|
-
)
|
171
|
-
if resource_count == -1:
|
172
|
-
# WARNING: standalone, this value will never be cached
|
173
|
-
resource_count = (
|
174
|
-
await datamanagers.resources.calculate_number_of_resources(
|
175
|
-
txn, kbid=kbid
|
176
|
-
)
|
177
|
-
)
|
178
|
-
except Exception as exc:
|
179
|
-
errors.capture_exception(exc)
|
180
|
-
raise HTTPException(
|
181
|
-
status_code=500, detail="Couldn't retrieve counters right now"
|
182
|
-
)
|
183
|
-
|
184
|
-
counters = KnowledgeboxCounters(
|
185
|
-
resources=resource_count,
|
186
|
-
paragraphs=paragraph_count,
|
187
|
-
fields=field_count,
|
188
|
-
sentences=sentence_count,
|
189
|
-
index_size=paragraph_count * AVG_PARAGRAPH_SIZE_BYTES,
|
190
|
-
)
|
191
|
-
|
192
|
-
if debug:
|
193
|
-
counters.shards = queried_shards
|
194
|
-
return counters
|
218
|
+
raise HTTPException(status_code=500, detail=f"Error while geting shard data")
|
219
|
+
counts.fields += shard.fields
|
220
|
+
counts.paragraphs += shard.paragraphs
|
221
|
+
counts.sentences += shard.sentences
|
222
|
+
return counts, queried_shards
|
@@ -39,7 +39,7 @@ DESCRIPTION = "Convenience endpoint that proxies requests to the Predict API. It
|
|
39
39
|
@api.get(
|
40
40
|
path=f"/{KB_PREFIX}/{{kbid}}/predict/{{endpoint}}",
|
41
41
|
status_code=200,
|
42
|
-
|
42
|
+
summary="Predict API Proxy",
|
43
43
|
description=DESCRIPTION,
|
44
44
|
response_model=None,
|
45
45
|
tags=["Search"],
|
@@ -47,7 +47,7 @@ DESCRIPTION = "Convenience endpoint that proxies requests to the Predict API. It
|
|
47
47
|
@api.post(
|
48
48
|
path=f"/{KB_PREFIX}/{{kbid}}/predict/{{endpoint}}",
|
49
49
|
status_code=200,
|
50
|
-
|
50
|
+
summary="Predict API Proxy",
|
51
51
|
description=DESCRIPTION,
|
52
52
|
response_model=None,
|
53
53
|
tags=["Search"],
|
@@ -17,146 +17,95 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from typing import Union
|
20
|
+
from typing import Optional, Union
|
21
21
|
|
22
|
-
from fastapi import
|
23
|
-
from fastapi.openapi.models import Example
|
22
|
+
from fastapi import Header, Request, Response
|
24
23
|
from fastapi_versioning import version
|
25
|
-
from
|
26
|
-
from nucliadb_protos.utils_pb2 import ExtractedText
|
24
|
+
from starlette.responses import StreamingResponse
|
27
25
|
|
28
|
-
from nucliadb.common
|
29
|
-
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
26
|
+
from nucliadb.common import datamanagers
|
30
27
|
from nucliadb.models.responses import HTTPClientError
|
31
|
-
from nucliadb.search import
|
32
|
-
from nucliadb.search.api.v1.router import KB_PREFIX, api
|
33
|
-
from nucliadb.search.predict import SendToPredictError
|
34
|
-
from nucliadb.search.search.exceptions import InvalidQueryError, ResourceNotFoundError
|
35
|
-
from nucliadb.search.utilities import get_predict
|
28
|
+
from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
|
36
29
|
from nucliadb_models.resource import NucliaDBRoles
|
37
|
-
from nucliadb_models.search import AskRequest,
|
38
|
-
from nucliadb_utils import const
|
30
|
+
from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
|
39
31
|
from nucliadb_utils.authentication import requires
|
40
|
-
from nucliadb_utils.exceptions import LimitsExceededError
|
41
|
-
from nucliadb_utils.utilities import get_storage, has_feature
|
42
32
|
|
43
|
-
|
44
|
-
"Ask a Resource": Example(
|
45
|
-
summary="Ask a question to the document",
|
46
|
-
description="Ask a question to the document. The whole document is sent as context to the generative AI",
|
47
|
-
value={
|
48
|
-
"question": "Does this document contain personal information?",
|
49
|
-
},
|
50
|
-
)
|
51
|
-
}
|
33
|
+
from ..ask import create_ask_response
|
52
34
|
|
53
35
|
|
54
36
|
@api.post(
|
55
37
|
f"/{KB_PREFIX}/{{kbid}}/resource/{{rid}}/ask",
|
56
38
|
status_code=200,
|
57
|
-
|
58
|
-
|
59
|
-
description="Ask to the complete content of the resource",
|
39
|
+
summary="Ask a resource (by id)",
|
40
|
+
description="Ask questions to a resource",
|
60
41
|
tags=["Search"],
|
61
|
-
response_model=
|
62
|
-
# TODO: set to True once feature is fully enabled
|
63
|
-
include_in_schema=False,
|
42
|
+
response_model=SyncAskResponse,
|
64
43
|
)
|
65
44
|
@requires(NucliaDBRoles.READER)
|
66
45
|
@version(1)
|
67
|
-
async def
|
46
|
+
async def resource_ask_endpoint_by_uuid(
|
68
47
|
request: Request,
|
69
|
-
response: Response,
|
70
48
|
kbid: str,
|
71
49
|
rid: str,
|
72
|
-
item: AskRequest
|
73
|
-
|
50
|
+
item: AskRequest,
|
51
|
+
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
52
|
+
x_nucliadb_user: str = Header(""),
|
53
|
+
x_forwarded_for: str = Header(""),
|
54
|
+
x_synchronous: bool = Header(
|
55
|
+
False,
|
56
|
+
description="When set to true, outputs response as JSON in a non-streaming way. "
|
57
|
+
"This is slower and requires waiting for entire answer to be ready.",
|
74
58
|
),
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
return HTTPClientError(status_code=exc.status_code, detail=exc.detail)
|
86
|
-
except SendToPredictError:
|
87
|
-
return HTTPClientError(status_code=503, detail="Ask service not available")
|
88
|
-
except InvalidQueryError as exc:
|
89
|
-
return HTTPClientError(status_code=412, detail=str(exc))
|
59
|
+
) -> Union[StreamingResponse, HTTPClientError, Response]:
|
60
|
+
return await create_ask_response(
|
61
|
+
kbid,
|
62
|
+
item,
|
63
|
+
x_nucliadb_user,
|
64
|
+
x_ndb_client,
|
65
|
+
x_forwarded_for,
|
66
|
+
x_synchronous,
|
67
|
+
resource=rid,
|
68
|
+
)
|
90
69
|
|
91
70
|
|
92
|
-
|
71
|
+
@api.post(
|
72
|
+
f"/{KB_PREFIX}/{{kbid}}/{RESOURCE_SLUG_PREFIX}/{{slug}}/ask",
|
73
|
+
status_code=200,
|
74
|
+
summary="Ask a resource (by slug)",
|
75
|
+
description="Ask questions to a resource",
|
76
|
+
tags=["Search"],
|
77
|
+
response_model=SyncAskResponse,
|
78
|
+
)
|
79
|
+
@requires(NucliaDBRoles.READER)
|
80
|
+
@version(1)
|
81
|
+
async def resource_ask_endpoint_by_slug(
|
82
|
+
request: Request,
|
93
83
|
kbid: str,
|
94
|
-
|
84
|
+
slug: str,
|
95
85
|
item: AskRequest,
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
if orm_resource is None:
|
118
|
-
raise ResourceNotFoundError()
|
119
|
-
|
120
|
-
for field_type, field_id in await orm_resource.get_fields_ids():
|
121
|
-
field_obj = await orm_resource.get_field(field_id, field_type, load=False)
|
122
|
-
etxt = await field_obj.get_extracted_text()
|
123
|
-
if etxt is None:
|
124
|
-
logger.warning(
|
125
|
-
f"Skipping field {field_id}, as it does not have extracted text yet!"
|
126
|
-
)
|
127
|
-
continue
|
128
|
-
|
129
|
-
fcm = await field_obj.get_field_metadata()
|
130
|
-
if fcm is None:
|
131
|
-
logger.warning(f"Field metadata not found for {field_id}")
|
132
|
-
blocks.append(get_field_blocks(etxt))
|
133
|
-
else:
|
134
|
-
blocks.append(get_field_blocks_split_by_paragraphs(etxt, fcm))
|
135
|
-
return blocks
|
136
|
-
|
137
|
-
|
138
|
-
def get_field_blocks_split_by_paragraphs(
|
139
|
-
etxt: ExtractedText, fcm: FieldComputedMetadata
|
140
|
-
) -> list[str]:
|
141
|
-
block = []
|
142
|
-
for paragraph in fcm.metadata.paragraphs:
|
143
|
-
block.append(etxt.text[paragraph.start : paragraph.end])
|
144
|
-
|
145
|
-
for split, metadata in fcm.split_metadata.items():
|
146
|
-
for split_paragraph in metadata.paragraphs:
|
147
|
-
split_text = etxt.split_text.get(split)
|
148
|
-
if split_text is None:
|
149
|
-
logger.warning(f"Split {split} not found in extracted text")
|
150
|
-
continue
|
151
|
-
block.append(split_text[split_paragraph.start : split_paragraph.end])
|
152
|
-
return block
|
86
|
+
x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
|
87
|
+
x_nucliadb_user: str = Header(""),
|
88
|
+
x_forwarded_for: str = Header(""),
|
89
|
+
x_synchronous: bool = Header(
|
90
|
+
False,
|
91
|
+
description="When set to true, outputs response as JSON in a non-streaming way. "
|
92
|
+
"This is slower and requires waiting for entire answer to be ready.",
|
93
|
+
),
|
94
|
+
) -> Union[StreamingResponse, HTTPClientError, Response]:
|
95
|
+
resource_id = await get_resource_uuid_by_slug(kbid, slug)
|
96
|
+
if resource_id is None:
|
97
|
+
return HTTPClientError(status_code=404, detail="Resource not found")
|
98
|
+
return await create_ask_response(
|
99
|
+
kbid,
|
100
|
+
item,
|
101
|
+
x_nucliadb_user,
|
102
|
+
x_ndb_client,
|
103
|
+
x_forwarded_for,
|
104
|
+
x_synchronous,
|
105
|
+
resource=resource_id,
|
106
|
+
)
|
153
107
|
|
154
108
|
|
155
|
-
def
|
156
|
-
|
157
|
-
|
158
|
-
blocks.append(etxt.text)
|
159
|
-
for split_etxt in etxt.split_text.values():
|
160
|
-
if split_etxt:
|
161
|
-
blocks.append(split_etxt)
|
162
|
-
return blocks
|
109
|
+
async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
|
110
|
+
async with datamanagers.with_ro_transaction() as txn:
|
111
|
+
return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)
|