nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -19,24 +19,13 @@
|
|
19
19
|
|
20
20
|
import asyncio
|
21
21
|
import json
|
22
|
-
from enum import Enum
|
22
|
+
from enum import Enum, auto
|
23
23
|
from typing import Any, Optional, Sequence, TypeVar, Union, overload
|
24
24
|
|
25
25
|
from fastapi import HTTPException
|
26
26
|
from google.protobuf.json_format import MessageToDict
|
27
27
|
from grpc import StatusCode as GrpcStatusCode
|
28
28
|
from grpc.aio import AioRpcError
|
29
|
-
from nucliadb_protos.nodereader_pb2 import (
|
30
|
-
ParagraphSearchRequest,
|
31
|
-
ParagraphSearchResponse,
|
32
|
-
RelationSearchRequest,
|
33
|
-
RelationSearchResponse,
|
34
|
-
SearchRequest,
|
35
|
-
SearchResponse,
|
36
|
-
SuggestRequest,
|
37
|
-
SuggestResponse,
|
38
|
-
)
|
39
|
-
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
40
29
|
|
41
30
|
from nucliadb.common.cluster import manager as cluster_manager
|
42
31
|
from nucliadb.common.cluster.base import AbstractIndexNode
|
@@ -44,66 +33,53 @@ from nucliadb.common.cluster.exceptions import ShardsNotFound
|
|
44
33
|
from nucliadb.common.cluster.utils import get_shard_manager
|
45
34
|
from nucliadb.search import logger
|
46
35
|
from nucliadb.search.search.shards import (
|
47
|
-
query_paragraph_shard,
|
48
36
|
query_shard,
|
49
|
-
relations_shard,
|
50
37
|
suggest_shard,
|
51
38
|
)
|
52
39
|
from nucliadb.search.settings import settings
|
40
|
+
from nucliadb_protos.nodereader_pb2 import (
|
41
|
+
SearchRequest,
|
42
|
+
SearchResponse,
|
43
|
+
SuggestRequest,
|
44
|
+
SuggestResponse,
|
45
|
+
)
|
46
|
+
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
53
47
|
from nucliadb_telemetry import errors
|
54
48
|
from nucliadb_utils import const
|
55
49
|
from nucliadb_utils.utilities import has_feature
|
56
50
|
|
57
51
|
|
58
52
|
class Method(Enum):
|
59
|
-
SEARCH =
|
60
|
-
|
61
|
-
SUGGEST = 3
|
62
|
-
RELATIONS = 4
|
53
|
+
SEARCH = auto()
|
54
|
+
SUGGEST = auto()
|
63
55
|
|
64
56
|
|
65
57
|
METHODS = {
|
66
58
|
Method.SEARCH: query_shard,
|
67
|
-
Method.PARAGRAPH: query_paragraph_shard,
|
68
59
|
Method.SUGGEST: suggest_shard,
|
69
|
-
Method.RELATIONS: relations_shard,
|
70
60
|
}
|
71
61
|
|
72
|
-
REQUEST_TYPE = Union[
|
73
|
-
SuggestRequest, ParagraphSearchRequest, SearchRequest, RelationSearchRequest
|
74
|
-
]
|
62
|
+
REQUEST_TYPE = Union[SuggestRequest, SearchRequest]
|
75
63
|
|
76
64
|
T = TypeVar(
|
77
65
|
"T",
|
78
66
|
SuggestResponse,
|
79
|
-
ParagraphSearchResponse,
|
80
67
|
SearchResponse,
|
81
|
-
RelationSearchResponse,
|
82
68
|
)
|
83
69
|
|
84
70
|
|
85
|
-
@overload
|
71
|
+
@overload
|
86
72
|
async def node_query(
|
87
73
|
kbid: str,
|
88
74
|
method: Method,
|
89
75
|
pb_query: SuggestRequest,
|
90
76
|
target_shard_replicas: Optional[list[str]] = None,
|
91
77
|
use_read_replica_nodes: bool = True,
|
78
|
+
timeout: Optional[float] = None,
|
79
|
+
retry_on_primary: bool = True,
|
92
80
|
) -> tuple[list[SuggestResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
|
93
81
|
|
94
82
|
|
95
|
-
@overload
|
96
|
-
async def node_query(
|
97
|
-
kbid: str,
|
98
|
-
method: Method,
|
99
|
-
pb_query: ParagraphSearchRequest,
|
100
|
-
target_shard_replicas: Optional[list[str]] = None,
|
101
|
-
use_read_replica_nodes: bool = True,
|
102
|
-
) -> tuple[
|
103
|
-
list[ParagraphSearchResponse], bool, list[tuple[AbstractIndexNode, str]]
|
104
|
-
]: ...
|
105
|
-
|
106
|
-
|
107
83
|
@overload
|
108
84
|
async def node_query(
|
109
85
|
kbid: str,
|
@@ -111,28 +87,21 @@ async def node_query(
|
|
111
87
|
pb_query: SearchRequest,
|
112
88
|
target_shard_replicas: Optional[list[str]] = None,
|
113
89
|
use_read_replica_nodes: bool = True,
|
90
|
+
timeout: Optional[float] = None,
|
91
|
+
retry_on_primary: bool = True,
|
114
92
|
) -> tuple[list[SearchResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
|
115
93
|
|
116
94
|
|
117
|
-
@overload
|
118
|
-
async def node_query(
|
119
|
-
kbid: str,
|
120
|
-
method: Method,
|
121
|
-
pb_query: RelationSearchRequest,
|
122
|
-
target_shard_replicas: Optional[list[str]] = None,
|
123
|
-
use_read_replica_nodes: bool = True,
|
124
|
-
) -> tuple[list[RelationSearchResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
|
125
|
-
|
126
|
-
|
127
95
|
async def node_query(
|
128
96
|
kbid: str,
|
129
97
|
method: Method,
|
130
98
|
pb_query: REQUEST_TYPE,
|
131
99
|
target_shard_replicas: Optional[list[str]] = None,
|
132
100
|
use_read_replica_nodes: bool = True,
|
133
|
-
|
134
|
-
|
135
|
-
]:
|
101
|
+
timeout: Optional[float] = None,
|
102
|
+
retry_on_primary: bool = True,
|
103
|
+
) -> tuple[Sequence[Union[T, BaseException]], bool, list[tuple[AbstractIndexNode, str]]]:
|
104
|
+
timeout = timeout or settings.search_timeout
|
136
105
|
use_read_replica_nodes = use_read_replica_nodes and has_feature(
|
137
106
|
const.Features.READ_REPLICA_SEARCHES, context={"kbid": kbid}
|
138
107
|
)
|
@@ -154,6 +123,7 @@ async def node_query(
|
|
154
123
|
try:
|
155
124
|
node, shard_id = cluster_manager.choose_node(
|
156
125
|
shard_obj,
|
126
|
+
use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid}),
|
157
127
|
use_read_replica_nodes=use_read_replica_nodes,
|
158
128
|
target_shard_replicas=target_shard_replicas,
|
159
129
|
)
|
@@ -177,7 +147,7 @@ async def node_query(
|
|
177
147
|
try:
|
178
148
|
results: list[Union[T, BaseException]] = await asyncio.wait_for(
|
179
149
|
asyncio.gather(*ops, return_exceptions=True),
|
180
|
-
timeout=
|
150
|
+
timeout=timeout,
|
181
151
|
)
|
182
152
|
except asyncio.TimeoutError as exc: # pragma: no cover
|
183
153
|
logger.warning(
|
@@ -201,6 +171,7 @@ async def node_query(
|
|
201
171
|
error.status_code >= 500
|
202
172
|
and use_read_replica_nodes
|
203
173
|
and any([node.is_read_replica() for node, _ in queried_nodes])
|
174
|
+
and retry_on_primary
|
204
175
|
):
|
205
176
|
# We had an error querying a secondary node, instead of raising an
|
206
177
|
# error directly, retry query to primaries and hope it works
|
@@ -231,9 +202,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
|
|
231
202
|
Handling of exception is responsibility of caller.
|
232
203
|
"""
|
233
204
|
if results is None or len(results) == 0:
|
234
|
-
return HTTPException(
|
235
|
-
status_code=500, detail=f"Error while executing shard queries. No results."
|
236
|
-
)
|
205
|
+
return HTTPException(status_code=500, detail=f"Error while executing shard queries. No results.")
|
237
206
|
|
238
207
|
for result in results:
|
239
208
|
if isinstance(result, Exception):
|
@@ -262,9 +231,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
|
|
262
231
|
return None
|
263
232
|
|
264
233
|
|
265
|
-
def debug_nodes_info(
|
266
|
-
nodes: list[tuple[AbstractIndexNode, str]]
|
267
|
-
) -> list[dict[str, str]]:
|
234
|
+
def debug_nodes_info(nodes: list[tuple[AbstractIndexNode, str]]) -> list[dict[str, str]]:
|
268
235
|
details: list[dict[str, str]] = []
|
269
236
|
for node, shard_id in nodes:
|
270
237
|
info = {
|
nucliadb/search/search/cache.py
CHANGED
@@ -18,42 +18,77 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
|
20
20
|
import asyncio
|
21
|
+
import contextlib
|
22
|
+
import logging
|
21
23
|
from contextvars import ContextVar
|
22
24
|
from typing import Optional
|
23
25
|
|
24
|
-
from lru import LRU
|
26
|
+
from lru import LRU
|
25
27
|
|
26
|
-
from nucliadb.common.
|
28
|
+
from nucliadb.common.ids import FieldId
|
29
|
+
from nucliadb.common.maindb.utils import get_driver
|
30
|
+
from nucliadb.ingest.fields.base import Field
|
27
31
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
|
28
32
|
from nucliadb.ingest.orm.resource import Resource as ResourceORM
|
29
|
-
from nucliadb.middleware.transaction import get_read_only_transaction
|
30
33
|
from nucliadb.search import SERVICE_NAME
|
34
|
+
from nucliadb_protos.utils_pb2 import ExtractedText
|
31
35
|
from nucliadb_telemetry import metrics
|
32
36
|
from nucliadb_utils.utilities import get_storage
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
)
|
38
|
+
logger = logging.getLogger(__name__)
|
39
|
+
|
40
|
+
rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar("rcache", default=None)
|
41
|
+
etcache: ContextVar[Optional["ExtractedTextCache"]] = ContextVar("etcache", default=None)
|
37
42
|
|
38
43
|
|
39
44
|
RESOURCE_LOCKS: dict[str, asyncio.Lock] = LRU(1000) # type: ignore
|
40
45
|
RESOURCE_CACHE_OPS = metrics.Counter("nucliadb_resource_cache_ops", labels={"type": ""})
|
46
|
+
EXTRACTED_CACHE_OPS = metrics.Counter("nucliadb_extracted_text_cache_ops", labels={"type": ""})
|
47
|
+
|
48
|
+
|
49
|
+
def set_extracted_text_cache() -> None:
|
50
|
+
value = ExtractedTextCache()
|
51
|
+
etcache.set(value)
|
52
|
+
|
53
|
+
|
54
|
+
def get_extracted_text_cache() -> Optional["ExtractedTextCache"]:
|
55
|
+
return etcache.get()
|
56
|
+
|
57
|
+
|
58
|
+
def clear_extracted_text_cache() -> None:
|
59
|
+
value = etcache.get()
|
60
|
+
if value is not None:
|
61
|
+
value.clear()
|
62
|
+
etcache.set(None)
|
63
|
+
|
41
64
|
|
65
|
+
def set_resource_cache() -> None:
|
66
|
+
value: dict[str, ResourceORM] = {}
|
67
|
+
rcache.set(value)
|
42
68
|
|
43
|
-
def get_resource_cache(clear: bool = False) -> dict[str, ResourceORM]:
|
44
|
-
value: Optional[dict[str, ResourceORM]] = rcache.get()
|
45
|
-
if value is None or clear:
|
46
|
-
value = {}
|
47
|
-
rcache.set(value)
|
48
|
-
return value
|
49
69
|
|
70
|
+
def get_resource_cache() -> Optional[dict[str, ResourceORM]]:
|
71
|
+
return rcache.get()
|
50
72
|
|
51
|
-
|
52
|
-
|
53
|
-
|
73
|
+
|
74
|
+
def clear_resource_cache() -> None:
|
75
|
+
value = rcache.get()
|
76
|
+
if value is not None:
|
77
|
+
value.clear()
|
78
|
+
rcache.set(None)
|
79
|
+
|
80
|
+
|
81
|
+
async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
82
|
+
"""
|
83
|
+
Will try to get the resource from the cache, if it's not there it will fetch it from the ORM and cache it.
|
84
|
+
"""
|
54
85
|
orm_resource: Optional[ResourceORM] = None
|
55
86
|
|
56
87
|
resource_cache = get_resource_cache()
|
88
|
+
if resource_cache is None:
|
89
|
+
RESOURCE_CACHE_OPS.inc({"type": "miss"})
|
90
|
+
logger.warning("Resource cache not set")
|
91
|
+
return await _orm_get_resource(kbid, uuid)
|
57
92
|
|
58
93
|
if uuid not in RESOURCE_LOCKS:
|
59
94
|
RESOURCE_LOCKS[uuid] = asyncio.Lock()
|
@@ -61,11 +96,7 @@ async def get_resource_from_cache(
|
|
61
96
|
async with RESOURCE_LOCKS[uuid]:
|
62
97
|
if uuid not in resource_cache:
|
63
98
|
RESOURCE_CACHE_OPS.inc({"type": "miss"})
|
64
|
-
|
65
|
-
txn = await get_read_only_transaction()
|
66
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
67
|
-
kb = KnowledgeBoxORM(txn, storage, kbid)
|
68
|
-
orm_resource = await kb.get(uuid)
|
99
|
+
orm_resource = await _orm_get_resource(kbid, uuid)
|
69
100
|
else:
|
70
101
|
RESOURCE_CACHE_OPS.inc({"type": "hit"})
|
71
102
|
|
@@ -75,3 +106,101 @@ async def get_resource_from_cache(
|
|
75
106
|
orm_resource = resource_cache.get(uuid)
|
76
107
|
|
77
108
|
return orm_resource
|
109
|
+
|
110
|
+
|
111
|
+
async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
    """
    Load a resource through KnowledgeBoxORM, bypassing the resource cache.

    The lookup runs inside a read-only transaction obtained from the driver
    and returns whatever `KnowledgeBoxORM.get` yields for `uuid`.
    """
    driver = get_driver()
    async with driver.transaction(read_only=True) as read_txn:
        knowledge_box = KnowledgeBoxORM(
            read_txn,
            await get_storage(service_name=SERVICE_NAME),
            kbid,
        )
        resource = await knowledge_box.get(uuid)
        return resource
|
116
|
+
|
117
|
+
|
118
|
+
class ExtractedTextCache:
    """
    Used to cache extracted text from a resource in memory during the process
    of search results hydration.

    This is needed to avoid fetching the same extracted text multiple times,
    as matching text blocks are processed in parallel and the extracted text is
    fetched for each field where the text block is found.

    Alongside the cached values, the cache hands out one asyncio.Lock per key
    so that callers can serialize the fetch of a given key.
    """

    def __init__(self) -> None:
        # One lock per cache key, created lazily by get_lock().
        self.locks: dict[str, asyncio.Lock] = {}
        # Cached extracted texts, indexed by cache key.
        self.values: dict[str, "ExtractedText"] = {}

    def get_value(self, key: str) -> Optional["ExtractedText"]:
        """Return the value cached under `key`, or None if there is none."""
        return self.values.get(key)

    def get_lock(self, key: str) -> asyncio.Lock:
        """
        Return the lock associated with `key`, creating it on first use.

        The same lock object is returned for every call with the same key
        until clear() is called.
        """
        lock = self.locks.get(key)
        if lock is None:
            # Only allocate a Lock when the key has none yet, instead of
            # building a throwaway Lock on every call. There is no await
            # between the lookup and the store, so tasks cannot interleave here.
            lock = self.locks[key] = asyncio.Lock()
        return lock

    def set_value(self, key: str, value: "ExtractedText") -> None:
        """Cache `value` under `key`, replacing any previous value."""
        self.values[key] = value

    def clear(self) -> None:
        """Forget every cached value and every per-key lock."""
        self.values.clear()
        self.locks.clear()
|
144
|
+
|
145
|
+
|
146
|
+
async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
    """
    Return the extracted text of `field`, going through the extracted text
    cache when one is set.

    Without a cache, a warning is logged, a "miss" is counted and the text is
    fetched directly from the field. With a cache, the value is looked up under
    the key "<kbid>/<uuid>/<field id>"; on a miss the fetch happens while
    holding that key's lock, and only non-None results are stored.
    """
    et_cache = get_extracted_text_cache()
    if et_cache is None:
        logger.warning("Extracted text cache not set")
        EXTRACTED_CACHE_OPS.inc({"type": "miss"})
        return await field.get_extracted_text()

    cache_key = f"{field.kbid}/{field.uuid}/{field.id}"

    # Fast path: no lock needed when the value is already there.
    cached = et_cache.get_value(cache_key)
    if cached is not None:
        EXTRACTED_CACHE_OPS.inc({"type": "hit"})
        return cached

    async with et_cache.get_lock(cache_key):
        # Another task may have filled the entry while we waited for the lock.
        cached = et_cache.get_value(cache_key)
        if cached is None:
            EXTRACTED_CACHE_OPS.inc({"type": "miss"})
            fetched = await field.get_extracted_text()
            if fetched is not None:
                # Missing extracted text is never cached.
                et_cache.set_value(cache_key, fetched)
            return fetched

        EXTRACTED_CACHE_OPS.inc({"type": "hit"})
        return cached
|
172
|
+
|
173
|
+
|
174
|
+
async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
    """
    Resolve `field` to its resource and field object, then return the field's
    extracted text via get_field_extracted_text().

    Returns None when get_resource() finds no resource for `field.rid`.
    """
    resource = await get_resource(kbid, field.rid)
    if resource is not None:
        target_field = await resource.get_field(
            key=field.key,
            type=field.pb_type,
            load=False,
        )
        return await get_field_extracted_text(target_field)
    return None
|
185
|
+
|
186
|
+
|
187
|
+
@contextlib.contextmanager
def request_caches():
    """
    This context manager sets the caches for extracted text and resources for a request.

    It should be used at the beginning of a request handler to avoid fetching the same
    resources and extracted text multiple times.

    Makes sure to clean the caches at the end of the context manager, whether the
    body finished normally or raised: both caches are emptied and replaced by None.

    Example (inside an async request handler, where `field_id` is a FieldId)::

        with request_caches():
            resource = await get_resource(kbid, uuid)
            extracted_text = await get_extracted_text_from_field_id(kbid, field_id)
    """
    set_resource_cache()
    set_extracted_text_cache()
    try:
        yield
    finally:
        clear_resource_cache()
        clear_extracted_text_cache()
|