nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,23 +18,14 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
|
20
20
|
import asyncio
|
21
|
-
|
22
|
-
from
|
21
|
+
import json
|
22
|
+
from enum import Enum, auto
|
23
|
+
from typing import Any, Optional, Sequence, TypeVar, Union, overload
|
23
24
|
|
24
25
|
from fastapi import HTTPException
|
26
|
+
from google.protobuf.json_format import MessageToDict
|
25
27
|
from grpc import StatusCode as GrpcStatusCode
|
26
28
|
from grpc.aio import AioRpcError
|
27
|
-
from nucliadb_protos.nodereader_pb2 import (
|
28
|
-
ParagraphSearchRequest,
|
29
|
-
ParagraphSearchResponse,
|
30
|
-
RelationSearchRequest,
|
31
|
-
RelationSearchResponse,
|
32
|
-
SearchRequest,
|
33
|
-
SearchResponse,
|
34
|
-
SuggestRequest,
|
35
|
-
SuggestResponse,
|
36
|
-
)
|
37
|
-
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
38
29
|
|
39
30
|
from nucliadb.common.cluster import manager as cluster_manager
|
40
31
|
from nucliadb.common.cluster.base import AbstractIndexNode
|
@@ -42,64 +33,51 @@ from nucliadb.common.cluster.exceptions import ShardsNotFound
|
|
42
33
|
from nucliadb.common.cluster.utils import get_shard_manager
|
43
34
|
from nucliadb.search import logger
|
44
35
|
from nucliadb.search.search.shards import (
|
45
|
-
query_paragraph_shard,
|
46
36
|
query_shard,
|
47
|
-
relations_shard,
|
48
37
|
suggest_shard,
|
49
38
|
)
|
50
39
|
from nucliadb.search.settings import settings
|
40
|
+
from nucliadb_protos.nodereader_pb2 import (
|
41
|
+
SearchRequest,
|
42
|
+
SearchResponse,
|
43
|
+
SuggestRequest,
|
44
|
+
SuggestResponse,
|
45
|
+
)
|
46
|
+
from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
|
51
47
|
from nucliadb_telemetry import errors
|
52
48
|
from nucliadb_utils import const
|
53
49
|
from nucliadb_utils.utilities import has_feature
|
54
50
|
|
55
51
|
|
56
52
|
class Method(Enum):
|
57
|
-
SEARCH =
|
58
|
-
|
59
|
-
SUGGEST = 3
|
60
|
-
RELATIONS = 4
|
53
|
+
SEARCH = auto()
|
54
|
+
SUGGEST = auto()
|
61
55
|
|
62
56
|
|
63
57
|
METHODS = {
|
64
58
|
Method.SEARCH: query_shard,
|
65
|
-
Method.PARAGRAPH: query_paragraph_shard,
|
66
59
|
Method.SUGGEST: suggest_shard,
|
67
|
-
Method.RELATIONS: relations_shard,
|
68
60
|
}
|
69
61
|
|
70
|
-
REQUEST_TYPE = Union[
|
71
|
-
SuggestRequest, ParagraphSearchRequest, SearchRequest, RelationSearchRequest
|
72
|
-
]
|
62
|
+
REQUEST_TYPE = Union[SuggestRequest, SearchRequest]
|
73
63
|
|
74
64
|
T = TypeVar(
|
75
65
|
"T",
|
76
66
|
SuggestResponse,
|
77
|
-
ParagraphSearchResponse,
|
78
67
|
SearchResponse,
|
79
|
-
RelationSearchResponse,
|
80
68
|
)
|
81
69
|
|
82
70
|
|
83
|
-
@overload # type: ignore
|
84
|
-
async def node_query(
|
85
|
-
kbid: str,
|
86
|
-
method: Method,
|
87
|
-
pb_query: SuggestRequest,
|
88
|
-
target_shard_replicas: Optional[list[str]] = None,
|
89
|
-
use_read_replica_nodes: bool = True,
|
90
|
-
) -> tuple[list[SuggestResponse], bool, list[tuple[AbstractIndexNode, str]]]:
|
91
|
-
...
|
92
|
-
|
93
|
-
|
94
71
|
@overload
|
95
72
|
async def node_query(
|
96
73
|
kbid: str,
|
97
74
|
method: Method,
|
98
|
-
pb_query:
|
75
|
+
pb_query: SuggestRequest,
|
99
76
|
target_shard_replicas: Optional[list[str]] = None,
|
100
77
|
use_read_replica_nodes: bool = True,
|
101
|
-
|
102
|
-
|
78
|
+
timeout: Optional[float] = None,
|
79
|
+
retry_on_primary: bool = True,
|
80
|
+
) -> tuple[list[SuggestResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
|
103
81
|
|
104
82
|
|
105
83
|
@overload
|
@@ -109,19 +87,9 @@ async def node_query(
|
|
109
87
|
pb_query: SearchRequest,
|
110
88
|
target_shard_replicas: Optional[list[str]] = None,
|
111
89
|
use_read_replica_nodes: bool = True,
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
@overload
|
117
|
-
async def node_query(
|
118
|
-
kbid: str,
|
119
|
-
method: Method,
|
120
|
-
pb_query: RelationSearchRequest,
|
121
|
-
target_shard_replicas: Optional[list[str]] = None,
|
122
|
-
use_read_replica_nodes: bool = True,
|
123
|
-
) -> tuple[list[RelationSearchResponse], bool, list[tuple[AbstractIndexNode, str]]]:
|
124
|
-
...
|
90
|
+
timeout: Optional[float] = None,
|
91
|
+
retry_on_primary: bool = True,
|
92
|
+
) -> tuple[list[SearchResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
|
125
93
|
|
126
94
|
|
127
95
|
async def node_query(
|
@@ -130,7 +98,10 @@ async def node_query(
|
|
130
98
|
pb_query: REQUEST_TYPE,
|
131
99
|
target_shard_replicas: Optional[list[str]] = None,
|
132
100
|
use_read_replica_nodes: bool = True,
|
133
|
-
|
101
|
+
timeout: Optional[float] = None,
|
102
|
+
retry_on_primary: bool = True,
|
103
|
+
) -> tuple[Sequence[Union[T, BaseException]], bool, list[tuple[AbstractIndexNode, str]]]:
|
104
|
+
timeout = timeout or settings.search_timeout
|
134
105
|
use_read_replica_nodes = use_read_replica_nodes and has_feature(
|
135
106
|
const.Features.READ_REPLICA_SEARCHES, context={"kbid": kbid}
|
136
107
|
)
|
@@ -152,6 +123,7 @@ async def node_query(
|
|
152
123
|
try:
|
153
124
|
node, shard_id = cluster_manager.choose_node(
|
154
125
|
shard_obj,
|
126
|
+
use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid}),
|
155
127
|
use_read_replica_nodes=use_read_replica_nodes,
|
156
128
|
target_shard_replicas=target_shard_replicas,
|
157
129
|
)
|
@@ -173,9 +145,9 @@ async def node_query(
|
|
173
145
|
)
|
174
146
|
|
175
147
|
try:
|
176
|
-
results = await asyncio.wait_for(
|
177
|
-
asyncio.gather(*ops, return_exceptions=True),
|
178
|
-
timeout=
|
148
|
+
results: list[Union[T, BaseException]] = await asyncio.wait_for(
|
149
|
+
asyncio.gather(*ops, return_exceptions=True),
|
150
|
+
timeout=timeout,
|
179
151
|
)
|
180
152
|
except asyncio.TimeoutError as exc: # pragma: no cover
|
181
153
|
logger.warning(
|
@@ -186,10 +158,20 @@ async def node_query(
|
|
186
158
|
|
187
159
|
error = validate_node_query_results(results or [])
|
188
160
|
if error is not None:
|
161
|
+
query_dict = MessageToDict(pb_query)
|
162
|
+
query_dict.pop("vector", None)
|
163
|
+
logger.error(
|
164
|
+
"Error while querying nodes",
|
165
|
+
extra={
|
166
|
+
"kbid": kbid,
|
167
|
+
"query": json.dumps(query_dict),
|
168
|
+
},
|
169
|
+
)
|
189
170
|
if (
|
190
171
|
error.status_code >= 500
|
191
172
|
and use_read_replica_nodes
|
192
173
|
and any([node.is_read_replica() for node, _ in queried_nodes])
|
174
|
+
and retry_on_primary
|
193
175
|
):
|
194
176
|
# We had an error querying a secondary node, instead of raising an
|
195
177
|
# error directly, retry query to primaries and hope it works
|
@@ -220,9 +202,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
|
|
220
202
|
Handling of exception is responsibility of caller.
|
221
203
|
"""
|
222
204
|
if results is None or len(results) == 0:
|
223
|
-
return HTTPException(
|
224
|
-
status_code=500, detail=f"Error while executing shard queries. No results."
|
225
|
-
)
|
205
|
+
return HTTPException(status_code=500, detail=f"Error while executing shard queries. No results.")
|
226
206
|
|
227
207
|
for result in results:
|
228
208
|
if isinstance(result, Exception):
|
@@ -251,9 +231,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
|
|
251
231
|
return None
|
252
232
|
|
253
233
|
|
254
|
-
def debug_nodes_info(
|
255
|
-
nodes: list[tuple[AbstractIndexNode, str]]
|
256
|
-
) -> list[dict[str, str]]:
|
234
|
+
def debug_nodes_info(nodes: list[tuple[AbstractIndexNode, str]]) -> list[dict[str, str]]:
|
257
235
|
details: list[dict[str, str]] = []
|
258
236
|
for node, shard_id in nodes:
|
259
237
|
info = {
|
nucliadb/search/search/cache.py
CHANGED
@@ -18,42 +18,77 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
|
20
20
|
import asyncio
|
21
|
+
import contextlib
|
22
|
+
import logging
|
21
23
|
from contextvars import ContextVar
|
22
24
|
from typing import Optional
|
23
25
|
|
24
|
-
from lru import LRU
|
26
|
+
from lru import LRU
|
25
27
|
|
26
|
-
from nucliadb.common.
|
28
|
+
from nucliadb.common.ids import FieldId
|
29
|
+
from nucliadb.common.maindb.utils import get_driver
|
30
|
+
from nucliadb.ingest.fields.base import Field
|
27
31
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
|
28
32
|
from nucliadb.ingest.orm.resource import Resource as ResourceORM
|
29
|
-
from nucliadb.middleware.transaction import get_read_only_transaction
|
30
33
|
from nucliadb.search import SERVICE_NAME
|
34
|
+
from nucliadb_protos.utils_pb2 import ExtractedText
|
31
35
|
from nucliadb_telemetry import metrics
|
32
36
|
from nucliadb_utils.utilities import get_storage
|
33
37
|
|
34
|
-
|
35
|
-
|
36
|
-
)
|
38
|
+
logger = logging.getLogger(__name__)
|
39
|
+
|
40
|
+
rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar("rcache", default=None)
|
41
|
+
etcache: ContextVar[Optional["ExtractedTextCache"]] = ContextVar("etcache", default=None)
|
37
42
|
|
38
43
|
|
39
44
|
RESOURCE_LOCKS: dict[str, asyncio.Lock] = LRU(1000) # type: ignore
|
40
45
|
RESOURCE_CACHE_OPS = metrics.Counter("nucliadb_resource_cache_ops", labels={"type": ""})
|
46
|
+
EXTRACTED_CACHE_OPS = metrics.Counter("nucliadb_extracted_text_cache_ops", labels={"type": ""})
|
47
|
+
|
48
|
+
|
49
|
+
def set_extracted_text_cache() -> None:
|
50
|
+
value = ExtractedTextCache()
|
51
|
+
etcache.set(value)
|
52
|
+
|
53
|
+
|
54
|
+
def get_extracted_text_cache() -> Optional["ExtractedTextCache"]:
|
55
|
+
return etcache.get()
|
56
|
+
|
57
|
+
|
58
|
+
def clear_extracted_text_cache() -> None:
|
59
|
+
value = etcache.get()
|
60
|
+
if value is not None:
|
61
|
+
value.clear()
|
62
|
+
etcache.set(None)
|
63
|
+
|
41
64
|
|
65
|
+
def set_resource_cache() -> None:
|
66
|
+
value: dict[str, ResourceORM] = {}
|
67
|
+
rcache.set(value)
|
42
68
|
|
43
|
-
def get_resource_cache(clear: bool = False) -> dict[str, ResourceORM]:
|
44
|
-
value: Optional[dict[str, ResourceORM]] = rcache.get()
|
45
|
-
if value is None or clear:
|
46
|
-
value = {}
|
47
|
-
rcache.set(value)
|
48
|
-
return value
|
49
69
|
|
70
|
+
def get_resource_cache() -> Optional[dict[str, ResourceORM]]:
    """Return the value currently held by ``rcache`` (``None`` is its declared default)."""
    current = rcache.get()
    return current
|
50
72
|
|
51
|
-
|
52
|
-
|
53
|
-
|
73
|
+
|
74
|
+
def clear_resource_cache() -> None:
    """Empty the resource dict held by ``rcache`` (if any) and set ``rcache`` to ``None``."""
    cached_resources = rcache.get()
    if cached_resources is not None:
        # Release the cached entries before detaching the dict.
        cached_resources.clear()
    rcache.set(None)
|
79
|
+
|
80
|
+
|
81
|
+
async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
|
82
|
+
"""
|
83
|
+
Will try to get the resource from the cache, if it's not there it will fetch it from the ORM and cache it.
|
84
|
+
"""
|
54
85
|
orm_resource: Optional[ResourceORM] = None
|
55
86
|
|
56
87
|
resource_cache = get_resource_cache()
|
88
|
+
if resource_cache is None:
|
89
|
+
RESOURCE_CACHE_OPS.inc({"type": "miss"})
|
90
|
+
logger.warning("Resource cache not set")
|
91
|
+
return await _orm_get_resource(kbid, uuid)
|
57
92
|
|
58
93
|
if uuid not in RESOURCE_LOCKS:
|
59
94
|
RESOURCE_LOCKS[uuid] = asyncio.Lock()
|
@@ -61,11 +96,7 @@ async def get_resource_from_cache(
|
|
61
96
|
async with RESOURCE_LOCKS[uuid]:
|
62
97
|
if uuid not in resource_cache:
|
63
98
|
RESOURCE_CACHE_OPS.inc({"type": "miss"})
|
64
|
-
|
65
|
-
txn = await get_read_only_transaction()
|
66
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
67
|
-
kb = KnowledgeBoxORM(txn, storage, kbid)
|
68
|
-
orm_resource = await kb.get(uuid)
|
99
|
+
orm_resource = await _orm_get_resource(kbid, uuid)
|
69
100
|
else:
|
70
101
|
RESOURCE_CACHE_OPS.inc({"type": "hit"})
|
71
102
|
|
@@ -75,3 +106,101 @@ async def get_resource_from_cache(
|
|
75
106
|
orm_resource = resource_cache.get(uuid)
|
76
107
|
|
77
108
|
return orm_resource
|
109
|
+
|
110
|
+
|
111
|
+
async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
    """
    Fetch resource `uuid` of knowledge box `kbid` through the ORM, using a
    read-only transaction. Does not consult or populate the resource cache.
    """
    driver = get_driver()
    async with driver.transaction(read_only=True) as txn:
        storage = await get_storage(service_name=SERVICE_NAME)
        knowledgebox = KnowledgeBoxORM(txn, storage, kbid)
        resource = await knowledgebox.get(uuid)
        return resource
|
116
|
+
|
117
|
+
|
118
|
+
class ExtractedTextCache:
    """
    Used to cache extracted text from a resource in memory during the process
    of search results hydration.

    This is needed to avoid fetching the same extracted text multiple times,
    as matching text blocks are processed in parallel and the extracted text is
    fetched for each field where the text block is found.

    Values and locks are both indexed by the same caller-provided string key.
    """

    def __init__(self) -> None:
        # One lock per key, created lazily by `get_lock`.
        self.locks: dict[str, asyncio.Lock] = {}
        # Extracted texts stored through `set_value`.
        self.values: dict[str, "ExtractedText"] = {}

    def get_value(self, key: str) -> Optional["ExtractedText"]:
        """Return the extracted text stored under `key`, or None if there is none."""
        return self.values.get(key)

    def get_lock(self, key: str) -> asyncio.Lock:
        """
        Return the lock associated with `key`, creating it on first use.

        The same lock object is returned for a given key until `clear` is called.
        """
        lock = self.locks.get(key)
        if lock is None:
            # Allocate only when the key has no lock yet; `setdefault(key, asyncio.Lock())`
            # would build (and discard) a new Lock on every call.
            lock = self.locks[key] = asyncio.Lock()
        return lock

    def set_value(self, key: str, value: "ExtractedText") -> None:
        """Store `value` under `key`, replacing any previous value."""
        self.values[key] = value

    def clear(self) -> None:
        """Drop every stored value and every lock."""
        self.values.clear()
        self.locks.clear()
|
144
|
+
|
145
|
+
|
146
|
+
async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
    """
    Return the extracted text of `field`, going through the extracted text
    cache when one is set.

    Without a cache, a warning is logged and the text is fetched directly from
    the field. With a cache, the value is looked up under
    "<kbid>/<uuid>/<field id>"; on a miss it is fetched while holding the
    per-key lock and stored only if it is not None.
    """
    text_cache = get_extracted_text_cache()
    if text_cache is None:
        logger.warning("Extracted text cache not set")
        EXTRACTED_CACHE_OPS.inc({"type": "miss"})
        return await field.get_extracted_text()

    cache_key = f"{field.kbid}/{field.uuid}/{field.id}"

    # Fast path: value already cached, no lock needed.
    cached = text_cache.get_value(cache_key)
    if cached is not None:
        EXTRACTED_CACHE_OPS.inc({"type": "hit"})
        return cached

    async with text_cache.get_lock(cache_key):
        # Look again: another task may have stored it while we waited for the lock.
        cached = text_cache.get_value(cache_key)
        if cached is None:
            EXTRACTED_CACHE_OPS.inc({"type": "miss"})
            fetched = await field.get_extracted_text()
            if fetched is not None:
                # A missing extracted text is never cached.
                text_cache.set_value(cache_key, fetched)
            return fetched

        EXTRACTED_CACHE_OPS.inc({"type": "hit"})
        return cached
|
172
|
+
|
173
|
+
|
174
|
+
async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
    """
    Resolve `field` to its field object on the owning resource and return its
    extracted text, or None when the resource cannot be found.
    """
    resource = await get_resource(kbid, field.rid)
    if resource is None:
        return None

    # load=False is passed through to Resource.get_field unchanged.
    resource_field = await resource.get_field(key=field.key, type=field.pb_type, load=False)
    return await get_field_extracted_text(resource_field)
|
185
|
+
|
186
|
+
|
187
|
+
@contextlib.contextmanager
def request_caches():
    """
    This context manager sets the caches for extracted text and resources for a request.

    It should be used at the beginning of a request handler to avoid fetching the same
    resources and extracted text multiple times.

    Makes sure to clean the caches at the end of the context manager, even if
    the wrapped code raises.

    Example (inside an async request handler)::

        with request_caches():
            resource = await get_resource(kbid, uuid)
            extracted_text = await get_extracted_text_from_field_id(kbid, field)
    """
    set_resource_cache()
    set_extracted_text_cache()
    try:
        yield
    finally:
        # Both caches are emptied and their context variables set back to None.
        clear_resource_cache()
        clear_extracted_text_cache()
|