nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -1,331 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
from unittest.mock import AsyncMock, MagicMock, Mock, patch
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
from nucliadb_protos.resources_pb2 import (
|
24
|
-
AllFieldIDs,
|
25
|
-
Basic,
|
26
|
-
CloudFile,
|
27
|
-
FieldID,
|
28
|
-
FieldText,
|
29
|
-
FieldType,
|
30
|
-
FileExtractedData,
|
31
|
-
PagePositions,
|
32
|
-
)
|
33
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
34
|
-
|
35
|
-
from nucliadb.ingest.orm.resource import (
|
36
|
-
Resource,
|
37
|
-
get_file_page_positions,
|
38
|
-
get_text_field_mimetype,
|
39
|
-
maybe_update_basic_icon,
|
40
|
-
maybe_update_basic_summary,
|
41
|
-
maybe_update_basic_thumbnail,
|
42
|
-
update_basic_languages,
|
43
|
-
)
|
44
|
-
from nucliadb_protos import utils_pb2, writer_pb2
|
45
|
-
|
46
|
-
|
47
|
-
@pytest.mark.asyncio
async def test_get_file_page_positions():
    """Page previews of a file field come back as ``{page_index: (start, end)}``."""
    page_ranges = [(0, 10), (11, 20)]

    # Build the extracted data one page at a time.
    fed = FileExtractedData()
    for start, end in page_ranges:
        fed.file_pages_previews.positions.append(PagePositions(start=start, end=end))

    # The field only needs to answer ``get_file_extracted_data``.
    field = AsyncMock()
    field.get_file_extracted_data = AsyncMock(return_value=fed)

    positions = await get_file_page_positions(field)
    assert positions == dict(enumerate(page_ranges))
|
57
|
-
|
58
|
-
|
59
|
-
@pytest.mark.parametrize(
    "basic,summary,updated",
    [
        (Basic(), "new_summary", True),
        (Basic(summary="summary"), "new_summary", False),
        (Basic(summary="summary"), "", False),
    ],
)
def test_maybe_update_basic_summary(basic, summary, updated):
    """The summary is written only in the cases flagged as ``updated``."""
    was_updated = maybe_update_basic_summary(basic, summary)
    assert was_updated is updated

    # The stored summary matches the candidate exactly when an update happened.
    summary_matches = basic.summary == summary
    assert summary_matches is updated
|
73
|
-
|
74
|
-
|
75
|
-
def test_update_basic_languages():
    """Walk ``update_basic_languages`` through a sequence of calls on one ``Basic``.

    Each scenario lists the language batches to apply, the return value
    expected from every call in that scenario, and the ``languages`` list
    expected once the scenario is done. The main language stays ``"en"``
    throughout.
    """
    basic = Basic()
    scenarios = (
        # Languages are updated the first time
        ((["en"],), True, ["en"]),
        # Languages are not updated
        ((["en"],), False, ["en"]),
        # Main language is not updated but new language is added
        ((["de"],), True, ["en", "de"]),
        # Null values
        (([""], [None]), False, ["en", "de"]),
    )
    for batches, expected_result, expected_languages in scenarios:
        for batch in batches:
            assert update_basic_languages(basic, batch) is expected_result  # type: ignore
        assert basic.metadata.language == "en"
        assert basic.metadata.languages == expected_languages
|
97
|
-
|
98
|
-
|
99
|
-
@pytest.mark.parametrize(
    "basic,thumbnail,updated",
    [
        (Basic(), CloudFile(uri="new_thumbnail_url"), True),
        (
            Basic(thumbnail="old_thumbnail_url"),
            CloudFile(uri="new_thumbnail_url"),
            False,
        ),
        (Basic(thumbnail="old_thumbnail_url"), None, False),
    ],
)
def test_maybe_update_basic_thumbnail(basic, thumbnail, updated):
    """The thumbnail is taken from the cloud file only in the ``updated`` cases."""
    outcome = maybe_update_basic_thumbnail(basic, thumbnail)
    assert outcome == updated

    # On update the new URI is stored; otherwise the previous one is kept.
    expected_thumbnail = thumbnail.uri if updated else "old_thumbnail_url"
    assert basic.thumbnail == expected_thumbnail
|
117
|
-
|
118
|
-
|
119
|
-
@pytest.mark.parametrize(
    "text_format,mimetype",
    [
        (None, None),
        (FieldText.Format.PLAIN, "text/plain"),
        (FieldText.Format.HTML, "text/html"),
        (FieldText.Format.RST, "text/x-rst"),
        (FieldText.Format.MARKDOWN, "text/markdown"),
        (FieldText.Format.KEEP_MARKDOWN, "text/markdown"),
    ],
)
def test_get_text_field_mimetype(text_format, mimetype):
    """Each text format maps to its mimetype; a message without texts gives ``None``."""
    bm = BrokerMessage()
    if text_format is not None:
        # Accessing the map entry creates the text field in the message.
        text_field = bm.texts["foo"]
        text_field.body = "foo"
        text_field.format = text_format

    detected = get_text_field_mimetype(bm)
    assert detected == mimetype
|
136
|
-
|
137
|
-
|
138
|
-
@pytest.mark.parametrize(
    "basic,icon,updated",
    [
        (Basic(), None, False),
        (Basic(icon="text/plain"), "text/html", False),
        (Basic(), "text/html", True),
        (Basic(icon=""), "text/html", True),
        (Basic(icon="application/octet-stream"), "text/html", True),
    ],
)
def test_maybe_update_basic_icon(basic, icon, updated):
    """The icon is replaced only in the cases flagged as ``updated``."""
    outcome = maybe_update_basic_icon(basic, icon)
    assert outcome == updated

    if not updated:
        # Nothing further is checked when no update is expected.
        return
    assert basic.icon == icon
|
152
|
-
|
153
|
-
|
154
|
-
class Transaction:
    """In-memory stand-in for a key-value transaction, backed by a plain dict."""

    def __init__(self):
        # Everything written through ``set`` lives here.
        self.kv = dict()

    async def get(self, key):
        """Return the value stored under ``key``, or ``None`` when it is absent."""
        if key in self.kv:
            return self.kv[key]
        return None

    async def set(self, key, value):
        """Store ``value`` under ``key``, replacing any previous value."""
        self.kv.update({key: value})
|
163
|
-
|
164
|
-
|
165
|
-
@pytest.fixture(scope="function")
def txn():
    """Provide a fresh in-memory ``Transaction`` for each test."""
    fake_txn = Transaction()
    return fake_txn
|
168
|
-
|
169
|
-
|
170
|
-
@pytest.fixture(scope="function")
def storage():
    """Provide an ``AsyncMock`` standing in for the storage layer."""
    return AsyncMock()
|
174
|
-
|
175
|
-
|
176
|
-
@pytest.fixture(scope="function")
def kb():
    """Provide an ``AsyncMock`` knowledge box whose ``kbid`` is ``"mock-kbid"``."""
    return AsyncMock(kbid="mock-kbid")
|
181
|
-
|
182
|
-
|
183
|
-
async def test_get_fields_ids_caches_keys(txn, storage, kb):
    """``get_fields_ids`` serves cached keys and only refetches when forced."""
    cached = [(0, "foo"), (1, "bar")]
    fresh = [(2, "baz")]
    inner_fetch = AsyncMock(return_value=fresh)

    res = Resource(txn, storage, kb, "rid")
    res._inner_get_fields_ids = inner_fetch  # type: ignore
    res.all_fields_keys = cached

    # Cached keys are returned without touching the inner fetch.
    served = await res.get_fields_ids()
    assert served == cached
    inner_fetch.assert_not_awaited()

    # Forcing goes through the inner fetch and refreshes the cache.
    served = await res.get_fields_ids(force=True)
    assert served == fresh
    inner_fetch.assert_awaited_once()
    assert res.all_fields_keys == fresh

    # An empty list still counts as a cached value:
    # the inner fetch must not be awaited for it.
    res.all_fields_keys = []
    inner_fetch.reset_mock()
    served = await res.get_fields_ids()
    assert served == []
    inner_fetch.assert_not_awaited()
|
203
|
-
|
204
|
-
|
205
|
-
async def test_get_set_all_field_ids(txn, storage, kb):
    """What ``set_all_field_ids`` stores is what ``get_all_field_ids`` returns."""
    res = Resource(txn, storage, kb, "rid")

    # Nothing has been stored yet.
    before = await res.get_all_field_ids()
    assert before is None

    expected = AllFieldIDs(
        fields=[FieldID(field_type=FieldType.TEXT, field="text")],
    )
    await res.set_all_field_ids(expected)

    after = await res.get_all_field_ids()
    assert after == expected
|
216
|
-
|
217
|
-
|
218
|
-
async def test_update_all_fields_key(txn, storage, kb):
    """``update_all_field_ids`` adds and removes field ids incrementally."""
    res = Resource(txn, storage, kb, "rid")

    # An update with nothing in it leaves an empty AllFieldIDs behind.
    await res.update_all_field_ids(updated=[], deleted=[])
    stored = await res.get_all_field_ids()
    assert stored == AllFieldIDs()

    text_fields = AllFieldIDs(
        fields=[
            FieldID(field_type=FieldType.TEXT, field="text1"),
            FieldID(field_type=FieldType.TEXT, field="text2"),
        ],
    )
    await res.update_all_field_ids(updated=text_fields.fields)

    # Both text fields are now recorded.
    stored = await res.get_all_field_ids()
    assert stored == text_fields

    # A later update appends the file field after the existing ones.
    file_field = FieldID(field_type=FieldType.FILE, field="file")
    await res.update_all_field_ids(updated=[file_field])
    stored = await res.get_all_field_ids()
    assert list(stored.fields) == [*text_fields.fields, file_field]

    # Deleting the file field brings the stored value back to the text fields.
    await res.update_all_field_ids(deleted=[file_field])
    stored = await res.get_all_field_ids()
    assert stored == text_fields
|
245
|
-
|
246
|
-
|
247
|
-
async def test_apply_fields_calls_update_all_field_ids(txn, storage, kb):
    """``apply_fields`` reports the fields it touched to ``update_all_field_ids``.

    The broker message is a ``MagicMock`` carrying one entry per field
    container plus one field to delete. The test checks that
    ``update_all_field_ids`` is awaited exactly once and inspects the
    ``updated`` and ``deleted`` keyword arguments of that call.
    """
    resource = Resource(txn, storage, kb, "rid")
    resource.update_all_field_ids = AsyncMock()  # type: ignore
    resource.set_field = AsyncMock()  # type: ignore
    # NOTE(review): apply_fields is not visible from this file. delete_field is
    # stubbed on the assumption that it is what processes ``delete_fields`` now
    # that the list below is real -- confirm against the implementation.
    resource.delete_field = AsyncMock()  # type: ignore

    to_delete = FieldID(field_type=FieldType.LAYOUT, field="to_delete")

    bm = MagicMock()
    bm.layouts = {"layout": MagicMock()}
    bm.texts = {"text": MagicMock()}
    bm.keywordsets = {"keywordset": MagicMock()}
    bm.datetimes = {"datetime": MagicMock()}
    bm.links = {"link": MagicMock()}
    bm.files = {"file": MagicMock()}
    bm.conversations = {"conversation": MagicMock()}
    # Use a real list: calling ``.append`` on an auto-created MagicMock
    # attribute only records the call and stores nothing, so the deletion
    # could never show up in the arguments checked below.
    bm.delete_fields = [to_delete]

    await resource.apply_fields(bm)

    resource.update_all_field_ids.assert_awaited_once()

    call_kwargs = resource.update_all_field_ids.call_args.kwargs
    # These comparisons were previously bare expressions (no ``assert``), so
    # they never verified anything.
    assert list(call_kwargs["updated"]) == [
        FieldID(field_type=FieldType.LAYOUT, field="layout"),
        FieldID(field_type=FieldType.TEXT, field="text"),
        FieldID(field_type=FieldType.KEYWORDSET, field="keywordset"),
        FieldID(field_type=FieldType.DATETIME, field="datetime"),
        FieldID(field_type=FieldType.LINK, field="link"),
        FieldID(field_type=FieldType.FILE, field="file"),
        FieldID(field_type=FieldType.CONVERSATION, field="conversation"),
    ]
    assert list(call_kwargs["deleted"]) == [to_delete]
|
278
|
-
|
279
|
-
|
280
|
-
async def test_apply_extracted_vectors_matryoshka_embeddings(txn, storage, kb):
    """
    _apply_extracted_vectors must hand the knowledge box matryoshka dimension
    (or None when there is none) to the indexer as the
    ``matryoshka_vector_dimension`` keyword argument.
    """
    STORED_VECTOR_DIMENSION = 100
    MATRYOSHKA_DIMENSION = 10

    # A single stored vector wrapped in the protobuf containers the
    # resource works with.
    stored_vector = utils_pb2.Vector(
        start=0,
        end=10,
        start_paragraph=0,
        end_paragraph=10,
        vector=[1.0] * STORED_VECTOR_DIMENSION,
    )
    vectors = utils_pb2.VectorObject(
        vectors=utils_pb2.Vectors(vectors=[stored_vector])
    )

    mock_field = AsyncMock()
    mock_field.set_vectors.return_value = (vectors, False, [])

    resource = Resource(txn, storage, kb, "matryoshka-rid")
    with (
        patch.object(resource, "has_field", Mock(return_value=True)),
        patch.object(resource, "get_field", AsyncMock(return_value=mock_field)),
        patch.object(resource, "generate_field_id", Mock(return_value="field_id")),
        patch("nucliadb.ingest.orm.resource.datamanagers") as mock_datamanagers,
        patch.object(
            resource.indexer, "apply_field_vectors", AsyncMock()
        ) as apply_field_vectors,
    ):
        # First pass: no matryoshka dimension configured; second pass: one is.
        for expected_calls, configured in enumerate(
            (None, MATRYOSHKA_DIMENSION), start=1
        ):
            mock_datamanagers.kb.get_matryoshka_vector_dimension = AsyncMock(
                return_value=configured
            )
            await resource._apply_extracted_vectors(
                writer_pb2.ExtractedVectorsWrapper(vectors=vectors)
            )
            assert apply_field_vectors.call_count == expected_calls

            forwarded = apply_field_vectors.call_args.kwargs[
                "matryoshka_vector_dimension"
            ]
            if configured is None:
                assert forwarded is None
            else:
                assert forwarded == MATRYOSHKA_DIMENSION
|
@@ -1,31 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
from nucliadb.common.cluster.index_node import READ_CONNECTIONS, WRITE_CONNECTIONS
|
21
|
-
from nucliadb.ingest.cache import clear_ingest_cache
|
22
|
-
|
23
|
-
|
24
|
-
def test_clear_ingest_cache():
    """clear_ingest_cache must leave both node connection caches empty."""
    # Seed each cache with one entry so that emptiness afterwards is meaningful.
    READ_CONNECTIONS["addr1"] = "conn1"
    WRITE_CONNECTIONS["addr2"] = "conn2"

    clear_ingest_cache()

    for connections in (READ_CONNECTIONS, WRITE_CONNECTIONS):
        assert len(connections) == 0
|
@@ -1,40 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import json
|
21
|
-
import os
|
22
|
-
|
23
|
-
import pytest
|
24
|
-
|
25
|
-
from nucliadb.ingest.partitions import assign_partitions
|
26
|
-
|
27
|
-
|
28
|
-
@pytest.mark.asyncio
async def test_assign_partitions(partition_settings):
    """
    assign_partitions must store the expected partition list on the settings
    object and publish it, JSON-encoded, in the PARTITIONS environment variable.
    """
    # Expected partitions: start at this replica's number, step by the total
    # number of replicas, and report them 1-based as strings.
    # (assumes total_replicas is positive, as the stepping loop requires)
    expected_partition_list = [
        str(index + 1)
        for index in range(
            partition_settings.replica_number,
            partition_settings.nuclia_partitions,
            partition_settings.total_replicas,
        )
    ]

    assign_partitions(partition_settings)

    assert partition_settings.partitions == expected_partition_list
    assert os.environ["PARTITIONS"] == json.dumps(expected_partition_list)
|
@@ -1,171 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
from unittest.mock import Mock
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
24
|
-
|
25
|
-
from nucliadb.ingest.processing import (
|
26
|
-
DummyProcessingEngine,
|
27
|
-
ProcessingEngine,
|
28
|
-
PushPayload,
|
29
|
-
)
|
30
|
-
from nucliadb.tests.utils.aiohttp_session import get_mocked_session
|
31
|
-
from nucliadb_models import File, FileField
|
32
|
-
from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
|
33
|
-
|
34
|
-
# Shared inputs for the processing engine tests below.

# File field with a password and an empty payload.
TEST_FILE = FileField(
    password="mypassword",
    file=File(filename="myfile.pdf", payload=""),
)

# Cloud file descriptor with a LOCAL source.
TEST_CLOUD_FILE = CloudFile(
    uri="file.png",
    source=CloudFile.Source.LOCAL,
    bucket_name="/integration/ingest/assets",
    size=4,
    content_type="image/png",
    filename="file.png",
)

# Minimal payload passed to send_to_process.
TEST_ITEM = PushPayload(
    uuid="foo",
    kbid="bar",
    userid="baz",
    partition=1,
)
|
48
|
-
|
49
|
-
|
50
|
-
@pytest.mark.asyncio
async def test_dummy_processing_engine():
    """Smoke test: every DummyProcessingEngine entry point runs without raising."""
    dummy = DummyProcessingEngine()

    # Lifecycle hooks.
    await dummy.initialize()
    await dummy.finalize()

    # Conversion helpers (the external variant is called synchronously).
    await dummy.convert_filefield_to_str(None)
    dummy.convert_external_filefield_to_str(None)
    await dummy.convert_internal_filefield_to_str(None, None)
    await dummy.convert_internal_cf_to_str(None, None)

    # Sending a payload.
    await dummy.send_to_process(Mock(kbid="foo"), 1)
|
60
|
-
|
61
|
-
|
62
|
-
@pytest.fixture(scope="function")
def engine():
    """Provide a fresh on-prem ProcessingEngine with placeholder URLs per test."""
    # No teardown is needed, so the engine is returned directly.
    return ProcessingEngine(
        onprem=True,
        nuclia_processing_cluster_url="cluster_url",
        nuclia_public_url="public_url",
    )
|
70
|
-
|
71
|
-
|
72
|
-
async def test_convert_filefield_to_str_200(engine):
    """With a mocked 200 response whose text is "jwt", the call returns "jwt"."""
    engine.session = get_mocked_session("POST", 200, text="jwt")

    token = await engine.convert_filefield_to_str(TEST_FILE)

    assert token == "jwt"
|
76
|
-
|
77
|
-
|
78
|
-
async def test_convert_filefield_to_str_402(engine):
    """A mocked 402 response raises LimitsExceededError carrying status 402."""
    error_body = {"detail": "limits exceeded"}
    engine.session = get_mocked_session("POST", 402, json=error_body)

    with pytest.raises(LimitsExceededError) as raised:
        await engine.convert_filefield_to_str(TEST_FILE)

    assert raised.value.status_code == 402
|
84
|
-
|
85
|
-
|
86
|
-
async def test_convert_filefield_to_str_429(engine):
    """A mocked 429 response raises LimitsExceededError carrying status 429."""
    error_body = {"detail": "limits exceeded"}
    engine.session = get_mocked_session("POST", 429, json=error_body)

    with pytest.raises(LimitsExceededError) as raised:
        await engine.convert_filefield_to_str(TEST_FILE)

    assert raised.value.status_code == 429
|
92
|
-
|
93
|
-
|
94
|
-
async def test_convert_filefield_to_str_500(engine):
    """A mocked 500 response raises an exception whose message embeds status and body."""
    engine.session = get_mocked_session("POST", 500, text="error")

    with pytest.raises(Exception) as raised:
        await engine.convert_filefield_to_str(TEST_FILE)

    message = str(raised.value)
    assert message == "STATUS: 500 - error"
|
100
|
-
|
101
|
-
|
102
|
-
async def test_convert_internal_cf_to_str_200(engine):
    """With a mocked 200 response whose text is "jwt", the call returns "jwt"."""
    engine.session = get_mocked_session("POST", 200, text="jwt")

    token = await engine.convert_internal_cf_to_str(TEST_CLOUD_FILE, Mock())

    assert token == "jwt"
|
106
|
-
|
107
|
-
|
108
|
-
async def test_convert_internal_cf_to_str_402(engine):
    """A mocked 402 response raises LimitsExceededError carrying status 402."""
    error_body = {"detail": "limits exceeded"}
    engine.session = get_mocked_session("POST", 402, json=error_body)

    with pytest.raises(LimitsExceededError) as raised:
        await engine.convert_internal_cf_to_str(TEST_CLOUD_FILE, Mock())

    assert raised.value.status_code == 402
|
114
|
-
|
115
|
-
|
116
|
-
async def test_convert_internal_cf_to_str_429(engine):
    """A mocked 429 response raises LimitsExceededError carrying status 429."""
    error_body = {"detail": "limits exceeded"}
    engine.session = get_mocked_session("POST", 429, json=error_body)

    with pytest.raises(LimitsExceededError) as raised:
        await engine.convert_internal_cf_to_str(TEST_CLOUD_FILE, Mock())

    assert raised.value.status_code == 429
|
122
|
-
|
123
|
-
|
124
|
-
async def test_convert_internal_cf_to_str_500(engine):
    """A mocked 500 response raises an exception whose message embeds status and body."""
    engine.session = get_mocked_session("POST", 500, text="error")

    with pytest.raises(Exception) as raised:
        await engine.convert_internal_cf_to_str(TEST_CLOUD_FILE, Mock())

    message = str(raised.value)
    assert message == "STATUS: 500 - error"
|
130
|
-
|
131
|
-
|
132
|
-
async def test_send_to_process_200(engine):
    """The seqid, account_seq and queue of a mocked 200 JSON reply are exposed on the result."""
    reply = {"seqid": 11, "account_seq": 22, "queue": "private"}
    engine.session = get_mocked_session(
        "POST", 200, json=reply, context_manager=False
    )

    info = await engine.send_to_process(TEST_ITEM, 1)

    assert (info.seqid, info.account_seq, info.queue) == (11, 22, "private")
|
142
|
-
|
143
|
-
|
144
|
-
@pytest.mark.parametrize("status", [402, 413])
async def test_send_to_process_limits_exceeded(status, engine):
    """Each parametrized status raises LimitsExceededError carrying that same status."""
    error_body = {"detail": "limits exceeded"}
    engine.session = get_mocked_session(
        "POST", status, json=error_body, context_manager=False
    )

    with pytest.raises(LimitsExceededError) as raised:
        await engine.send_to_process(TEST_ITEM, 1)

    assert raised.value.status_code == status
|
153
|
-
|
154
|
-
|
155
|
-
async def test_send_to_process_limits_exceeded_429(engine):
    """A mocked 429 response raises LimitsExceededError carrying status 429."""
    error_body = {"detail": "limits exceeded"}
    engine.session = get_mocked_session(
        "POST", 429, json=error_body, context_manager=False
    )

    with pytest.raises(LimitsExceededError) as raised:
        await engine.send_to_process(TEST_ITEM, 1)

    assert raised.value.status_code == 429
|
163
|
-
|
164
|
-
|
165
|
-
async def test_send_to_process_500(engine):
    """A mocked 500 response makes send_to_process raise SendToProcessError."""
    failing_session = get_mocked_session(
        "POST", 500, text="error", context_manager=False
    )
    engine.session = failing_session

    with pytest.raises(SendToProcessError):
        await engine.send_to_process(TEST_ITEM, 1)
|
@@ -1,117 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
|
20
|
-
import asyncio
|
21
|
-
from contextvars import ContextVar
|
22
|
-
from typing import Optional
|
23
|
-
|
24
|
-
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
25
|
-
from starlette.requests import Request
|
26
|
-
from starlette.responses import Response
|
27
|
-
|
28
|
-
from nucliadb.common.maindb.driver import Transaction
|
29
|
-
from nucliadb.common.maindb.utils import get_driver
|
30
|
-
|
31
|
-
# Holds the ReadOnlyTransactionManager for the current context; None when unset.
_TxnManagerVar = ContextVar[Optional["ReadOnlyTransactionManager"]]
txn_manager: _TxnManagerVar = ContextVar("txn_manager", default=None)
|
34
|
-
|
35
|
-
|
36
|
-
class ReadOnlyTransactionMiddleware(BaseHTTPMiddleware):
    """
    Gives every request its own read-only transaction manager.

    The manager is published through the ``txn_manager`` context variable for
    the duration of the request. The underlying transaction is only opened
    when somebody asks for it, and whatever was opened is aborted once the
    request handler has returned or raised.

    This is useful, for instance, on search endpoints where we want to
    minimize the number of transactions that are created.

    Usage:
        - Register the middleware on the FastAPI app:

            app = FastAPI()
            app.add_middleware(ReadOnlyTransactionMiddleware)

        - Fetch the transaction wherever it is needed:

            txn = await get_read_only_transaction()
    """

    async def dispatch(
        self, request: Request, call_next: RequestResponseEndpoint
    ) -> Response:
        request_manager = ReadOnlyTransactionManager()
        txn_manager.set(request_manager)
        try:
            response = await call_next(request)
        finally:
            # Runs on success and on error alike: release the transaction
            # (if one was opened) and clear the context variable.
            await request_manager.maybe_abort()
            txn_manager.set(None)
        return response
|
66
|
-
|
67
|
-
|
68
|
-
class TransactionNotFoundException(Exception):
    """Raised when no usable read-only transaction is available."""
|
70
|
-
|
71
|
-
|
72
|
-
class ReadOnlyTransactionManager:
    """
    Lazily opens, shares and finally aborts one read-only transaction.

    ``get_transaction`` opens the transaction on first use and returns the
    same object to every later caller. ``maybe_abort`` aborts it, after which
    ``get_transaction`` raises ``TransactionNotFoundException``.

    Creation and abort are serialised through ``_lock``, so an abort that
    races with an in-flight creation waits for it and then aborts the new
    transaction instead of leaving it open.
    """

    def __init__(self):
        self._transaction: Optional[Transaction] = None
        # Guards creation and abort; kept for the whole lifetime of the manager.
        self._lock = asyncio.Lock()
        self.aborted: bool = False

    async def get_transaction(self) -> Transaction:
        """
        Return the shared read-only transaction, opening it on first use.

        Raises:
            TransactionNotFoundException: if the transaction was already aborted.
        """
        if self.aborted:
            raise TransactionNotFoundException("Transaction was aborted")

        # Fast path: already opened, no need to take the lock.
        if self._transaction is not None:
            return self._transaction

        async with self._lock:
            # State may have changed while waiting for the lock: another
            # caller may have opened the transaction, or it may have been
            # aborted in the meantime.
            if self.aborted:
                raise TransactionNotFoundException("Transaction was aborted")
            if self._transaction is None:
                self._transaction = await self._get_transaction()
            return self._transaction

    async def _get_transaction(self) -> Transaction:
        """Begin a new read-only transaction on the configured driver."""
        driver = get_driver()
        txn = await driver.begin(read_only=True)
        return txn

    async def maybe_abort(self):
        """
        Abort the transaction if one was opened; otherwise do nothing.

        Calling this again after a successful abort is a no-op.
        """
        if self.aborted:
            return

        async with self._lock:
            # Holding the lock means no creation is in flight any more.
            txn = self._transaction
            if self.aborted or txn is None:
                return
            # Flip the state before awaiting so that nobody is handed a
            # transaction that is in the middle of being aborted.
            self._transaction = None
            self.aborted = True

        await txn.abort()
|
106
|
-
|
107
|
-
|
108
|
-
async def get_read_only_transaction() -> Transaction:
    """
    Return the read-only transaction bound to the current context.

    Raises TransactionNotFoundException when the ``txn_manager`` context
    variable holds no manager.
    """
    current: Optional[ReadOnlyTransactionManager] = txn_manager.get()
    if current is not None:
        return await current.get_transaction()

    raise TransactionNotFoundException(
        "Context var is not set. Did you forget to add the ReadOnlyTransactionMiddleware to the app?"
    )
|