nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -16,10 +16,9 @@
|
|
16
16
|
#
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
from nucliadb_protos.resources_pb2 import Origin
|
20
|
-
|
21
19
|
from nucliadb_models import Extra, InputOrigin
|
22
20
|
from nucliadb_protos import resources_pb2
|
21
|
+
from nucliadb_protos.resources_pb2 import Origin
|
23
22
|
|
24
23
|
|
25
24
|
def parse_origin(origin: Origin, origin_payload: InputOrigin):
|
nucliadb/writer/settings.py
CHANGED
@@ -48,16 +48,20 @@ class BackPressureSettings(BaseSettings):
|
|
48
48
|
description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time", # noqa
|
49
49
|
)
|
50
50
|
max_indexing_pending: int = Field(
|
51
|
-
default=
|
51
|
+
default=200,
|
52
52
|
description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks", # noqa
|
53
|
+
alias="back_pressure_max_indexing_pending",
|
53
54
|
)
|
54
55
|
max_ingest_pending: int = Field(
|
55
|
-
default
|
56
|
+
# Disabled by default
|
57
|
+
default=0,
|
56
58
|
description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks", # noqa
|
59
|
+
alias="back_pressure_max_ingest_pending",
|
57
60
|
)
|
58
61
|
max_processing_pending: int = Field(
|
59
62
|
default=1000,
|
60
63
|
description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks", # noqa
|
64
|
+
alias="back_pressure_max_processing_pending",
|
61
65
|
)
|
62
66
|
indexing_check_interval: int = Field(
|
63
67
|
default=30,
|
nucliadb/writer/tus/__init__.py
CHANGED
@@ -23,10 +23,6 @@ from typing import Optional
|
|
23
23
|
from nucliadb.writer.settings import settings as writer_settings
|
24
24
|
from nucliadb.writer.tus.dm import FileDataManager, RedisFileDataManagerFactory
|
25
25
|
from nucliadb.writer.tus.exceptions import ManagerNotAvailable
|
26
|
-
from nucliadb.writer.tus.gcs import GCloudBlobStore, GCloudFileStorageManager
|
27
|
-
from nucliadb.writer.tus.local import LocalBlobStore, LocalFileStorageManager
|
28
|
-
from nucliadb.writer.tus.pg import PGBlobStore, PGFileStorageManager
|
29
|
-
from nucliadb.writer.tus.s3 import S3BlobStore, S3FileStorageManager
|
30
26
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
31
27
|
from nucliadb_utils.exceptions import ConfigurationError
|
32
28
|
from nucliadb_utils.settings import FileBackendConfig, storage_settings
|
@@ -48,6 +44,8 @@ REDIS_FILE_DATA_MANAGER_FACTORY: Optional[RedisFileDataManagerFactory] = None
|
|
48
44
|
async def initialize():
|
49
45
|
global DRIVER
|
50
46
|
if storage_settings.file_backend == FileBackendConfig.GCS:
|
47
|
+
from nucliadb.writer.tus.gcs import GCloudBlobStore, GCloudFileStorageManager
|
48
|
+
|
51
49
|
storage_backend = GCloudBlobStore()
|
52
50
|
|
53
51
|
await storage_backend.initialize(
|
@@ -64,6 +62,8 @@ async def initialize():
|
|
64
62
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
65
63
|
|
66
64
|
elif storage_settings.file_backend == FileBackendConfig.S3:
|
65
|
+
from nucliadb.writer.tus.s3 import S3BlobStore, S3FileStorageManager
|
66
|
+
|
67
67
|
storage_backend = S3BlobStore()
|
68
68
|
|
69
69
|
await storage_backend.initialize(
|
@@ -83,6 +83,8 @@ async def initialize():
|
|
83
83
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
84
84
|
|
85
85
|
elif storage_settings.file_backend == FileBackendConfig.LOCAL:
|
86
|
+
from nucliadb.writer.tus.local import LocalBlobStore, LocalFileStorageManager
|
87
|
+
|
86
88
|
storage_backend = LocalBlobStore(storage_settings.local_files)
|
87
89
|
|
88
90
|
await storage_backend.initialize()
|
@@ -91,12 +93,18 @@ async def initialize():
|
|
91
93
|
|
92
94
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
93
95
|
|
94
|
-
elif storage_settings.file_backend == FileBackendConfig.
|
95
|
-
|
96
|
+
elif storage_settings.file_backend == FileBackendConfig.AZURE:
|
97
|
+
from nucliadb.writer.tus.azure import AzureBlobStore, AzureFileStorageManager
|
96
98
|
|
97
|
-
|
99
|
+
if storage_settings.azure_account_url is None:
|
100
|
+
raise ConfigurationError("AZURE_ACCOUNT_URL env variable not configured")
|
98
101
|
|
99
|
-
|
102
|
+
storage_backend = AzureBlobStore()
|
103
|
+
await storage_backend.initialize(
|
104
|
+
storage_settings.azure_account_url,
|
105
|
+
connection_string=storage_settings.azure_connection_string,
|
106
|
+
)
|
107
|
+
storage_manager = AzureFileStorageManager(storage_backend)
|
100
108
|
|
101
109
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
102
110
|
|
@@ -117,7 +125,7 @@ async def finalize():
|
|
117
125
|
REDIS_FILE_DATA_MANAGER_FACTORY = None
|
118
126
|
|
119
127
|
|
120
|
-
def get_dm() -> FileDataManager:
|
128
|
+
def get_dm() -> FileDataManager:
|
121
129
|
if writer_settings.dm_enabled:
|
122
130
|
global REDIS_FILE_DATA_MANAGER_FACTORY
|
123
131
|
if REDIS_FILE_DATA_MANAGER_FACTORY is None:
|
@@ -136,9 +144,3 @@ def get_storage_manager() -> FileStorageManager:
|
|
136
144
|
if DRIVER is None:
|
137
145
|
raise ManagerNotAvailable()
|
138
146
|
return DRIVER.manager
|
139
|
-
|
140
|
-
|
141
|
-
def clear_storage():
|
142
|
-
global DRIVER
|
143
|
-
|
144
|
-
DRIVER = None
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from __future__ import annotations
|
21
|
+
|
22
|
+
from typing import Optional
|
23
|
+
|
24
|
+
from nucliadb.writer import logger
|
25
|
+
from nucliadb.writer.tus.dm import FileDataManager
|
26
|
+
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
27
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
28
|
+
from nucliadb_utils.storages import CHUNK_SIZE
|
29
|
+
from nucliadb_utils.storages.azure import AzureObjectStore
|
30
|
+
from nucliadb_utils.storages.exceptions import ObjectNotFoundError
|
31
|
+
from nucliadb_utils.storages.utils import ObjectMetadata
|
32
|
+
|
33
|
+
|
34
|
+
class AzureBlobStore(BlobStore):
|
35
|
+
async def finalize(self):
|
36
|
+
if self._object_store is None:
|
37
|
+
return
|
38
|
+
try:
|
39
|
+
await self._object_store.finalize()
|
40
|
+
except Exception:
|
41
|
+
logger.exception("Error closing AzureBlobStore")
|
42
|
+
self._object_store = None
|
43
|
+
|
44
|
+
async def initialize(self, account_url: str, connection_string: Optional[str] = None):
|
45
|
+
self.bucket = "nucliadb-{kbid}"
|
46
|
+
self.source = CloudFile.Source.AZURE
|
47
|
+
self._object_store = AzureObjectStore(account_url, connection_string=connection_string)
|
48
|
+
await self._object_store.initialize()
|
49
|
+
|
50
|
+
@property
|
51
|
+
def object_store(self) -> AzureObjectStore:
|
52
|
+
assert self._object_store is not None
|
53
|
+
return self._object_store
|
54
|
+
|
55
|
+
async def check_exists(self, bucket_name: str) -> bool:
|
56
|
+
return await self.object_store.bucket_exists(bucket_name)
|
57
|
+
|
58
|
+
async def create_bucket(self, bucket_name: str) -> bool:
|
59
|
+
created = await self.object_store.bucket_create(bucket_name)
|
60
|
+
return not created
|
61
|
+
|
62
|
+
|
63
|
+
class AzureFileStorageManager(FileStorageManager):
|
64
|
+
storage: AzureBlobStore
|
65
|
+
chunk_size = CHUNK_SIZE
|
66
|
+
min_upload_size = None
|
67
|
+
|
68
|
+
@property
|
69
|
+
def object_store(self) -> AzureObjectStore:
|
70
|
+
return self.storage.object_store
|
71
|
+
|
72
|
+
async def start(self, dm: FileDataManager, path: str, kbid: str):
|
73
|
+
bucket = self.storage.get_bucket_name(kbid)
|
74
|
+
if dm.filename == 0:
|
75
|
+
filename = "file"
|
76
|
+
else:
|
77
|
+
filename = dm.filename
|
78
|
+
metadata = ObjectMetadata(
|
79
|
+
filename=filename,
|
80
|
+
content_type=dm.content_type,
|
81
|
+
size=dm.size,
|
82
|
+
)
|
83
|
+
await self.object_store.upload_multipart_start(bucket, path, metadata)
|
84
|
+
await dm.update(path=path, bucket=bucket)
|
85
|
+
|
86
|
+
async def delete_upload(self, uri: str, kbid: str) -> None:
|
87
|
+
bucket = self.storage.get_bucket_name(kbid)
|
88
|
+
try:
|
89
|
+
await self.object_store.delete(bucket, uri)
|
90
|
+
except ObjectNotFoundError:
|
91
|
+
logger.warning(
|
92
|
+
"Attempt to delete an upload but not found",
|
93
|
+
extra={"uri": uri, "kbid": kbid, "bucket": bucket},
|
94
|
+
)
|
95
|
+
|
96
|
+
async def append(self, dm: FileDataManager, iterable, offset: int) -> int:
|
97
|
+
bucket = dm.get("bucket")
|
98
|
+
assert bucket is not None
|
99
|
+
path = dm.get("path")
|
100
|
+
assert path is not None
|
101
|
+
uploaded_bytes = await self.object_store.upload_multipart_append(bucket, path, iterable)
|
102
|
+
await dm.update(offset=offset)
|
103
|
+
return uploaded_bytes
|
104
|
+
|
105
|
+
async def finish(self, dm: FileDataManager):
|
106
|
+
path = dm.get("path")
|
107
|
+
await dm.finish()
|
108
|
+
return path
|
109
|
+
|
110
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
111
|
+
pass
|
nucliadb/writer/tus/dm.py
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
import time
|
21
21
|
from typing import Any, Optional
|
22
22
|
|
23
|
+
import backoff
|
23
24
|
import orjson
|
24
25
|
from redis import asyncio as aioredis
|
25
26
|
from starlette.requests import Request
|
@@ -33,6 +34,11 @@ class NoRedisConfigured(Exception):
|
|
33
34
|
pass
|
34
35
|
|
35
36
|
|
37
|
+
RETRIABLE_REDIS_ERRORS = (
|
38
|
+
aioredis.ConnectionError,
|
39
|
+
aioredis.TimeoutError,
|
40
|
+
)
|
41
|
+
|
36
42
|
DATA: dict[str, Any] = {}
|
37
43
|
|
38
44
|
|
@@ -59,10 +65,7 @@ class FileDataManager:
|
|
59
65
|
# someone else
|
60
66
|
last_activity: Optional[int] = self._data.get("last_activity")
|
61
67
|
if last_activity and (time.time() - last_activity) < self._ttl:
|
62
|
-
if (
|
63
|
-
request.headers
|
64
|
-
and request.headers.get("tus-override-upload", "0") != "1"
|
65
|
-
):
|
68
|
+
if request.headers and request.headers.get("tus-override-upload", "0") != "1":
|
66
69
|
raise HTTPPreconditionFailed(
|
67
70
|
detail="There is already an active tusupload that conflicts with this one."
|
68
71
|
)
|
@@ -136,7 +139,7 @@ class RedisFileDataManagerFactory:
|
|
136
139
|
|
137
140
|
async def finalize(self):
|
138
141
|
try:
|
139
|
-
await self.redis.
|
142
|
+
await self.redis.aclose(close_connection_pool=True)
|
140
143
|
except Exception:
|
141
144
|
logger.warning("Error closing redis connection", exc_info=True)
|
142
145
|
pass
|
@@ -146,6 +149,9 @@ class RedisFileDataManager(FileDataManager):
|
|
146
149
|
def __init__(self, redis: aioredis.Redis):
|
147
150
|
self.redis = redis
|
148
151
|
|
152
|
+
@backoff.on_exception(
|
153
|
+
backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
|
154
|
+
)
|
149
155
|
async def load(self, key):
|
150
156
|
# preload data
|
151
157
|
self.key = key
|
@@ -157,6 +163,9 @@ class RedisFileDataManager(FileDataManager):
|
|
157
163
|
self._data = orjson.loads(data)
|
158
164
|
self._loaded = True
|
159
165
|
|
166
|
+
@backoff.on_exception(
|
167
|
+
backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
|
168
|
+
)
|
160
169
|
async def save(self):
|
161
170
|
if self.key is None:
|
162
171
|
raise Exception("Not initialized")
|
@@ -164,6 +173,9 @@ class RedisFileDataManager(FileDataManager):
|
|
164
173
|
value = orjson.dumps(self._data)
|
165
174
|
await self.redis.set(self.key, value, ex=self._ttl)
|
166
175
|
|
176
|
+
@backoff.on_exception(
|
177
|
+
backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
|
178
|
+
)
|
167
179
|
async def _delete_key(self):
|
168
180
|
if self.key is None:
|
169
181
|
raise Exception("Not initialized")
|
@@ -31,9 +31,7 @@ class HTTPException(StarletteHTTPException):
|
|
31
31
|
|
32
32
|
def __init__(self, detail: Optional[str] = None):
    """Build the HTTP exception from ``self._status_code`` and ``detail``.

    Raises:
        AttributeError: if ``self._status_code`` is not set to a truthy value.
    """
    if not self._status_code:
        raise AttributeError("Status code not defined")
    super(HTTPException, self).__init__(status_code=self._status_code, detail=detail)
|
39
37
|
|
nucliadb/writer/tus/gcs.py
CHANGED
@@ -28,27 +28,28 @@ import tempfile
|
|
28
28
|
import uuid
|
29
29
|
from concurrent.futures import ThreadPoolExecutor
|
30
30
|
from copy import deepcopy
|
31
|
-
from
|
32
|
-
from typing import AsyncIterator, Optional
|
31
|
+
from typing import Optional
|
33
32
|
from urllib.parse import quote_plus
|
34
33
|
|
35
34
|
import aiohttp
|
36
35
|
import backoff
|
37
|
-
|
36
|
+
import google.auth.compute_engine.credentials # type: ignore
|
37
|
+
import google.auth.transport.requests # type: ignore
|
38
|
+
import google.oauth2.credentials # type: ignore
|
39
|
+
from google.auth.exceptions import DefaultCredentialsError # type: ignore
|
38
40
|
from oauth2client.service_account import ServiceAccountCredentials # type: ignore
|
39
41
|
|
40
42
|
from nucliadb.writer import logger
|
41
43
|
from nucliadb.writer.tus.dm import FileDataManager
|
42
44
|
from nucliadb.writer.tus.exceptions import (
|
43
|
-
CloudFileNotFound,
|
44
45
|
HTTPBadRequest,
|
45
|
-
HTTPNotFound,
|
46
46
|
HTTPPreconditionFailed,
|
47
47
|
ResumableURINotAvailable,
|
48
48
|
)
|
49
49
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
50
50
|
from nucliadb.writer.tus.utils import to_str
|
51
|
-
from
|
51
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
52
|
+
from nucliadb_utils.storages.gcs import CHUNK_SIZE, MIN_UPLOAD_SIZE, TIMEOUT
|
52
53
|
|
53
54
|
|
54
55
|
class GoogleCloudException(Exception):
|
@@ -76,7 +77,7 @@ class GCloudBlobStore(BlobStore):
|
|
76
77
|
loop = None
|
77
78
|
upload_url: str
|
78
79
|
object_base_url: str
|
79
|
-
json_credentials: str
|
80
|
+
json_credentials: Optional[str]
|
80
81
|
bucket: str
|
81
82
|
location: str
|
82
83
|
project: str
|
@@ -90,9 +91,18 @@ class GCloudBlobStore(BlobStore):
|
|
90
91
|
return {"AUTHORIZATION": f"Bearer {token}"}
|
91
92
|
|
92
93
|
def _get_access_token(self):
    """Return an access token for ``self._credentials``.

    Google-auth credential objects (compute engine or OAuth2) are refreshed
    when they report being expired or not valid, and their ``token`` is
    returned. Any other credentials object is asked for a token through
    ``get_access_token()``.
    """
    google_auth_types = (
        google.auth.compute_engine.credentials.Credentials,
        google.oauth2.credentials.Credentials,
    )
    creds = self._credentials
    if not isinstance(creds, google_auth_types):
        token_info = creds.get_access_token()
        return token_info.access_token

    # google default auth object
    if creds.expired or creds.valid is False:
        auth_request = google.auth.transport.requests.Request()
        creds.refresh(auth_request)
    return creds.token
|
96
106
|
|
97
107
|
async def finalize(self):
|
98
108
|
if self.session is not None:
|
@@ -113,32 +123,35 @@ class GCloudBlobStore(BlobStore):
|
|
113
123
|
self.project = project
|
114
124
|
self.bucket_labels = bucket_labels
|
115
125
|
self.object_base_url = object_base_url + "/storage/v1/b"
|
116
|
-
self.upload_url =
|
117
|
-
|
118
|
-
) # noqa
|
119
|
-
|
126
|
+
self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable" # noqa
|
127
|
+
self.json_credentials = json_credentials
|
120
128
|
self._credentials = None
|
121
129
|
|
122
|
-
if json_credentials is not None:
|
123
|
-
self.json_credentials_file = os.path.join(
|
124
|
-
|
125
|
-
|
126
|
-
open(self.json_credentials_file, "w").write(
|
127
|
-
base64.b64decode(json_credentials).decode("utf-8")
|
128
|
-
)
|
130
|
+
if self.json_credentials is not None and self.json_credentials.strip() != "":
|
131
|
+
self.json_credentials_file = os.path.join(tempfile.mkdtemp(), "gcs_credentials.json")
|
132
|
+
with open(self.json_credentials_file, "w") as file:
|
133
|
+
file.write(base64.b64decode(self.json_credentials).decode("utf-8"))
|
129
134
|
self._credentials = ServiceAccountCredentials.from_json_keyfile_name(
|
130
135
|
self.json_credentials_file, SCOPES
|
131
136
|
)
|
137
|
+
else:
|
138
|
+
try:
|
139
|
+
self._credentials, self.project = google.auth.default()
|
140
|
+
except DefaultCredentialsError:
|
141
|
+
logger.warning("Setting up without credentials as couldn't find workload identity")
|
142
|
+
self._credentials = None
|
132
143
|
|
133
144
|
loop = asyncio.get_event_loop()
|
134
|
-
self.session = aiohttp.ClientSession(loop=loop)
|
145
|
+
self.session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
|
135
146
|
|
136
147
|
async def check_exists(self, bucket_name: str):
|
137
148
|
if self.session is None:
|
138
149
|
raise AttributeError()
|
139
150
|
|
140
151
|
headers = await self.get_access_headers()
|
141
|
-
url
|
152
|
+
# Using object access url instead of bucket access to avoid
|
153
|
+
# giving admin permission to the SA, needed to GET a bucket
|
154
|
+
url = f"{self.object_base_url}/{bucket_name}/o"
|
142
155
|
async with self.session.get(
|
143
156
|
url,
|
144
157
|
headers=headers,
|
@@ -177,9 +190,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
177
190
|
chunk_size = CHUNK_SIZE
|
178
191
|
min_upload_size = MIN_UPLOAD_SIZE
|
179
192
|
|
180
|
-
@backoff.on_exception(
|
181
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
182
|
-
)
|
193
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
183
194
|
async def start(self, dm: FileDataManager, path: str, kbid: str):
|
184
195
|
"""Init an upload.
|
185
196
|
|
@@ -187,12 +198,15 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
187
198
|
_resumable_uri : uri to resumable upload
|
188
199
|
_uri : finished uploaded image
|
189
200
|
"""
|
201
|
+
|
190
202
|
if self.storage.session is None:
|
191
203
|
raise AttributeError()
|
192
204
|
|
193
|
-
upload_file_id = dm.get("upload_file_id"
|
205
|
+
upload_file_id = dm.get("upload_file_id")
|
194
206
|
if upload_file_id is not None:
|
195
207
|
await self.delete_upload(upload_file_id, kbid)
|
208
|
+
else:
|
209
|
+
upload_file_id = str(uuid.uuid4())
|
196
210
|
|
197
211
|
bucket = self.storage.get_bucket_name(kbid)
|
198
212
|
init_url = "{}&name={}".format(
|
@@ -237,13 +251,9 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
237
251
|
raise GoogleCloudException(text)
|
238
252
|
resumable_uri = call.headers["Location"]
|
239
253
|
|
240
|
-
await dm.update(
|
241
|
-
resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path
|
242
|
-
)
|
254
|
+
await dm.update(resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path)
|
243
255
|
|
244
|
-
@backoff.on_exception(
|
245
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
246
|
-
)
|
256
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
247
257
|
async def delete_upload(self, uri, kbid):
|
248
258
|
bucket = self.storage.get_bucket_name(kbid)
|
249
259
|
|
@@ -266,8 +276,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
266
276
|
if resp.status not in (200, 204, 404):
|
267
277
|
if resp.status == 404:
|
268
278
|
logger.error(
|
269
|
-
f"Attempt to delete not found gcloud: {data}, "
|
270
|
-
f"status: {resp.status}",
|
279
|
+
f"Attempt to delete not found gcloud: {data}, " f"status: {resp.status}",
|
271
280
|
exc_info=True,
|
272
281
|
)
|
273
282
|
else:
|
@@ -275,9 +284,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
275
284
|
else:
|
276
285
|
raise AttributeError("No valid uri")
|
277
286
|
|
278
|
-
@backoff.on_exception(
|
279
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
280
|
-
)
|
287
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
281
288
|
async def _append(self, dm: FileDataManager, data, offset):
|
282
289
|
if self.storage.session is None:
|
283
290
|
raise AttributeError()
|
@@ -342,9 +349,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
342
349
|
break
|
343
350
|
return count
|
344
351
|
|
345
|
-
@backoff.on_exception(
|
346
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
347
|
-
)
|
352
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
348
353
|
async def finish(self, dm: FileDataManager):
|
349
354
|
if dm.size == 0:
|
350
355
|
if self.storage.session is None:
|
@@ -370,46 +375,6 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
370
375
|
await dm.finish()
|
371
376
|
return path
|
372
377
|
|
373
|
-
|
374
|
-
if self.
|
375
|
-
raise
|
376
|
-
if headers is None:
|
377
|
-
headers = {}
|
378
|
-
|
379
|
-
url = "{}/{}/o/{}".format(
|
380
|
-
self.storage.object_base_url,
|
381
|
-
self.storage.get_bucket_name(kbid),
|
382
|
-
quote_plus(uri),
|
383
|
-
)
|
384
|
-
headers_auth = await self.storage.get_access_headers()
|
385
|
-
headers.update(headers_auth)
|
386
|
-
async with self.storage.session.get(
|
387
|
-
url, headers=headers, params={"alt": "media"}, timeout=-1
|
388
|
-
) as api_resp:
|
389
|
-
if api_resp.status not in (200, 206):
|
390
|
-
text = await api_resp.text()
|
391
|
-
if api_resp.status == 404:
|
392
|
-
raise CloudFileNotFound("Google cloud file not found")
|
393
|
-
elif api_resp.status == 401:
|
394
|
-
logger.warning(f"Invalid google cloud credentials error: {text}")
|
395
|
-
raise HTTPNotFound(
|
396
|
-
detail=f"Google cloud invalid credentials: {text}"
|
397
|
-
)
|
398
|
-
raise GoogleCloudException(f"{api_resp.status}: {text}")
|
399
|
-
while True:
|
400
|
-
chunk = await api_resp.content.read(1024 * 1024)
|
401
|
-
if len(chunk) > 0:
|
402
|
-
yield chunk
|
403
|
-
else:
|
404
|
-
break
|
405
|
-
|
406
|
-
async def read_range(
|
407
|
-
self, uri: str, kbid: str, start: int, end: int
|
408
|
-
) -> AsyncIterator[bytes]:
|
409
|
-
"""
|
410
|
-
Iterate through ranges of data
|
411
|
-
"""
|
412
|
-
async for chunk in self.iter_data(
|
413
|
-
uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
|
414
|
-
):
|
415
|
-
yield chunk
|
378
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
    """Reject an intermediate chunk smaller than ``self.min_upload_size``.

    Raises:
        ValueError: if ``uploaded_bytes`` is below the minimum upload size.
    """
    if uploaded_bytes >= self.min_upload_size:
        return
    raise ValueError(f"Intermediate chunks cannot be smaller than {self.min_upload_size} bytes")
|
nucliadb/writer/tus/local.py
CHANGED
@@ -22,14 +22,13 @@ from __future__ import annotations
|
|
22
22
|
import json
|
23
23
|
import os
|
24
24
|
import uuid
|
25
|
-
from typing import
|
25
|
+
from typing import Any
|
26
26
|
|
27
27
|
import aiofiles
|
28
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
29
28
|
|
30
29
|
from nucliadb.writer.tus.dm import FileDataManager
|
31
|
-
from nucliadb.writer.tus.exceptions import CloudFileNotFound
|
32
30
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
31
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
33
32
|
from nucliadb_utils.storages import CHUNK_SIZE
|
34
33
|
|
35
34
|
|
@@ -50,51 +49,24 @@ class LocalFileStorageManager(FileStorageManager):
|
|
50
49
|
bucket = self.storage.get_bucket_name(kbid)
|
51
50
|
upload_file_id = dm.get("upload_file_id", str(uuid.uuid4()))
|
52
51
|
init_url = self.get_file_path(bucket, upload_file_id)
|
53
|
-
metadata_init_url = self.metadata_key(init_url)
|
54
52
|
metadata = {
|
55
53
|
"FILENAME": dm.filename,
|
56
54
|
"CONTENT_TYPE": dm.content_type,
|
57
55
|
"SIZE": dm.size,
|
58
56
|
}
|
59
|
-
|
60
|
-
await resp.write(json.dumps(metadata))
|
57
|
+
await self.set_metadata(kbid, upload_file_id, metadata)
|
61
58
|
|
62
59
|
async with aiofiles.open(init_url, "wb+") as aio_fi:
|
63
60
|
await aio_fi.write(b"")
|
64
61
|
|
65
|
-
await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket)
|
62
|
+
await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket, kbid=kbid)
|
66
63
|
|
67
|
-
async def
|
68
|
-
bucket = self.storage.get_bucket_name(kbid)
|
69
|
-
file_path = self.get_file_path(bucket, uri)
|
70
|
-
async with aiofiles.open(file_path) as resp:
|
71
|
-
data = await resp.read(CHUNK_SIZE)
|
72
|
-
while data is not None:
|
73
|
-
yield data
|
74
|
-
data = await resp.read(CHUNK_SIZE)
|
75
|
-
|
76
|
-
async def read_range(
|
77
|
-
self, uri: str, kbid: str, start: int, end: int
|
78
|
-
) -> AsyncIterator[bytes]:
|
79
|
-
"""
|
80
|
-
Iterate through ranges of data
|
81
|
-
"""
|
64
|
+
async def set_metadata(self, kbid: str, upload_file_id: str, metadata: dict[str, Any]):
|
82
65
|
bucket = self.storage.get_bucket_name(kbid)
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
count = 0
|
88
|
-
data = await resp.read(CHUNK_SIZE)
|
89
|
-
while data and count < end:
|
90
|
-
if count + len(data) > end:
|
91
|
-
new_end = end - count
|
92
|
-
data = data[:new_end]
|
93
|
-
yield data
|
94
|
-
count += len(data)
|
95
|
-
data = await resp.read(CHUNK_SIZE)
|
96
|
-
except FileNotFoundError:
|
97
|
-
raise CloudFileNotFound()
|
66
|
+
init_url = self.get_file_path(bucket, upload_file_id)
|
67
|
+
metadata_init_url = self.metadata_key(init_url)
|
68
|
+
async with aiofiles.open(metadata_init_url, "w+") as resp:
|
69
|
+
await resp.write(json.dumps(metadata))
|
98
70
|
|
99
71
|
async def append(self, dm: FileDataManager, iterable, offset) -> int:
|
100
72
|
count = 0
|
@@ -118,6 +90,15 @@ class LocalFileStorageManager(FileStorageManager):
|
|
118
90
|
upload_file_id = dm.get("upload_file_id")
|
119
91
|
from_url = self.get_file_path(bucket, upload_file_id)
|
120
92
|
|
93
|
+
if dm.size > 0:
|
94
|
+
kbid = dm.get("kbid")
|
95
|
+
metadata = {
|
96
|
+
"FILENAME": dm.filename,
|
97
|
+
"CONTENT_TYPE": dm.content_type,
|
98
|
+
"SIZE": dm.size,
|
99
|
+
}
|
100
|
+
await self.set_metadata(kbid, upload_file_id, metadata)
|
101
|
+
|
121
102
|
path = dm.get("path")
|
122
103
|
to_url = self.get_file_path(bucket, path)
|
123
104
|
to_url_dirs = os.path.dirname(to_url)
|
@@ -138,6 +119,9 @@ class LocalFileStorageManager(FileStorageManager):
|
|
138
119
|
file_path = self.get_file_path(bucket, uri)
|
139
120
|
os.remove(file_path)
|
140
121
|
|
122
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
    """Accept an intermediate chunk of any size; local storage performs no check."""
    return None
|
124
|
+
|
141
125
|
|
142
126
|
class LocalBlobStore(BlobStore):
|
143
127
|
def __init__(self, local_testing_files: str):
|