nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/writer/tus/gcs.py
CHANGED
@@ -23,31 +23,33 @@ import asyncio
|
|
23
23
|
import base64
|
24
24
|
import json
|
25
25
|
import os
|
26
|
+
import socket
|
26
27
|
import tempfile
|
27
28
|
import uuid
|
28
29
|
from concurrent.futures import ThreadPoolExecutor
|
29
30
|
from copy import deepcopy
|
30
|
-
from
|
31
|
-
from typing import AsyncIterator, Optional
|
31
|
+
from typing import Optional
|
32
32
|
from urllib.parse import quote_plus
|
33
33
|
|
34
34
|
import aiohttp
|
35
35
|
import backoff
|
36
|
-
|
36
|
+
import google.auth.compute_engine.credentials # type: ignore
|
37
|
+
import google.auth.transport.requests # type: ignore
|
38
|
+
import google.oauth2.credentials # type: ignore
|
39
|
+
from google.auth.exceptions import DefaultCredentialsError # type: ignore
|
37
40
|
from oauth2client.service_account import ServiceAccountCredentials # type: ignore
|
38
41
|
|
39
42
|
from nucliadb.writer import logger
|
40
43
|
from nucliadb.writer.tus.dm import FileDataManager
|
41
44
|
from nucliadb.writer.tus.exceptions import (
|
42
|
-
CloudFileNotFound,
|
43
45
|
HTTPBadRequest,
|
44
|
-
HTTPNotFound,
|
45
46
|
HTTPPreconditionFailed,
|
46
47
|
ResumableURINotAvailable,
|
47
48
|
)
|
48
49
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
49
50
|
from nucliadb.writer.tus.utils import to_str
|
50
|
-
from
|
51
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
52
|
+
from nucliadb_utils.storages.gcs import CHUNK_SIZE, MIN_UPLOAD_SIZE, TIMEOUT
|
51
53
|
|
52
54
|
|
53
55
|
class GoogleCloudException(Exception):
|
@@ -61,6 +63,12 @@ MAX_RETRIES = 5
|
|
61
63
|
RETRIABLE_EXCEPTIONS = (
|
62
64
|
GoogleCloudException,
|
63
65
|
aiohttp.client_exceptions.ClientPayloadError,
|
66
|
+
aiohttp.client_exceptions.ClientConnectorError,
|
67
|
+
aiohttp.client_exceptions.ClientConnectionError,
|
68
|
+
aiohttp.client_exceptions.ClientOSError,
|
69
|
+
aiohttp.client_exceptions.ServerConnectionError,
|
70
|
+
aiohttp.client_exceptions.ServerDisconnectedError,
|
71
|
+
socket.gaierror,
|
64
72
|
)
|
65
73
|
|
66
74
|
|
@@ -69,7 +77,7 @@ class GCloudBlobStore(BlobStore):
|
|
69
77
|
loop = None
|
70
78
|
upload_url: str
|
71
79
|
object_base_url: str
|
72
|
-
json_credentials: str
|
80
|
+
json_credentials: Optional[str]
|
73
81
|
bucket: str
|
74
82
|
location: str
|
75
83
|
project: str
|
@@ -83,9 +91,18 @@ class GCloudBlobStore(BlobStore):
|
|
83
91
|
return {"AUTHORIZATION": f"Bearer {token}"}
|
84
92
|
|
85
93
|
def _get_access_token(self):
|
86
|
-
|
87
|
-
|
88
|
-
|
94
|
+
if isinstance(
|
95
|
+
self._credentials, google.auth.compute_engine.credentials.Credentials
|
96
|
+
) or isinstance(self._credentials, google.oauth2.credentials.Credentials):
|
97
|
+
# google default auth object
|
98
|
+
if self._credentials.expired or self._credentials.valid is False:
|
99
|
+
request = google.auth.transport.requests.Request()
|
100
|
+
self._credentials.refresh(request)
|
101
|
+
|
102
|
+
return self._credentials.token
|
103
|
+
else:
|
104
|
+
access_token = self._credentials.get_access_token()
|
105
|
+
return access_token.access_token
|
89
106
|
|
90
107
|
async def finalize(self):
|
91
108
|
if self.session is not None:
|
@@ -106,32 +123,35 @@ class GCloudBlobStore(BlobStore):
|
|
106
123
|
self.project = project
|
107
124
|
self.bucket_labels = bucket_labels
|
108
125
|
self.object_base_url = object_base_url + "/storage/v1/b"
|
109
|
-
self.upload_url =
|
110
|
-
|
111
|
-
) # noqa
|
112
|
-
|
126
|
+
self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable" # noqa
|
127
|
+
self.json_credentials = json_credentials
|
113
128
|
self._credentials = None
|
114
129
|
|
115
|
-
if json_credentials is not None:
|
116
|
-
self.json_credentials_file = os.path.join(
|
117
|
-
|
118
|
-
|
119
|
-
open(self.json_credentials_file, "w").write(
|
120
|
-
base64.b64decode(json_credentials).decode("utf-8")
|
121
|
-
)
|
130
|
+
if self.json_credentials is not None and self.json_credentials.strip() != "":
|
131
|
+
self.json_credentials_file = os.path.join(tempfile.mkdtemp(), "gcs_credentials.json")
|
132
|
+
with open(self.json_credentials_file, "w") as file:
|
133
|
+
file.write(base64.b64decode(self.json_credentials).decode("utf-8"))
|
122
134
|
self._credentials = ServiceAccountCredentials.from_json_keyfile_name(
|
123
135
|
self.json_credentials_file, SCOPES
|
124
136
|
)
|
137
|
+
else:
|
138
|
+
try:
|
139
|
+
self._credentials, self.project = google.auth.default()
|
140
|
+
except DefaultCredentialsError:
|
141
|
+
logger.warning("Setting up without credentials as couldn't find workload identity")
|
142
|
+
self._credentials = None
|
125
143
|
|
126
144
|
loop = asyncio.get_event_loop()
|
127
|
-
self.session = aiohttp.ClientSession(loop=loop)
|
145
|
+
self.session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
|
128
146
|
|
129
147
|
async def check_exists(self, bucket_name: str):
|
130
148
|
if self.session is None:
|
131
149
|
raise AttributeError()
|
132
150
|
|
133
151
|
headers = await self.get_access_headers()
|
134
|
-
url
|
152
|
+
# Using object access url instead of bucket access to avoid
|
153
|
+
# giving admin permission to the SA, needed to GET a bucket
|
154
|
+
url = f"{self.object_base_url}/{bucket_name}/o"
|
135
155
|
async with self.session.get(
|
136
156
|
url,
|
137
157
|
headers=headers,
|
@@ -170,9 +190,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
170
190
|
chunk_size = CHUNK_SIZE
|
171
191
|
min_upload_size = MIN_UPLOAD_SIZE
|
172
192
|
|
173
|
-
@backoff.on_exception(
|
174
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
175
|
-
)
|
193
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
176
194
|
async def start(self, dm: FileDataManager, path: str, kbid: str):
|
177
195
|
"""Init an upload.
|
178
196
|
|
@@ -180,12 +198,15 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
180
198
|
_resumable_uri : uri to resumable upload
|
181
199
|
_uri : finished uploaded image
|
182
200
|
"""
|
201
|
+
|
183
202
|
if self.storage.session is None:
|
184
203
|
raise AttributeError()
|
185
204
|
|
186
|
-
upload_file_id = dm.get("upload_file_id"
|
205
|
+
upload_file_id = dm.get("upload_file_id")
|
187
206
|
if upload_file_id is not None:
|
188
207
|
await self.delete_upload(upload_file_id, kbid)
|
208
|
+
else:
|
209
|
+
upload_file_id = str(uuid.uuid4())
|
189
210
|
|
190
211
|
bucket = self.storage.get_bucket_name(kbid)
|
191
212
|
init_url = "{}&name={}".format(
|
@@ -230,13 +251,9 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
230
251
|
raise GoogleCloudException(text)
|
231
252
|
resumable_uri = call.headers["Location"]
|
232
253
|
|
233
|
-
await dm.update(
|
234
|
-
resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path
|
235
|
-
)
|
254
|
+
await dm.update(resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path)
|
236
255
|
|
237
|
-
@backoff.on_exception(
|
238
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
239
|
-
)
|
256
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
240
257
|
async def delete_upload(self, uri, kbid):
|
241
258
|
bucket = self.storage.get_bucket_name(kbid)
|
242
259
|
|
@@ -259,8 +276,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
259
276
|
if resp.status not in (200, 204, 404):
|
260
277
|
if resp.status == 404:
|
261
278
|
logger.error(
|
262
|
-
f"Attempt to delete not found gcloud: {data}, "
|
263
|
-
f"status: {resp.status}",
|
279
|
+
f"Attempt to delete not found gcloud: {data}, " f"status: {resp.status}",
|
264
280
|
exc_info=True,
|
265
281
|
)
|
266
282
|
else:
|
@@ -268,9 +284,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
268
284
|
else:
|
269
285
|
raise AttributeError("No valid uri")
|
270
286
|
|
271
|
-
@backoff.on_exception(
|
272
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
273
|
-
)
|
287
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
274
288
|
async def _append(self, dm: FileDataManager, data, offset):
|
275
289
|
if self.storage.session is None:
|
276
290
|
raise AttributeError()
|
@@ -335,9 +349,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
335
349
|
break
|
336
350
|
return count
|
337
351
|
|
338
|
-
@backoff.on_exception(
|
339
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
|
340
|
-
)
|
352
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
341
353
|
async def finish(self, dm: FileDataManager):
|
342
354
|
if dm.size == 0:
|
343
355
|
if self.storage.session is None:
|
@@ -363,46 +375,6 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
363
375
|
await dm.finish()
|
364
376
|
return path
|
365
377
|
|
366
|
-
|
367
|
-
if self.
|
368
|
-
raise
|
369
|
-
if headers is None:
|
370
|
-
headers = {}
|
371
|
-
|
372
|
-
url = "{}/{}/o/{}".format(
|
373
|
-
self.storage.object_base_url,
|
374
|
-
self.storage.get_bucket_name(kbid),
|
375
|
-
quote_plus(uri),
|
376
|
-
)
|
377
|
-
headers_auth = await self.storage.get_access_headers()
|
378
|
-
headers.update(headers_auth)
|
379
|
-
async with self.storage.session.get(
|
380
|
-
url, headers=headers, params={"alt": "media"}, timeout=-1
|
381
|
-
) as api_resp:
|
382
|
-
if api_resp.status not in (200, 206):
|
383
|
-
text = await api_resp.text()
|
384
|
-
if api_resp.status == 404:
|
385
|
-
raise CloudFileNotFound("Google cloud file not found")
|
386
|
-
elif api_resp.status == 401:
|
387
|
-
logger.warning(f"Invalid google cloud credentials error: {text}")
|
388
|
-
raise HTTPNotFound(
|
389
|
-
detail=f"Google cloud invalid credentials: {text}"
|
390
|
-
)
|
391
|
-
raise GoogleCloudException(f"{api_resp.status}: {text}")
|
392
|
-
while True:
|
393
|
-
chunk = await api_resp.content.read(1024 * 1024)
|
394
|
-
if len(chunk) > 0:
|
395
|
-
yield chunk
|
396
|
-
else:
|
397
|
-
break
|
398
|
-
|
399
|
-
async def read_range(
|
400
|
-
self, uri: str, kbid: str, start: int, end: int
|
401
|
-
) -> AsyncIterator[bytes]:
|
402
|
-
"""
|
403
|
-
Iterate through ranges of data
|
404
|
-
"""
|
405
|
-
async for chunk in self.iter_data(
|
406
|
-
uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
|
407
|
-
):
|
408
|
-
yield chunk
|
378
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
379
|
+
if uploaded_bytes < self.min_upload_size:
|
380
|
+
raise ValueError(f"Intermediate chunks cannot be smaller than {self.min_upload_size} bytes")
|
nucliadb/writer/tus/local.py
CHANGED
@@ -22,14 +22,13 @@ from __future__ import annotations
|
|
22
22
|
import json
|
23
23
|
import os
|
24
24
|
import uuid
|
25
|
-
from typing import
|
25
|
+
from typing import Any
|
26
26
|
|
27
27
|
import aiofiles
|
28
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
29
28
|
|
30
29
|
from nucliadb.writer.tus.dm import FileDataManager
|
31
|
-
from nucliadb.writer.tus.exceptions import CloudFileNotFound
|
32
30
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
31
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
33
32
|
from nucliadb_utils.storages import CHUNK_SIZE
|
34
33
|
|
35
34
|
|
@@ -50,51 +49,24 @@ class LocalFileStorageManager(FileStorageManager):
|
|
50
49
|
bucket = self.storage.get_bucket_name(kbid)
|
51
50
|
upload_file_id = dm.get("upload_file_id", str(uuid.uuid4()))
|
52
51
|
init_url = self.get_file_path(bucket, upload_file_id)
|
53
|
-
metadata_init_url = self.metadata_key(init_url)
|
54
52
|
metadata = {
|
55
53
|
"FILENAME": dm.filename,
|
56
54
|
"CONTENT_TYPE": dm.content_type,
|
57
55
|
"SIZE": dm.size,
|
58
56
|
}
|
59
|
-
|
60
|
-
await resp.write(json.dumps(metadata))
|
57
|
+
await self.set_metadata(kbid, upload_file_id, metadata)
|
61
58
|
|
62
59
|
async with aiofiles.open(init_url, "wb+") as aio_fi:
|
63
60
|
await aio_fi.write(b"")
|
64
61
|
|
65
|
-
await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket)
|
62
|
+
await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket, kbid=kbid)
|
66
63
|
|
67
|
-
async def
|
68
|
-
bucket = self.storage.get_bucket_name(kbid)
|
69
|
-
file_path = self.get_file_path(bucket, uri)
|
70
|
-
async with aiofiles.open(file_path) as resp:
|
71
|
-
data = await resp.read(CHUNK_SIZE)
|
72
|
-
while data is not None:
|
73
|
-
yield data
|
74
|
-
data = await resp.read(CHUNK_SIZE)
|
75
|
-
|
76
|
-
async def read_range(
|
77
|
-
self, uri: str, kbid: str, start: int, end: int
|
78
|
-
) -> AsyncIterator[bytes]:
|
79
|
-
"""
|
80
|
-
Iterate through ranges of data
|
81
|
-
"""
|
64
|
+
async def set_metadata(self, kbid: str, upload_file_id: str, metadata: dict[str, Any]):
|
82
65
|
bucket = self.storage.get_bucket_name(kbid)
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
count = 0
|
88
|
-
data = await resp.read(CHUNK_SIZE)
|
89
|
-
while data and count < end:
|
90
|
-
if count + len(data) > end:
|
91
|
-
new_end = end - count
|
92
|
-
data = data[:new_end]
|
93
|
-
yield data
|
94
|
-
count += len(data)
|
95
|
-
data = await resp.read(CHUNK_SIZE)
|
96
|
-
except FileNotFoundError:
|
97
|
-
raise CloudFileNotFound()
|
66
|
+
init_url = self.get_file_path(bucket, upload_file_id)
|
67
|
+
metadata_init_url = self.metadata_key(init_url)
|
68
|
+
async with aiofiles.open(metadata_init_url, "w+") as resp:
|
69
|
+
await resp.write(json.dumps(metadata))
|
98
70
|
|
99
71
|
async def append(self, dm: FileDataManager, iterable, offset) -> int:
|
100
72
|
count = 0
|
@@ -118,6 +90,15 @@ class LocalFileStorageManager(FileStorageManager):
|
|
118
90
|
upload_file_id = dm.get("upload_file_id")
|
119
91
|
from_url = self.get_file_path(bucket, upload_file_id)
|
120
92
|
|
93
|
+
if dm.size > 0:
|
94
|
+
kbid = dm.get("kbid")
|
95
|
+
metadata = {
|
96
|
+
"FILENAME": dm.filename,
|
97
|
+
"CONTENT_TYPE": dm.content_type,
|
98
|
+
"SIZE": dm.size,
|
99
|
+
}
|
100
|
+
await self.set_metadata(kbid, upload_file_id, metadata)
|
101
|
+
|
121
102
|
path = dm.get("path")
|
122
103
|
to_url = self.get_file_path(bucket, path)
|
123
104
|
to_url_dirs = os.path.dirname(to_url)
|
@@ -138,6 +119,9 @@ class LocalFileStorageManager(FileStorageManager):
|
|
138
119
|
file_path = self.get_file_path(bucket, uri)
|
139
120
|
os.remove(file_path)
|
140
121
|
|
122
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
123
|
+
pass
|
124
|
+
|
141
125
|
|
142
126
|
class LocalBlobStore(BlobStore):
|
143
127
|
def __init__(self, local_testing_files: str):
|
nucliadb/writer/tus/s3.py
CHANGED
@@ -19,21 +19,22 @@
|
|
19
19
|
#
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
|
+
import base64
|
22
23
|
import uuid
|
23
24
|
from contextlib import AsyncExitStack
|
24
|
-
from typing import
|
25
|
+
from typing import Optional
|
25
26
|
|
26
27
|
import aiobotocore # type: ignore
|
27
28
|
import aiohttp
|
28
|
-
import backoff
|
29
|
+
import backoff
|
29
30
|
import botocore # type: ignore
|
30
31
|
from aiobotocore.session import AioSession # type: ignore
|
31
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
32
32
|
|
33
33
|
from nucliadb.writer import logger
|
34
34
|
from nucliadb.writer.tus.dm import FileDataManager
|
35
|
-
from nucliadb.writer.tus.exceptions import
|
35
|
+
from nucliadb.writer.tus.exceptions import ResumableURINotAvailable
|
36
36
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
37
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
37
38
|
from nucliadb_utils.storages.s3 import (
|
38
39
|
CHUNK_SIZE,
|
39
40
|
MIN_UPLOAD_SIZE,
|
@@ -53,9 +54,7 @@ class S3FileStorageManager(FileStorageManager):
|
|
53
54
|
chunk_size = CHUNK_SIZE
|
54
55
|
min_upload_size = MIN_UPLOAD_SIZE
|
55
56
|
|
56
|
-
@backoff.on_exception(
|
57
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
58
|
-
)
|
57
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
59
58
|
async def _abort_multipart(self, dm: FileDataManager):
|
60
59
|
try:
|
61
60
|
mpu = dm.get("mpu")
|
@@ -72,21 +71,25 @@ class S3FileStorageManager(FileStorageManager):
|
|
72
71
|
if dm.get("mpu") is not None:
|
73
72
|
await self._abort_multipart(dm)
|
74
73
|
|
74
|
+
custom_metadata: dict[str, str] = {
|
75
|
+
"base64_filename": base64.b64encode((dm.filename or "").encode()).decode(),
|
76
|
+
"content_type": dm.content_type or "",
|
77
|
+
"size": str(dm.size),
|
78
|
+
}
|
79
|
+
|
75
80
|
await dm.update(
|
76
81
|
path=path,
|
77
82
|
upload_file_id=upload_file_id,
|
78
83
|
multipart={"Parts": []},
|
79
84
|
block=1,
|
80
|
-
mpu=await self._create_multipart(path, bucket),
|
85
|
+
mpu=await self._create_multipart(path, bucket, custom_metadata),
|
81
86
|
bucket=bucket,
|
82
87
|
)
|
83
88
|
|
84
|
-
@backoff.on_exception(
|
85
|
-
|
86
|
-
)
|
87
|
-
async def _create_multipart(self, path, bucket):
|
89
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
90
|
+
async def _create_multipart(self, path, bucket, custom_metadata: dict[str, str]):
|
88
91
|
return await self.storage._s3aioclient.create_multipart_upload(
|
89
|
-
Bucket=bucket, Key=path
|
92
|
+
Bucket=bucket, Key=path, Metadata=custom_metadata
|
90
93
|
)
|
91
94
|
|
92
95
|
async def append(self, dm: FileDataManager, iterable, offset) -> int:
|
@@ -96,16 +99,12 @@ class S3FileStorageManager(FileStorageManager):
|
|
96
99
|
size += len(chunk)
|
97
100
|
part = await self._upload_part(dm, chunk)
|
98
101
|
multipart = dm.get("multipart")
|
99
|
-
multipart["Parts"].append(
|
100
|
-
{"PartNumber": dm.get("block"), "ETag": part["ETag"]}
|
101
|
-
)
|
102
|
+
multipart["Parts"].append({"PartNumber": dm.get("block"), "ETag": part["ETag"]})
|
102
103
|
await dm.update(multipart=multipart, block=dm.get("block") + 1)
|
103
104
|
|
104
105
|
return size
|
105
106
|
|
106
|
-
@backoff.on_exception(
|
107
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
108
|
-
)
|
107
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
109
108
|
async def _upload_part(self, dm: FileDataManager, data):
|
110
109
|
mpu = dm.get("mpu")
|
111
110
|
if mpu is None:
|
@@ -128,18 +127,14 @@ class S3FileStorageManager(FileStorageManager):
|
|
128
127
|
await dm.finish()
|
129
128
|
return path
|
130
129
|
|
131
|
-
@backoff.on_exception(
|
132
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
133
|
-
)
|
130
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
134
131
|
async def _complete_multipart_upload(self, dm: FileDataManager):
|
135
132
|
# if blocks is 0, it means the file is of zero length so we need to
|
136
133
|
# trick it to finish a multiple part with no data.
|
137
134
|
if dm.get("block") == 1:
|
138
135
|
part = await self._upload_part(dm, b"")
|
139
136
|
multipart = dm.get("multipart")
|
140
|
-
multipart["Parts"].append(
|
141
|
-
{"PartNumber": dm.get("block"), "ETag": part["ETag"]}
|
142
|
-
)
|
137
|
+
multipart["Parts"].append({"PartNumber": dm.get("block"), "ETag": part["ETag"]})
|
143
138
|
await dm.update(multipart=multipart, block=dm.get("block") + 1)
|
144
139
|
await self.storage._s3aioclient.complete_multipart_upload(
|
145
140
|
Bucket=dm.get("bucket"),
|
@@ -148,45 +143,10 @@ class S3FileStorageManager(FileStorageManager):
|
|
148
143
|
MultipartUpload=dm.get("multipart"),
|
149
144
|
)
|
150
145
|
|
151
|
-
@backoff.on_exception(
|
152
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
153
|
-
)
|
146
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
154
147
|
async def _download(self, uri: str, kbid: str, **kwargs):
|
155
148
|
bucket = self.storage.get_bucket_name(kbid)
|
156
|
-
return await self.storage._s3aioclient.get_object(
|
157
|
-
Bucket=bucket, Key=uri, **kwargs
|
158
|
-
)
|
159
|
-
|
160
|
-
async def iter_data(
|
161
|
-
self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
|
162
|
-
):
|
163
|
-
if headers is None:
|
164
|
-
headers = {}
|
165
|
-
try:
|
166
|
-
downloader = await self._download(uri, kbid, **headers)
|
167
|
-
except self.storage._s3aioclient.exceptions.NoSuchKey:
|
168
|
-
raise CloudFileNotFound()
|
169
|
-
|
170
|
-
# we do not want to timeout ever from this...
|
171
|
-
# downloader['Body'].set_socket_timeout(999999)
|
172
|
-
stream = downloader["Body"]
|
173
|
-
data = await stream.read(CHUNK_SIZE)
|
174
|
-
while True:
|
175
|
-
if not data:
|
176
|
-
break
|
177
|
-
yield data
|
178
|
-
data = await stream.read(CHUNK_SIZE)
|
179
|
-
|
180
|
-
async def read_range(
|
181
|
-
self, uri, kbid: str, start: int, end: int
|
182
|
-
) -> AsyncIterator[bytes]:
|
183
|
-
"""
|
184
|
-
Iterate through ranges of data
|
185
|
-
"""
|
186
|
-
async for chunk in self.iter_data(
|
187
|
-
uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
|
188
|
-
):
|
189
|
-
yield chunk
|
149
|
+
return await self.storage._s3aioclient.get_object(Bucket=bucket, Key=uri, **kwargs)
|
190
150
|
|
191
151
|
async def delete_upload(self, uri: str, kbid: str):
|
192
152
|
bucket = self.storage.get_bucket_name(kbid)
|
@@ -198,6 +158,10 @@ class S3FileStorageManager(FileStorageManager):
|
|
198
158
|
else:
|
199
159
|
raise AttributeError("No valid uri")
|
200
160
|
|
161
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
162
|
+
if uploaded_bytes % self.min_upload_size != 0:
|
163
|
+
raise ValueError(f"Intermediate chunks need to be multiples of {self.min_upload_size} bytes")
|
164
|
+
|
201
165
|
|
202
166
|
class S3BlobStore(BlobStore):
|
203
167
|
async def check_exists(self, bucket_name: str) -> bool:
|
@@ -213,9 +177,7 @@ class S3BlobStore(BlobStore):
|
|
213
177
|
async def create_bucket(self, bucket):
|
214
178
|
exists = await self.check_exists(bucket)
|
215
179
|
if not exists:
|
216
|
-
await create_bucket(
|
217
|
-
self._s3aioclient, bucket, self.bucket_tags, self.region_name
|
218
|
-
)
|
180
|
+
await create_bucket(self._s3aioclient, bucket, self.bucket_tags, self.region_name)
|
219
181
|
return exists
|
220
182
|
|
221
183
|
async def finalize(self):
|
@@ -247,9 +209,7 @@ class S3BlobStore(BlobStore):
|
|
247
209
|
verify=verify_ssl,
|
248
210
|
use_ssl=ssl,
|
249
211
|
region_name=region_name,
|
250
|
-
config=aiobotocore.config.AioConfig(
|
251
|
-
None, max_pool_connections=max_pool_connections
|
252
|
-
),
|
212
|
+
config=aiobotocore.config.AioConfig(None, max_pool_connections=max_pool_connections),
|
253
213
|
)
|
254
214
|
session = AioSession()
|
255
215
|
self._s3aioclient = await self._exit_stack.enter_async_context(
|
nucliadb/writer/tus/storage.py
CHANGED
@@ -21,15 +21,8 @@ from __future__ import annotations
|
|
21
21
|
|
22
22
|
from typing import AsyncIterator, Optional
|
23
23
|
|
24
|
-
from lru import LRU # type: ignore
|
25
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
26
|
-
from starlette.responses import StreamingResponse
|
27
|
-
|
28
|
-
from nucliadb.writer import logger
|
29
24
|
from nucliadb.writer.tus.dm import FileDataManager
|
30
|
-
from
|
31
|
-
|
32
|
-
CACHED_BUCKETS = LRU(50) # type: ignore
|
25
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
33
26
|
|
34
27
|
|
35
28
|
class BlobStore:
|
@@ -56,14 +49,9 @@ class FileStorageManager:
|
|
56
49
|
chunk_size: int
|
57
50
|
min_upload_size: Optional[int] = None
|
58
51
|
|
59
|
-
def __init__(self, storage):
|
52
|
+
def __init__(self, storage: BlobStore):
|
60
53
|
self.storage = storage
|
61
54
|
|
62
|
-
def read_range(
|
63
|
-
self, uri: str, kbid: str, start: int, end: int
|
64
|
-
) -> AsyncIterator[bytes]:
|
65
|
-
raise NotImplementedError()
|
66
|
-
|
67
55
|
def iter_data(
|
68
56
|
self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
|
69
57
|
) -> AsyncIterator[bytes]:
|
@@ -81,48 +69,6 @@ class FileStorageManager:
|
|
81
69
|
async def delete_upload(self, uri, kbid):
|
82
70
|
raise NotImplementedError()
|
83
71
|
|
84
|
-
async def full_download(self, content_length, content_type, upload_id):
|
85
|
-
return StreamingResponse(
|
86
|
-
self.iter_data(upload_id),
|
87
|
-
media_type=content_type,
|
88
|
-
headers={
|
89
|
-
"Content-Length": str(content_length),
|
90
|
-
"Content-Type": content_type,
|
91
|
-
},
|
92
|
-
)
|
93
|
-
|
94
|
-
async def range_download(
|
95
|
-
self, content_length, content_type, upload_id, range_header
|
96
|
-
):
|
97
|
-
try:
|
98
|
-
start, _, end = range_header.split("bytes=")[-1].partition("-")
|
99
|
-
start = int(start)
|
100
|
-
if len(end) == 0:
|
101
|
-
# bytes=0- is valid
|
102
|
-
end = content_length - 1
|
103
|
-
end = int(end) + 1 # python is inclusive, http is exclusive
|
104
|
-
except (IndexError, ValueError):
|
105
|
-
# range errors fallback to full download
|
106
|
-
raise HTTPRangeNotSatisfiable(detail=f"Range not parsable {range_header}")
|
107
|
-
if start > end or start < 0:
|
108
|
-
raise HTTPRangeNotSatisfiable(detail="Invalid range {start}-{end}")
|
109
|
-
if end > content_length:
|
110
|
-
raise HTTPRangeNotSatisfiable(
|
111
|
-
detail="Invalid range {start}-{end}, too large end value"
|
112
|
-
)
|
113
|
-
|
114
|
-
logger.debug(f"Range request: {range_header}")
|
115
|
-
headers = {
|
116
|
-
"Content-Range": f"bytes {start}-{end - 1}/{content_length}",
|
117
|
-
"Content-Type": content_type,
|
118
|
-
}
|
119
|
-
|
120
|
-
return StreamingResponse(
|
121
|
-
self.read_range(upload_id, start, end),
|
122
|
-
media_type=content_type,
|
123
|
-
headers=headers,
|
124
|
-
)
|
125
|
-
|
126
72
|
async def iterate_body_chunks(self, request, chunk_size):
|
127
73
|
partial = b""
|
128
74
|
remaining = b""
|
@@ -146,3 +92,6 @@ class FileStorageManager:
|
|
146
92
|
|
147
93
|
if partial or remaining:
|
148
94
|
yield partial + remaining
|
95
|
+
|
96
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
97
|
+
raise NotImplementedError()
|