nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -21,36 +21,48 @@ from datetime import datetime
|
|
21
21
|
from typing import Optional, Union
|
22
22
|
|
23
23
|
from google.protobuf.json_format import MessageToDict
|
24
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
25
24
|
|
26
25
|
import nucliadb_models as models
|
26
|
+
from nucliadb.common.models_utils import from_proto, to_proto
|
27
27
|
from nucliadb.ingest.fields.conversation import Conversation
|
28
28
|
from nucliadb.ingest.orm.resource import Resource as ORMResource
|
29
29
|
from nucliadb.ingest.processing import PushPayload
|
30
30
|
from nucliadb.writer import SERVICE_NAME
|
31
|
-
from nucliadb.writer.layouts import serialize_blocks
|
32
31
|
from nucliadb.writer.utilities import get_processing
|
33
|
-
from nucliadb_models.common import
|
32
|
+
from nucliadb_models.common import FieldTypeName
|
33
|
+
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
34
34
|
from nucliadb_models.conversation import PushConversation
|
35
35
|
from nucliadb_models.writer import (
|
36
|
-
GENERIC_MIME_TYPE,
|
37
36
|
CreateResourcePayload,
|
38
37
|
UpdateResourcePayload,
|
39
38
|
)
|
40
39
|
from nucliadb_protos import resources_pb2
|
40
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage
|
41
41
|
from nucliadb_utils.storages.storage import StorageField
|
42
42
|
from nucliadb_utils.utilities import get_storage
|
43
43
|
|
44
44
|
|
45
|
+
async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
|
46
|
+
processing = get_processing()
|
47
|
+
|
48
|
+
if field_pb.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
|
49
|
+
file_field = models.FileField(
|
50
|
+
language=field_pb.language,
|
51
|
+
password=field_pb.password,
|
52
|
+
file=models.File(payload=None, uri=field_pb.file.uri),
|
53
|
+
)
|
54
|
+
return processing.convert_external_filefield_to_str(file_field)
|
55
|
+
else:
|
56
|
+
storage = await get_storage(service_name=SERVICE_NAME)
|
57
|
+
return await processing.convert_internal_filefield_to_str(field_pb, storage)
|
58
|
+
|
59
|
+
|
45
60
|
async def extract_file_field(
|
46
61
|
field_id: str,
|
47
62
|
resource: ORMResource,
|
48
63
|
toprocess: PushPayload,
|
49
64
|
password: Optional[str] = None,
|
50
65
|
):
|
51
|
-
processing = get_processing()
|
52
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
53
|
-
|
54
66
|
field_type = resources_pb2.FieldType.FILE
|
55
67
|
field = await resource.get_field(field_id, field_type)
|
56
68
|
field_pb = await field.get_value()
|
@@ -60,9 +72,7 @@ async def extract_file_field(
|
|
60
72
|
if password is not None:
|
61
73
|
field_pb.password = password
|
62
74
|
|
63
|
-
toprocess.filefield[field_id] = await
|
64
|
-
field_pb, storage
|
65
|
-
)
|
75
|
+
toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
|
66
76
|
|
67
77
|
|
68
78
|
async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
@@ -70,12 +80,11 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
70
80
|
storage = await get_storage(service_name=SERVICE_NAME)
|
71
81
|
await resource.get_fields()
|
72
82
|
for (field_type, field_id), field in resource.fields.items():
|
73
|
-
field_type_name =
|
83
|
+
field_type_name = from_proto.field_type_name(field_type)
|
74
84
|
|
75
85
|
if field_type_name not in {
|
76
86
|
FieldTypeName.TEXT,
|
77
87
|
FieldTypeName.FILE,
|
78
|
-
FieldTypeName.LAYOUT,
|
79
88
|
FieldTypeName.CONVERSATION,
|
80
89
|
FieldTypeName.LINK,
|
81
90
|
}:
|
@@ -84,9 +93,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
84
93
|
field_pb = await field.get_value()
|
85
94
|
|
86
95
|
if field_type_name is FieldTypeName.FILE:
|
87
|
-
toprocess.filefield[
|
88
|
-
field_id
|
89
|
-
] = await processing.convert_internal_filefield_to_str(field_pb, storage)
|
96
|
+
toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
|
90
97
|
|
91
98
|
if field_type_name is FieldTypeName.LINK:
|
92
99
|
parsed_link = MessageToDict(
|
@@ -106,28 +113,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
106
113
|
parsed_text["format"] = models.PushTextFormat[parsed_text["format"]]
|
107
114
|
toprocess.textfield[field_id] = models.Text(**parsed_text)
|
108
115
|
|
109
|
-
if field_type_name is FieldTypeName.
|
110
|
-
parsed_layout = MessageToDict(
|
111
|
-
field_pb,
|
112
|
-
preserving_proto_field_name=True,
|
113
|
-
including_default_value_fields=True,
|
114
|
-
)
|
115
|
-
parsed_layout["format"] = resources_pb2.FieldLayout.Format.Value(
|
116
|
-
parsed_layout["format"]
|
117
|
-
)
|
118
|
-
|
119
|
-
for blockid, block in parsed_layout["body"]["blocks"].items():
|
120
|
-
cf = field_pb.body.blocks[blockid].file
|
121
|
-
block["file"] = await processing.convert_internal_cf_to_str(cf, storage)
|
122
|
-
|
123
|
-
parsed_layout["blocks"] = parsed_layout.get("body", {}).get("blocks", {})
|
124
|
-
del parsed_layout["body"]
|
125
|
-
|
126
|
-
toprocess.layoutfield[field_id] = models.LayoutDiff(**parsed_layout)
|
127
|
-
|
128
|
-
if field_type_name is FieldTypeName.CONVERSATION and isinstance(
|
129
|
-
field, Conversation
|
130
|
-
):
|
116
|
+
if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
|
131
117
|
metadata = await field.get_metadata()
|
132
118
|
if metadata.pages == 0:
|
133
119
|
continue
|
@@ -148,14 +134,13 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
148
134
|
await processing.convert_internal_cf_to_str(cf, storage)
|
149
135
|
for cf in message.content.attachments
|
150
136
|
]
|
151
|
-
parsed_message["content"]
|
152
|
-
|
153
|
-
|
137
|
+
if "attachments_fields" in parsed_message["content"]:
|
138
|
+
# Not defined on the push payload
|
139
|
+
del parsed_message["content"]["attachments_fields"]
|
140
|
+
parsed_message["content"]["format"] = resources_pb2.MessageContent.Format.Value(
|
154
141
|
parsed_message["content"]["format"]
|
155
142
|
)
|
156
|
-
full_conversation.messages.append(
|
157
|
-
models.PushMessage(**parsed_message)
|
158
|
-
)
|
143
|
+
full_conversation.messages.append(models.PushMessage(**parsed_message))
|
159
144
|
toprocess.conversationfield[field_id] = full_conversation
|
160
145
|
|
161
146
|
|
@@ -168,9 +153,7 @@ async def parse_fields(
|
|
168
153
|
x_skip_store: bool,
|
169
154
|
):
|
170
155
|
for key, file_field in item.files.items():
|
171
|
-
await parse_file_field(
|
172
|
-
key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store
|
173
|
-
)
|
156
|
+
await parse_file_field(key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store)
|
174
157
|
|
175
158
|
for key, link_field in item.links.items():
|
176
159
|
parse_link_field(key, link_field, writer, toprocess)
|
@@ -178,19 +161,8 @@ async def parse_fields(
|
|
178
161
|
for key, text_field in item.texts.items():
|
179
162
|
parse_text_field(key, text_field, writer, toprocess)
|
180
163
|
|
181
|
-
for key, layout_field in item.layouts.items():
|
182
|
-
await parse_layout_field(key, layout_field, writer, toprocess, kbid, uuid)
|
183
|
-
|
184
164
|
for key, conversation_field in item.conversations.items():
|
185
|
-
await parse_conversation_field(
|
186
|
-
key, conversation_field, writer, toprocess, kbid, uuid
|
187
|
-
)
|
188
|
-
|
189
|
-
for key, datetime_field in item.datetimes.items():
|
190
|
-
parse_datetime_field(key, datetime_field, writer, toprocess)
|
191
|
-
|
192
|
-
for key, keywordset_field in item.keywordsets.items():
|
193
|
-
parse_keywordset_field(key, keywordset_field, writer, toprocess)
|
165
|
+
await parse_conversation_field(key, conversation_field, writer, toprocess, kbid, uuid)
|
194
166
|
|
195
167
|
|
196
168
|
def parse_text_field(
|
@@ -200,9 +172,7 @@ def parse_text_field(
|
|
200
172
|
toprocess: PushPayload,
|
201
173
|
) -> None:
|
202
174
|
writer.texts[key].body = text_field.body
|
203
|
-
writer.texts[key].format = resources_pb2.FieldText.Format.Value(
|
204
|
-
text_field.format.value
|
205
|
-
)
|
175
|
+
writer.texts[key].format = resources_pb2.FieldText.Format.Value(text_field.format.value)
|
206
176
|
etw = resources_pb2.ExtractedTextWrapper()
|
207
177
|
etw.field.field = key
|
208
178
|
etw.field.field_type = resources_pb2.FieldType.TEXT
|
@@ -317,84 +287,16 @@ def parse_link_field(
|
|
317
287
|
if link_field.css_selector is not None:
|
318
288
|
writer.links[key].css_selector = link_field.css_selector
|
319
289
|
|
290
|
+
if link_field.xpath is not None:
|
291
|
+
writer.links[key].xpath = link_field.xpath
|
292
|
+
|
320
293
|
toprocess.linkfield[key] = models.LinkUpload(
|
321
294
|
link=link_field.uri,
|
322
295
|
headers=link_field.headers or {},
|
323
296
|
cookies=link_field.cookies or {},
|
324
297
|
localstorage=link_field.localstorage or {},
|
325
298
|
css_selector=link_field.css_selector,
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
def parse_keywordset_field(
|
330
|
-
key: str,
|
331
|
-
keywordset_field: models.FieldKeywordset,
|
332
|
-
writer: BrokerMessage,
|
333
|
-
toprocess: PushPayload,
|
334
|
-
) -> None:
|
335
|
-
if keywordset_field.keywords is None:
|
336
|
-
return
|
337
|
-
|
338
|
-
for keyword in keywordset_field.keywords:
|
339
|
-
fieldpb = resources_pb2.Keyword()
|
340
|
-
fieldpb.value = keyword.value
|
341
|
-
writer.keywordsets[key].keywords.append(fieldpb)
|
342
|
-
|
343
|
-
|
344
|
-
def parse_datetime_field(
|
345
|
-
key: str,
|
346
|
-
datetime_field: models.FieldDatetime,
|
347
|
-
writer: BrokerMessage,
|
348
|
-
toprocess: PushPayload,
|
349
|
-
) -> None:
|
350
|
-
if datetime_field.value is None:
|
351
|
-
return
|
352
|
-
|
353
|
-
writer.datetimes[key].value.FromDatetime(datetime_field.value)
|
354
|
-
|
355
|
-
|
356
|
-
async def parse_layout_field(
|
357
|
-
key: str,
|
358
|
-
layout_field: models.InputLayoutField,
|
359
|
-
writer: BrokerMessage,
|
360
|
-
toprocess: PushPayload,
|
361
|
-
kbid: str,
|
362
|
-
uuid: str,
|
363
|
-
) -> None:
|
364
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
365
|
-
processing = get_processing()
|
366
|
-
|
367
|
-
lc: resources_pb2.FieldLayout = await serialize_blocks(
|
368
|
-
layout_field, kbid, uuid, key, storage
|
369
|
-
)
|
370
|
-
writer.layouts[key].CopyFrom(lc)
|
371
|
-
|
372
|
-
toprocess_blocks = {}
|
373
|
-
for blockid, block in layout_field.body.blocks.items():
|
374
|
-
sf_conv_field: StorageField = storage.layout_field(
|
375
|
-
kbid, uuid, field=key, ident=block.ident
|
376
|
-
)
|
377
|
-
cf_conv_field = await storage.upload_b64file_to_cloudfile(
|
378
|
-
sf_conv_field,
|
379
|
-
block.file.payload.encode(),
|
380
|
-
block.file.filename,
|
381
|
-
block.file.content_type,
|
382
|
-
block.file.md5,
|
383
|
-
)
|
384
|
-
|
385
|
-
toprocess_blocks[blockid] = models.PushLayoutBlock(
|
386
|
-
x=block.x,
|
387
|
-
y=block.y,
|
388
|
-
cols=block.cols,
|
389
|
-
rows=block.rows,
|
390
|
-
type=block.type,
|
391
|
-
ident=block.ident,
|
392
|
-
payload=block.payload,
|
393
|
-
file=await processing.convert_internal_cf_to_str(cf_conv_field, storage),
|
394
|
-
)
|
395
|
-
|
396
|
-
toprocess.layoutfield[key] = models.LayoutDiff(
|
397
|
-
format=lc.format, blocks=toprocess_blocks # type: ignore
|
299
|
+
xpath=link_field.xpath,
|
398
300
|
)
|
399
301
|
|
400
302
|
|
@@ -429,8 +331,16 @@ async def parse_conversation_field(
|
|
429
331
|
)
|
430
332
|
|
431
333
|
cm.content.text = message.content.text
|
432
|
-
cm.content.format = resources_pb2.MessageContent.Format.Value(
|
433
|
-
|
334
|
+
cm.content.format = resources_pb2.MessageContent.Format.Value(message.content.format.value)
|
335
|
+
cm.content.attachments_fields.extend(
|
336
|
+
[
|
337
|
+
resources_pb2.FieldRef(
|
338
|
+
field_type=to_proto.field_type_name(attachment.field_type),
|
339
|
+
field_id=attachment.field_id,
|
340
|
+
split=attachment.split if attachment.split is not None else "",
|
341
|
+
)
|
342
|
+
for attachment in message.content.attachments_fields
|
343
|
+
]
|
434
344
|
)
|
435
345
|
|
436
346
|
for count, file in enumerate(message.content.attachments):
|
@@ -16,10 +16,9 @@
|
|
16
16
|
#
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
from nucliadb_protos.resources_pb2 import Origin
|
20
|
-
|
21
19
|
from nucliadb_models import Extra, InputOrigin
|
22
20
|
from nucliadb_protos import resources_pb2
|
21
|
+
from nucliadb_protos.resources_pb2 import Origin
|
23
22
|
|
24
23
|
|
25
24
|
def parse_origin(origin: Origin, origin_payload: InputOrigin):
|
nucliadb/writer/settings.py
CHANGED
@@ -19,7 +19,8 @@
|
|
19
19
|
#
|
20
20
|
from typing import Optional
|
21
21
|
|
22
|
-
from pydantic import
|
22
|
+
from pydantic import Field
|
23
|
+
from pydantic_settings import BaseSettings
|
23
24
|
|
24
25
|
|
25
26
|
class Settings(BaseSettings):
|
@@ -32,10 +33,10 @@ class BackPressureSettings(BaseSettings):
|
|
32
33
|
enabled: bool = Field(
|
33
34
|
default=False,
|
34
35
|
description="Enable or disable back pressure.",
|
35
|
-
|
36
|
+
alias="back_pressure_enabled",
|
36
37
|
)
|
37
38
|
indexing_rate: float = Field(
|
38
|
-
default=
|
39
|
+
default=4,
|
39
40
|
description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time", # noqa
|
40
41
|
)
|
41
42
|
ingest_rate: float = Field(
|
@@ -47,16 +48,20 @@ class BackPressureSettings(BaseSettings):
|
|
47
48
|
description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time", # noqa
|
48
49
|
)
|
49
50
|
max_indexing_pending: int = Field(
|
50
|
-
default=
|
51
|
+
default=200,
|
51
52
|
description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks", # noqa
|
53
|
+
alias="back_pressure_max_indexing_pending",
|
52
54
|
)
|
53
55
|
max_ingest_pending: int = Field(
|
54
|
-
default
|
56
|
+
# Disabled by default
|
57
|
+
default=0,
|
55
58
|
description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks", # noqa
|
59
|
+
alias="back_pressure_max_ingest_pending",
|
56
60
|
)
|
57
61
|
max_processing_pending: int = Field(
|
58
62
|
default=1000,
|
59
63
|
description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks", # noqa
|
64
|
+
alias="back_pressure_max_processing_pending",
|
60
65
|
)
|
61
66
|
indexing_check_interval: int = Field(
|
62
67
|
default=30,
|
@@ -66,6 +71,10 @@ class BackPressureSettings(BaseSettings):
|
|
66
71
|
default=30,
|
67
72
|
description="Interval in seconds to check the ingest pending messages",
|
68
73
|
)
|
74
|
+
max_wait_time: int = Field(
|
75
|
+
default=60,
|
76
|
+
description="Max time in seconds to wait before trying again after back pressure",
|
77
|
+
)
|
69
78
|
|
70
79
|
|
71
80
|
settings = Settings()
|
nucliadb/writer/tus/__init__.py
CHANGED
@@ -23,10 +23,6 @@ from typing import Optional
|
|
23
23
|
from nucliadb.writer.settings import settings as writer_settings
|
24
24
|
from nucliadb.writer.tus.dm import FileDataManager, RedisFileDataManagerFactory
|
25
25
|
from nucliadb.writer.tus.exceptions import ManagerNotAvailable
|
26
|
-
from nucliadb.writer.tus.gcs import GCloudBlobStore, GCloudFileStorageManager
|
27
|
-
from nucliadb.writer.tus.local import LocalBlobStore, LocalFileStorageManager
|
28
|
-
from nucliadb.writer.tus.pg import PGBlobStore, PGFileStorageManager
|
29
|
-
from nucliadb.writer.tus.s3 import S3BlobStore, S3FileStorageManager
|
30
26
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
31
27
|
from nucliadb_utils.exceptions import ConfigurationError
|
32
28
|
from nucliadb_utils.settings import FileBackendConfig, storage_settings
|
@@ -48,6 +44,8 @@ REDIS_FILE_DATA_MANAGER_FACTORY: Optional[RedisFileDataManagerFactory] = None
|
|
48
44
|
async def initialize():
|
49
45
|
global DRIVER
|
50
46
|
if storage_settings.file_backend == FileBackendConfig.GCS:
|
47
|
+
from nucliadb.writer.tus.gcs import GCloudBlobStore, GCloudFileStorageManager
|
48
|
+
|
51
49
|
storage_backend = GCloudBlobStore()
|
52
50
|
|
53
51
|
await storage_backend.initialize(
|
@@ -64,6 +62,8 @@ async def initialize():
|
|
64
62
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
65
63
|
|
66
64
|
elif storage_settings.file_backend == FileBackendConfig.S3:
|
65
|
+
from nucliadb.writer.tus.s3 import S3BlobStore, S3FileStorageManager
|
66
|
+
|
67
67
|
storage_backend = S3BlobStore()
|
68
68
|
|
69
69
|
await storage_backend.initialize(
|
@@ -83,6 +83,8 @@ async def initialize():
|
|
83
83
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
84
84
|
|
85
85
|
elif storage_settings.file_backend == FileBackendConfig.LOCAL:
|
86
|
+
from nucliadb.writer.tus.local import LocalBlobStore, LocalFileStorageManager
|
87
|
+
|
86
88
|
storage_backend = LocalBlobStore(storage_settings.local_files)
|
87
89
|
|
88
90
|
await storage_backend.initialize()
|
@@ -91,12 +93,18 @@ async def initialize():
|
|
91
93
|
|
92
94
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
93
95
|
|
94
|
-
elif storage_settings.file_backend == FileBackendConfig.
|
95
|
-
|
96
|
+
elif storage_settings.file_backend == FileBackendConfig.AZURE:
|
97
|
+
from nucliadb.writer.tus.azure import AzureBlobStore, AzureFileStorageManager
|
96
98
|
|
97
|
-
|
99
|
+
if storage_settings.azure_account_url is None:
|
100
|
+
raise ConfigurationError("AZURE_ACCOUNT_URL env variable not configured")
|
98
101
|
|
99
|
-
|
102
|
+
storage_backend = AzureBlobStore()
|
103
|
+
await storage_backend.initialize(
|
104
|
+
storage_settings.azure_account_url,
|
105
|
+
connection_string=storage_settings.azure_connection_string,
|
106
|
+
)
|
107
|
+
storage_manager = AzureFileStorageManager(storage_backend)
|
100
108
|
|
101
109
|
DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
|
102
110
|
|
@@ -117,7 +125,7 @@ async def finalize():
|
|
117
125
|
REDIS_FILE_DATA_MANAGER_FACTORY = None
|
118
126
|
|
119
127
|
|
120
|
-
def get_dm() -> FileDataManager:
|
128
|
+
def get_dm() -> FileDataManager:
|
121
129
|
if writer_settings.dm_enabled:
|
122
130
|
global REDIS_FILE_DATA_MANAGER_FACTORY
|
123
131
|
if REDIS_FILE_DATA_MANAGER_FACTORY is None:
|
@@ -136,9 +144,3 @@ def get_storage_manager() -> FileStorageManager:
|
|
136
144
|
if DRIVER is None:
|
137
145
|
raise ManagerNotAvailable()
|
138
146
|
return DRIVER.manager
|
139
|
-
|
140
|
-
|
141
|
-
def clear_storage():
|
142
|
-
global DRIVER
|
143
|
-
|
144
|
-
DRIVER = None
|
@@ -0,0 +1,111 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
from __future__ import annotations
|
21
|
+
|
22
|
+
from typing import Optional
|
23
|
+
|
24
|
+
from nucliadb.writer import logger
|
25
|
+
from nucliadb.writer.tus.dm import FileDataManager
|
26
|
+
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
27
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
28
|
+
from nucliadb_utils.storages import CHUNK_SIZE
|
29
|
+
from nucliadb_utils.storages.azure import AzureObjectStore
|
30
|
+
from nucliadb_utils.storages.exceptions import ObjectNotFoundError
|
31
|
+
from nucliadb_utils.storages.utils import ObjectMetadata
|
32
|
+
|
33
|
+
|
34
|
+
class AzureBlobStore(BlobStore):
    """TUS blob store that delegates bucket operations to an ``AzureObjectStore``.

    The underlying object store is created by :meth:`initialize` and released
    by :meth:`finalize`.
    """

    # Declared at class level so that ``finalize()`` and ``object_store`` hit
    # their explicit ``None`` checks (instead of raising AttributeError) when
    # ``initialize()`` has not been called yet.
    _object_store: Optional[AzureObjectStore] = None

    async def finalize(self):
        """Close the underlying object store, if one was ever created.

        Errors raised while closing are logged and swallowed (best effort);
        the reference to the object store is dropped afterwards either way.
        """
        if self._object_store is None:
            return
        try:
            await self._object_store.finalize()
        except Exception:
            logger.exception("Error closing AzureBlobStore")
        self._object_store = None

    async def initialize(self, account_url: str, connection_string: Optional[str] = None):
        """Create and initialize the Azure object store.

        Args:
            account_url: passed through to ``AzureObjectStore``.
            connection_string: optional, passed through to ``AzureObjectStore``.
        """
        # Bucket name template; ``{kbid}`` is left as a placeholder here.
        self.bucket = "nucliadb-{kbid}"
        self.source = CloudFile.Source.AZURE
        self._object_store = AzureObjectStore(account_url, connection_string=connection_string)
        await self._object_store.initialize()

    @property
    def object_store(self) -> AzureObjectStore:
        """Return the initialized object store.

        Raises:
            AssertionError: if the store has not been initialized (or was finalized).
        """
        assert self._object_store is not None
        return self._object_store

    async def check_exists(self, bucket_name: str) -> bool:
        """Return whether ``bucket_name`` exists according to the object store."""
        return await self.object_store.bucket_exists(bucket_name)

    async def create_bucket(self, bucket_name: str) -> bool:
        """Ask the object store to create ``bucket_name``.

        Returns the negation of what ``bucket_create`` reports.
        NOTE(review): presumably ``True`` here means "the bucket already
        existed" -- confirm against ``AzureObjectStore.bucket_create``.
        """
        created = await self.object_store.bucket_create(bucket_name)
        return not created
|
61
|
+
|
62
|
+
|
63
|
+
class AzureFileStorageManager(FileStorageManager):
    """TUS file storage manager that uploads through an ``AzureObjectStore``.

    Upload state (bucket, path, offset) is tracked in the ``FileDataManager``
    passed to each call.
    """

    storage: AzureBlobStore
    chunk_size = CHUNK_SIZE
    min_upload_size = None

    @property
    def object_store(self) -> AzureObjectStore:
        """Shortcut to the object store owned by ``self.storage``."""
        return self.storage.object_store

    async def start(self, dm: FileDataManager, path: str, kbid: str):
        """Begin a multipart upload at ``path`` and record its location in ``dm``."""
        bucket = self.storage.get_bucket_name(kbid)
        # NOTE(review): comparing the filename against the integer 0 looks
        # unusual; kept as-is -- confirm what ``dm.filename`` yields when unset.
        filename = "file" if dm.filename == 0 else dm.filename
        upload_metadata = ObjectMetadata(
            filename=filename,
            content_type=dm.content_type,
            size=dm.size,
        )
        await self.object_store.upload_multipart_start(bucket, path, upload_metadata)
        await dm.update(path=path, bucket=bucket)

    async def delete_upload(self, uri: str, kbid: str) -> None:
        """Delete the object at ``uri``; a missing object is only logged."""
        bucket = self.storage.get_bucket_name(kbid)
        log_context = {"uri": uri, "kbid": kbid, "bucket": bucket}
        try:
            await self.object_store.delete(bucket, uri)
        except ObjectNotFoundError:
            logger.warning(
                "Attempt to delete an upload but not found",
                extra=log_context,
            )

    async def append(self, dm: FileDataManager, iterable, offset: int) -> int:
        """Append the chunks in ``iterable`` to the upload tracked by ``dm``.

        Stores ``offset`` in ``dm`` afterwards and returns the number of bytes
        reported by the object store.
        """
        target_bucket = dm.get("bucket")
        assert target_bucket is not None
        target_path = dm.get("path")
        assert target_path is not None
        written = await self.object_store.upload_multipart_append(
            target_bucket, target_path, iterable
        )
        await dm.update(offset=offset)
        return written

    async def finish(self, dm: FileDataManager):
        """Finish the upload tracked by ``dm`` and return its stored path."""
        # Read the path before finishing, matching the original statement order.
        stored_path = dm.get("path")
        await dm.finish()
        return stored_path

    def validate_intermediate_chunk(self, uploaded_bytes: int):
        """No validation is performed on intermediate chunks for this backend."""
        return None
|
nucliadb/writer/tus/dm.py
CHANGED
@@ -20,6 +20,7 @@
|
|
20
20
|
import time
|
21
21
|
from typing import Any, Optional
|
22
22
|
|
23
|
+
import backoff
|
23
24
|
import orjson
|
24
25
|
from redis import asyncio as aioredis
|
25
26
|
from starlette.requests import Request
|
@@ -33,6 +34,11 @@ class NoRedisConfigured(Exception):
|
|
33
34
|
pass
|
34
35
|
|
35
36
|
|
37
|
+
RETRIABLE_REDIS_ERRORS = (
|
38
|
+
aioredis.ConnectionError,
|
39
|
+
aioredis.TimeoutError,
|
40
|
+
)
|
41
|
+
|
36
42
|
DATA: dict[str, Any] = {}
|
37
43
|
|
38
44
|
|
@@ -59,10 +65,7 @@ class FileDataManager:
|
|
59
65
|
# someone else
|
60
66
|
last_activity: Optional[int] = self._data.get("last_activity")
|
61
67
|
if last_activity and (time.time() - last_activity) < self._ttl:
|
62
|
-
if (
|
63
|
-
request.headers
|
64
|
-
and request.headers.get("tus-override-upload", "0") != "1"
|
65
|
-
):
|
68
|
+
if request.headers and request.headers.get("tus-override-upload", "0") != "1":
|
66
69
|
raise HTTPPreconditionFailed(
|
67
70
|
detail="There is already an active tusupload that conflicts with this one."
|
68
71
|
)
|
@@ -136,7 +139,7 @@ class RedisFileDataManagerFactory:
|
|
136
139
|
|
137
140
|
async def finalize(self):
|
138
141
|
try:
|
139
|
-
await self.redis.
|
142
|
+
await self.redis.aclose(close_connection_pool=True)
|
140
143
|
except Exception:
|
141
144
|
logger.warning("Error closing redis connection", exc_info=True)
|
142
145
|
pass
|
@@ -146,6 +149,9 @@ class RedisFileDataManager(FileDataManager):
|
|
146
149
|
def __init__(self, redis: aioredis.Redis):
|
147
150
|
self.redis = redis
|
148
151
|
|
152
|
+
@backoff.on_exception(
|
153
|
+
backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
|
154
|
+
)
|
149
155
|
async def load(self, key):
|
150
156
|
# preload data
|
151
157
|
self.key = key
|
@@ -157,6 +163,9 @@ class RedisFileDataManager(FileDataManager):
|
|
157
163
|
self._data = orjson.loads(data)
|
158
164
|
self._loaded = True
|
159
165
|
|
166
|
+
@backoff.on_exception(
|
167
|
+
backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
|
168
|
+
)
|
160
169
|
async def save(self):
|
161
170
|
if self.key is None:
|
162
171
|
raise Exception("Not initialized")
|
@@ -164,6 +173,9 @@ class RedisFileDataManager(FileDataManager):
|
|
164
173
|
value = orjson.dumps(self._data)
|
165
174
|
await self.redis.set(self.key, value, ex=self._ttl)
|
166
175
|
|
176
|
+
@backoff.on_exception(
|
177
|
+
backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
|
178
|
+
)
|
167
179
|
async def _delete_key(self):
|
168
180
|
if self.key is None:
|
169
181
|
raise Exception("Not initialized")
|
@@ -31,9 +31,7 @@ class HTTPException(StarletteHTTPException):
|
|
31
31
|
|
32
32
|
def __init__(self, detail: Optional[str] = None):
|
33
33
|
if self._status_code:
|
34
|
-
super(HTTPException, self).__init__(
|
35
|
-
status_code=self._status_code, detail=detail
|
36
|
-
)
|
34
|
+
super(HTTPException, self).__init__(status_code=self._status_code, detail=detail)
|
37
35
|
else:
|
38
36
|
raise AttributeError("Status code not defined")
|
39
37
|
|