nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/writer/api/v1/upload.py
CHANGED
@@ -18,21 +18,18 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
import base64
|
21
|
-
import mimetypes
|
22
21
|
import pickle
|
23
22
|
import uuid
|
24
23
|
from datetime import datetime
|
25
24
|
from hashlib import md5
|
26
25
|
from io import BytesIO
|
27
|
-
from typing import Optional
|
26
|
+
from typing import Annotated, Optional
|
28
27
|
|
29
28
|
from fastapi import HTTPException
|
30
29
|
from fastapi.params import Header
|
31
30
|
from fastapi.requests import Request
|
32
31
|
from fastapi.responses import Response
|
33
|
-
from fastapi_versioning import version
|
34
|
-
from nucliadb_protos.resources_pb2 import FieldFile, Metadata
|
35
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
32
|
+
from fastapi_versioning import version
|
36
33
|
from starlette.requests import Request as StarletteRequest
|
37
34
|
|
38
35
|
from nucliadb.common import datamanagers
|
@@ -40,13 +37,15 @@ from nucliadb.ingest.orm.utils import set_title
|
|
40
37
|
from nucliadb.ingest.processing import PushPayload, Source
|
41
38
|
from nucliadb.models.responses import HTTPClientError
|
42
39
|
from nucliadb.writer import SERVICE_NAME
|
40
|
+
from nucliadb.writer.api.v1 import transaction
|
43
41
|
from nucliadb.writer.api.v1.resource import (
|
44
42
|
get_rid_from_slug_or_raise_error,
|
45
43
|
validate_rid_exists_or_raise_error,
|
46
44
|
)
|
45
|
+
from nucliadb.writer.api.v1.slug import ensure_slug_uniqueness, noop_context_manager
|
47
46
|
from nucliadb.writer.back_pressure import maybe_back_pressure
|
48
47
|
from nucliadb.writer.resource.audit import parse_audit
|
49
|
-
from nucliadb.writer.resource.basic import
|
48
|
+
from nucliadb.writer.resource.basic import parse_basic_creation
|
50
49
|
from nucliadb.writer.resource.field import parse_fields
|
51
50
|
from nucliadb.writer.resource.origin import parse_extra, parse_origin
|
52
51
|
from nucliadb.writer.tus import TUSUPLOAD, UPLOAD, get_dm, get_storage_manager
|
@@ -58,20 +57,21 @@ from nucliadb.writer.tus.exceptions import (
|
|
58
57
|
InvalidTUSMetadata,
|
59
58
|
ResumableURINotAvailable,
|
60
59
|
)
|
61
|
-
from nucliadb.writer.tus.storage import FileStorageManager
|
60
|
+
from nucliadb.writer.tus.storage import FileStorageManager
|
62
61
|
from nucliadb.writer.tus.utils import parse_tus_metadata
|
63
62
|
from nucliadb.writer.utilities import get_processing
|
63
|
+
from nucliadb_models import content_types
|
64
64
|
from nucliadb_models.resource import NucliaDBRoles
|
65
65
|
from nucliadb_models.utils import FieldIdString
|
66
66
|
from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
|
67
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, Metadata
|
68
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage
|
67
69
|
from nucliadb_utils.authentication import requires_one
|
68
70
|
from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
|
69
71
|
from nucliadb_utils.storages.storage import KB_RESOURCE_FIELD
|
70
|
-
from nucliadb_utils.transaction import TransactionCommitTimeoutError
|
71
72
|
from nucliadb_utils.utilities import (
|
72
73
|
get_partitioning,
|
73
74
|
get_storage,
|
74
|
-
get_transaction_utility,
|
75
75
|
)
|
76
76
|
|
77
77
|
from .router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREFIX, api
|
@@ -82,6 +82,10 @@ TUS_HEADERS = {
|
|
82
82
|
"Tus-Extension": "creation-defer-length",
|
83
83
|
}
|
84
84
|
|
85
|
+
ExtractStrategyHeader = Header(
|
86
|
+
description="Extract strategy to use when uploading a file. If not provided, the default strategy will be used.",
|
87
|
+
)
|
88
|
+
|
85
89
|
|
86
90
|
@api.options(
|
87
91
|
f"/{KB_PREFIX}/{{kbid}}/{RSLUG_PREFIX}/{{rslug}}/file/{{field}}/{TUSUPLOAD}/{{upload_id}}",
|
@@ -142,9 +146,12 @@ async def tus_post_rslug_prefix(
|
|
142
146
|
rslug: str,
|
143
147
|
field: FieldIdString,
|
144
148
|
item: Optional[CreateResourcePayload] = None,
|
149
|
+
x_extract_strategy: Annotated[Optional[str], ExtractStrategyHeader] = None,
|
145
150
|
) -> Response:
|
146
151
|
rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
|
147
|
-
return await _tus_post(
|
152
|
+
return await _tus_post(
|
153
|
+
request, kbid, item, path_rid=rid, field_id=field, extract_strategy=x_extract_strategy
|
154
|
+
)
|
148
155
|
|
149
156
|
|
150
157
|
@api.post(
|
@@ -161,8 +168,11 @@ async def tus_post_rid_prefix(
|
|
161
168
|
path_rid: str,
|
162
169
|
field: FieldIdString,
|
163
170
|
item: Optional[CreateResourcePayload] = None,
|
171
|
+
x_extract_strategy: Annotated[Optional[str], ExtractStrategyHeader] = None,
|
164
172
|
) -> Response:
|
165
|
-
return await _tus_post(
|
173
|
+
return await _tus_post(
|
174
|
+
request, kbid, item, path_rid=path_rid, field_id=field, extract_strategy=x_extract_strategy
|
175
|
+
)
|
166
176
|
|
167
177
|
|
168
178
|
@api.post(
|
@@ -177,8 +187,9 @@ async def tus_post(
|
|
177
187
|
request: Request,
|
178
188
|
kbid: str,
|
179
189
|
item: Optional[CreateResourcePayload] = None,
|
190
|
+
x_extract_strategy: Annotated[Optional[str], ExtractStrategyHeader] = None,
|
180
191
|
) -> Response:
|
181
|
-
return await _tus_post(request, kbid, item)
|
192
|
+
return await _tus_post(request, kbid, item, extract_strategy=x_extract_strategy)
|
182
193
|
|
183
194
|
|
184
195
|
# called by one the three POST above - there are defined distinctly to produce clean API doc
|
@@ -188,6 +199,7 @@ async def _tus_post(
|
|
188
199
|
item: Optional[CreateResourcePayload] = None,
|
189
200
|
path_rid: Optional[str] = None,
|
190
201
|
field_id: Optional[str] = None,
|
202
|
+
extract_strategy: Optional[str] = None,
|
191
203
|
) -> Response:
|
192
204
|
"""
|
193
205
|
An empty POST request is used to create a new upload resource.
|
@@ -196,6 +208,13 @@ async def _tus_post(
|
|
196
208
|
if path_rid is not None:
|
197
209
|
await validate_rid_exists_or_raise_error(kbid, path_rid)
|
198
210
|
|
211
|
+
kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
|
212
|
+
if item and item.hidden and not (kb_config and kb_config.hidden_resources_enabled):
|
213
|
+
raise HTTPException(
|
214
|
+
status_code=422,
|
215
|
+
detail="Cannot hide a resource: the KB does not have hidden resources enabled",
|
216
|
+
)
|
217
|
+
|
199
218
|
await maybe_back_pressure(request, kbid, resource_uuid=path_rid)
|
200
219
|
|
201
220
|
dm = get_dm()
|
@@ -221,15 +240,11 @@ async def _tus_post(
|
|
221
240
|
try:
|
222
241
|
metadata = parse_tus_metadata(request.headers["upload-metadata"])
|
223
242
|
except InvalidTUSMetadata as exc:
|
224
|
-
raise HTTPBadRequest(
|
225
|
-
detail=f"Upload-Metadata header contains errors: {str(exc)}"
|
226
|
-
)
|
243
|
+
raise HTTPBadRequest(detail=f"Upload-Metadata header contains errors: {str(exc)}")
|
227
244
|
else:
|
228
245
|
metadata = {}
|
229
246
|
|
230
|
-
path, rid, field = await validate_field_upload(
|
231
|
-
kbid, path_rid, field_id, metadata.get("md5")
|
232
|
-
)
|
247
|
+
path, rid, field = await validate_field_upload(kbid, path_rid, field_id, metadata.get("md5"))
|
233
248
|
|
234
249
|
if implies_resource_creation:
|
235
250
|
# When uploading a file to a new kb resource, we want to allow multiple
|
@@ -255,8 +270,15 @@ async def _tus_post(
|
|
255
270
|
request_content_type = None
|
256
271
|
if item is None:
|
257
272
|
request_content_type = request.headers.get("content-type")
|
258
|
-
if
|
259
|
-
request_content_type =
|
273
|
+
if request_content_type is None:
|
274
|
+
request_content_type = content_types.guess(metadata["filename"]) or "application/octet-stream"
|
275
|
+
|
276
|
+
if request_content_type is not None and not content_types.valid(request_content_type):
|
277
|
+
raise HTTPException(
|
278
|
+
status_code=415,
|
279
|
+
detail=f"Unsupported content type: {request_content_type}",
|
280
|
+
)
|
281
|
+
|
260
282
|
metadata.setdefault("content_type", request_content_type)
|
261
283
|
|
262
284
|
metadata["implies_resource_creation"] = implies_resource_creation
|
@@ -275,6 +297,7 @@ async def _tus_post(
|
|
275
297
|
deferred_length=deferred_length,
|
276
298
|
offset=0,
|
277
299
|
item=creation_payload,
|
300
|
+
extract_strategy=extract_strategy,
|
278
301
|
)
|
279
302
|
|
280
303
|
if size is not None:
|
@@ -286,9 +309,7 @@ async def _tus_post(
|
|
286
309
|
await dm.save()
|
287
310
|
|
288
311
|
# Find the URL for upload, with the same parameter as this call
|
289
|
-
location = api.url_path_for(
|
290
|
-
"Upload information", upload_id=upload_id, **request.path_params
|
291
|
-
)
|
312
|
+
location = api.url_path_for("Upload information", upload_id=upload_id, **request.path_params)
|
292
313
|
return Response(
|
293
314
|
status_code=201,
|
294
315
|
headers={
|
@@ -465,7 +486,7 @@ async def _tus_patch(
|
|
465
486
|
field: Optional[str] = None,
|
466
487
|
) -> Response:
|
467
488
|
"""
|
468
|
-
Upload all bytes in the requests and append them in the
|
489
|
+
Upload all bytes in the requests and append them in the specified offset
|
469
490
|
"""
|
470
491
|
if rid is not None:
|
471
492
|
await validate_rid_exists_or_raise_error(kbid, rid)
|
@@ -494,8 +515,7 @@ async def _tus_patch(
|
|
494
515
|
|
495
516
|
if offset != dm.offset:
|
496
517
|
raise HTTPConflict(
|
497
|
-
detail=f"Current upload offset({offset}) does not match "
|
498
|
-
f"object offset {dm.offset}"
|
518
|
+
detail=f"Current upload offset({offset}) does not match " f"object offset {dm.offset}"
|
499
519
|
)
|
500
520
|
|
501
521
|
storage_manager = get_storage_manager()
|
@@ -507,9 +527,7 @@ async def _tus_patch(
|
|
507
527
|
|
508
528
|
if to_upload and read_bytes != to_upload: # pragma: no cover
|
509
529
|
# check length matches if provided
|
510
|
-
raise HTTPPreconditionFailed(
|
511
|
-
detail="Upload size does not match what was provided"
|
512
|
-
)
|
530
|
+
raise HTTPPreconditionFailed(detail="Upload size does not match what was provided")
|
513
531
|
await dm.update(offset=offset + read_bytes)
|
514
532
|
|
515
533
|
headers = {
|
@@ -521,7 +539,6 @@ async def _tus_patch(
|
|
521
539
|
}
|
522
540
|
|
523
541
|
upload_finished = dm.get("size") is not None and dm.offset >= dm.get("size")
|
524
|
-
|
525
542
|
if upload_finished:
|
526
543
|
rid = dm.get("rid", rid)
|
527
544
|
if rid is None:
|
@@ -540,13 +557,19 @@ async def _tus_patch(
|
|
540
557
|
if isinstance(item_payload, str):
|
541
558
|
item_payload = item_payload.encode()
|
542
559
|
creation_payload = pickle.loads(base64.b64decode(item_payload))
|
560
|
+
|
561
|
+
content_type = dm.get("metadata", {}).get("content_type")
|
562
|
+
if content_type is not None and not content_types.valid(content_type):
|
563
|
+
return HTTPClientError(
|
564
|
+
status_code=415,
|
565
|
+
detail=f"Unsupported content type: {content_type}",
|
566
|
+
)
|
567
|
+
|
543
568
|
try:
|
544
569
|
seqid = await store_file_on_nuclia_db(
|
545
570
|
size=dm.get("size"),
|
546
|
-
content_type=
|
547
|
-
override_resource_title=dm.get("metadata", {}).get(
|
548
|
-
"implies_resource_creation", False
|
549
|
-
),
|
571
|
+
content_type=content_type,
|
572
|
+
override_resource_title=dm.get("metadata", {}).get("implies_resource_creation", False),
|
550
573
|
filename=dm.get("metadata", {}).get("filename"),
|
551
574
|
password=dm.get("metadata", {}).get("password"),
|
552
575
|
language=dm.get("metadata", {}).get("language"),
|
@@ -559,26 +582,24 @@ async def _tus_patch(
|
|
559
582
|
request=request,
|
560
583
|
bucket=storage_manager.storage.get_bucket_name(kbid),
|
561
584
|
item=creation_payload,
|
585
|
+
extract_strategy=dm.get("extract_strategy") or None,
|
562
586
|
)
|
563
587
|
except LimitsExceededError as exc:
|
564
588
|
raise HTTPException(status_code=exc.status_code, detail=exc.detail)
|
565
589
|
|
566
590
|
headers["NDB-Seq"] = f"{seqid}"
|
567
591
|
else:
|
568
|
-
|
592
|
+
validate_intermediate_tus_chunk(read_bytes, storage_manager)
|
569
593
|
await dm.save()
|
570
594
|
|
571
595
|
return Response(headers=headers)
|
572
596
|
|
573
597
|
|
574
|
-
def
|
575
|
-
|
576
|
-
storage_manager.
|
577
|
-
|
578
|
-
|
579
|
-
raise HTTPPreconditionFailed(
|
580
|
-
detail=f"Intermediate chunks cannot be smaller than {storage_manager.min_upload_size} bytes"
|
581
|
-
)
|
598
|
+
def validate_intermediate_tus_chunk(read_bytes: int, storage_manager: FileStorageManager):
|
599
|
+
try:
|
600
|
+
storage_manager.validate_intermediate_chunk(read_bytes)
|
601
|
+
except ValueError as err:
|
602
|
+
raise HTTPPreconditionFailed(detail=str(err))
|
582
603
|
|
583
604
|
|
584
605
|
@api.post(
|
@@ -599,6 +620,7 @@ async def upload_rslug_prefix(
|
|
599
620
|
x_password: Optional[list[str]] = Header(None), # type: ignore
|
600
621
|
x_language: Optional[list[str]] = Header(None), # type: ignore
|
601
622
|
x_md5: Optional[list[str]] = Header(None), # type: ignore
|
623
|
+
x_extract_strategy: Annotated[Optional[str], ExtractStrategyHeader] = None,
|
602
624
|
) -> ResourceFileUploaded:
|
603
625
|
rid = await get_rid_from_slug_or_raise_error(kbid, rslug)
|
604
626
|
return await _upload(
|
@@ -610,6 +632,7 @@ async def upload_rslug_prefix(
|
|
610
632
|
x_password=x_password,
|
611
633
|
x_language=x_language,
|
612
634
|
x_md5=x_md5,
|
635
|
+
x_extract_strategy=x_extract_strategy,
|
613
636
|
)
|
614
637
|
|
615
638
|
|
@@ -631,6 +654,7 @@ async def upload_rid_prefix(
|
|
631
654
|
x_password: Optional[list[str]] = Header(None), # type: ignore
|
632
655
|
x_language: Optional[list[str]] = Header(None), # type: ignore
|
633
656
|
x_md5: Optional[list[str]] = Header(None), # type: ignore
|
657
|
+
x_extract_strategy: Annotated[Optional[str], ExtractStrategyHeader] = None,
|
634
658
|
) -> ResourceFileUploaded:
|
635
659
|
return await _upload(
|
636
660
|
request,
|
@@ -641,6 +665,7 @@ async def upload_rid_prefix(
|
|
641
665
|
x_password=x_password,
|
642
666
|
x_language=x_language,
|
643
667
|
x_md5=x_md5,
|
668
|
+
x_extract_strategy=x_extract_strategy,
|
644
669
|
)
|
645
670
|
|
646
671
|
|
@@ -660,6 +685,7 @@ async def upload(
|
|
660
685
|
x_password: Optional[list[str]] = Header(None), # type: ignore
|
661
686
|
x_language: Optional[list[str]] = Header(None), # type: ignore
|
662
687
|
x_md5: Optional[list[str]] = Header(None), # type: ignore
|
688
|
+
x_extract_strategy: Annotated[Optional[str], ExtractStrategyHeader] = None,
|
663
689
|
) -> ResourceFileUploaded:
|
664
690
|
return await _upload(
|
665
691
|
request,
|
@@ -668,6 +694,7 @@ async def upload(
|
|
668
694
|
x_password=x_password,
|
669
695
|
x_language=x_language,
|
670
696
|
x_md5=x_md5,
|
697
|
+
x_extract_strategy=x_extract_strategy,
|
671
698
|
)
|
672
699
|
|
673
700
|
|
@@ -681,6 +708,7 @@ async def _upload(
|
|
681
708
|
x_password: Optional[list[str]] = Header(None), # type: ignore
|
682
709
|
x_language: Optional[list[str]] = Header(None), # type: ignore
|
683
710
|
x_md5: Optional[list[str]] = Header(None), # type: ignore
|
711
|
+
x_extract_strategy: Optional[str] = None,
|
684
712
|
) -> ResourceFileUploaded:
|
685
713
|
if path_rid is not None:
|
686
714
|
await validate_rid_exists_or_raise_error(kbid, path_rid)
|
@@ -688,9 +716,7 @@ async def _upload(
|
|
688
716
|
await maybe_back_pressure(request, kbid, resource_uuid=path_rid)
|
689
717
|
|
690
718
|
md5_user = x_md5[0] if x_md5 is not None and len(x_md5) > 0 else None
|
691
|
-
path, rid, valid_field = await validate_field_upload(
|
692
|
-
kbid, path_rid, field, md5_user
|
693
|
-
)
|
719
|
+
path, rid, valid_field = await validate_field_upload(kbid, path_rid, field, md5_user)
|
694
720
|
dm = get_dm()
|
695
721
|
storage_manager = get_storage_manager()
|
696
722
|
|
@@ -719,8 +745,14 @@ async def _upload(
|
|
719
745
|
# - content-type set by the user in the upload request header takes precedence.
|
720
746
|
# - if not set, we will try to guess it from the filename and default to a generic binary content type otherwise
|
721
747
|
content_type = request.headers.get("content-type")
|
722
|
-
if
|
723
|
-
content_type =
|
748
|
+
if content_type is None:
|
749
|
+
content_type = content_types.guess(filename) or "application/octet-stream"
|
750
|
+
|
751
|
+
if not content_types.valid(content_type):
|
752
|
+
raise HTTPException(
|
753
|
+
status_code=415,
|
754
|
+
detail=f"Unsupported content type: {content_type}",
|
755
|
+
)
|
724
756
|
|
725
757
|
metadata = {"content_type": content_type, "filename": filename}
|
726
758
|
|
@@ -770,6 +802,7 @@ async def _upload(
|
|
770
802
|
path=path,
|
771
803
|
request=request,
|
772
804
|
bucket=storage_manager.storage.get_bucket_name(kbid),
|
805
|
+
extract_strategy=x_extract_strategy,
|
773
806
|
)
|
774
807
|
except LimitsExceededError as exc:
|
775
808
|
raise HTTPException(status_code=exc.status_code, detail=exc.detail)
|
@@ -791,13 +824,9 @@ async def validate_field_upload(
|
|
791
824
|
if rid is None:
|
792
825
|
# we are going to create a new resource and a field
|
793
826
|
if md5 is not None:
|
794
|
-
exists = await datamanagers.atomic.resources.resource_exists(
|
795
|
-
kbid=kbid, rid=md5
|
796
|
-
)
|
827
|
+
exists = await datamanagers.atomic.resources.resource_exists(kbid=kbid, rid=md5)
|
797
828
|
if exists:
|
798
|
-
raise HTTPConflict(
|
799
|
-
"A resource with the same uploaded file already exists"
|
800
|
-
)
|
829
|
+
raise HTTPConflict("A resource with the same uploaded file already exists")
|
801
830
|
rid = md5
|
802
831
|
else:
|
803
832
|
rid = uuid.uuid4().hex
|
@@ -823,7 +852,7 @@ async def store_file_on_nuclia_db(
|
|
823
852
|
path: str,
|
824
853
|
request: Request,
|
825
854
|
bucket: str,
|
826
|
-
source: Source,
|
855
|
+
source: CloudFile.Source.ValueType,
|
827
856
|
rid: str,
|
828
857
|
field: str,
|
829
858
|
content_type: str = "application/octet-stream",
|
@@ -833,11 +862,10 @@ async def store_file_on_nuclia_db(
|
|
833
862
|
language: Optional[str] = None,
|
834
863
|
md5: Optional[str] = None,
|
835
864
|
item: Optional[CreateResourcePayload] = None,
|
865
|
+
extract_strategy: Optional[str] = None,
|
836
866
|
) -> Optional[int]:
|
837
867
|
# File is on NucliaDB Storage at path
|
838
|
-
|
839
868
|
partitioning = get_partitioning()
|
840
|
-
transaction = get_transaction_utility()
|
841
869
|
processing = get_processing()
|
842
870
|
storage = await get_storage(service_name=SERVICE_NAME)
|
843
871
|
|
@@ -859,14 +887,17 @@ async def store_file_on_nuclia_db(
|
|
859
887
|
|
860
888
|
parse_audit(writer.audit, request)
|
861
889
|
|
890
|
+
unique_slug_context_manager = noop_context_manager()
|
862
891
|
if item is not None:
|
863
892
|
if item.slug:
|
893
|
+
unique_slug_context_manager = ensure_slug_uniqueness(kbid, item.slug)
|
864
894
|
writer.slug = item.slug
|
865
895
|
toprocess.slug = item.slug
|
866
896
|
|
867
897
|
toprocess.processing_options = item.processing_options
|
868
898
|
|
869
|
-
|
899
|
+
kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
|
900
|
+
parse_basic_creation(writer, item, toprocess, kb_config)
|
870
901
|
if item.origin is not None:
|
871
902
|
parse_origin(writer.origin, item.origin)
|
872
903
|
if item.extra is not None:
|
@@ -882,62 +913,63 @@ async def store_file_on_nuclia_db(
|
|
882
913
|
uuid=rid,
|
883
914
|
x_skip_store=False,
|
884
915
|
)
|
885
|
-
|
886
|
-
|
887
|
-
|
888
|
-
|
889
|
-
|
890
|
-
|
891
|
-
|
892
|
-
|
893
|
-
|
894
|
-
|
895
|
-
|
896
|
-
|
897
|
-
|
898
|
-
|
899
|
-
|
900
|
-
|
901
|
-
|
902
|
-
|
903
|
-
|
904
|
-
|
905
|
-
file_field.file.
|
906
|
-
|
907
|
-
|
908
|
-
|
909
|
-
|
910
|
-
|
911
|
-
|
912
|
-
|
913
|
-
|
914
|
-
|
915
|
-
|
916
|
-
|
917
|
-
|
918
|
-
|
919
|
-
|
920
|
-
|
921
|
-
|
922
|
-
|
923
|
-
await
|
924
|
-
|
925
|
-
raise HTTPException(
|
926
|
-
status_code=501,
|
927
|
-
detail="Inconsistent write. This resource will not be processed and may not be stored.",
|
916
|
+
else:
|
917
|
+
# Use defaults for everything, but don't forget hidden which depends on KB config
|
918
|
+
kb_config = await datamanagers.atomic.kb.get_config(kbid=kbid)
|
919
|
+
if kb_config and kb_config.hidden_resources_hide_on_creation:
|
920
|
+
writer.basic.hidden = True
|
921
|
+
|
922
|
+
async with unique_slug_context_manager:
|
923
|
+
if override_resource_title and filename is not None:
|
924
|
+
set_title(writer, toprocess, filename)
|
925
|
+
|
926
|
+
writer.basic.icon = content_type
|
927
|
+
writer.basic.created.FromDatetime(datetime.now())
|
928
|
+
|
929
|
+
# Update resource with file
|
930
|
+
file_field = FieldFile()
|
931
|
+
file_field.added.FromDatetime(datetime.now())
|
932
|
+
file_field.file.bucket_name = bucket
|
933
|
+
file_field.file.content_type = content_type
|
934
|
+
if filename is not None:
|
935
|
+
file_field.file.filename = filename
|
936
|
+
file_field.file.uri = path
|
937
|
+
file_field.file.source = source
|
938
|
+
|
939
|
+
if md5:
|
940
|
+
file_field.file.md5 = md5
|
941
|
+
if size:
|
942
|
+
file_field.file.size = size
|
943
|
+
if language:
|
944
|
+
file_field.language = language
|
945
|
+
if password:
|
946
|
+
file_field.password = password
|
947
|
+
if extract_strategy is not None:
|
948
|
+
file_field.extract_strategy = extract_strategy
|
949
|
+
|
950
|
+
writer.files[field].CopyFrom(file_field)
|
951
|
+
# Do not store passwords on maindb
|
952
|
+
writer.files[field].ClearField("password")
|
953
|
+
|
954
|
+
toprocess.filefield[field] = await processing.convert_internal_filefield_to_str(
|
955
|
+
file_field, storage=storage
|
928
956
|
)
|
929
957
|
|
930
|
-
|
931
|
-
|
932
|
-
|
933
|
-
|
934
|
-
|
935
|
-
|
936
|
-
|
937
|
-
detail=
|
938
|
-
|
958
|
+
writer.source = BrokerMessage.MessageSource.WRITER
|
959
|
+
writer.basic.metadata.status = Metadata.Status.PENDING
|
960
|
+
writer.basic.metadata.useful = True
|
961
|
+
await transaction.commit(writer, partition)
|
962
|
+
try:
|
963
|
+
processing_info = await processing.send_to_process(toprocess, partition)
|
964
|
+
except LimitsExceededError as exc:
|
965
|
+
raise HTTPException(status_code=exc.status_code, detail=exc.detail)
|
966
|
+
except SendToProcessError:
|
967
|
+
raise HTTPException(
|
968
|
+
status_code=500,
|
969
|
+
detail="Error while sending to process. Try calling /reprocess",
|
970
|
+
)
|
939
971
|
|
940
|
-
|
972
|
+
return processing_info.seqid
|
941
973
|
|
942
974
|
|
943
975
|
def maybe_b64decode(some_string: str) -> str:
|
@@ -946,9 +978,3 @@ def maybe_b64decode(some_string: str) -> str:
|
|
946
978
|
except ValueError:
|
947
979
|
# not b64encoded
|
948
980
|
return some_string
|
949
|
-
|
950
|
-
|
951
|
-
def guess_content_type(filename: str) -> str:
|
952
|
-
default = "application/octet-stream"
|
953
|
-
guessed, _ = mimetypes.guess_type(filename)
|
954
|
-
return guessed or default
|
nucliadb/writer/app.py
CHANGED
@@ -18,61 +18,38 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
|
21
|
-
import
|
21
|
+
import importlib.metadata
|
22
22
|
|
23
|
-
import pkg_resources
|
24
23
|
from fastapi import FastAPI
|
25
24
|
from starlette.middleware import Middleware
|
26
25
|
from starlette.middleware.authentication import AuthenticationMiddleware
|
27
|
-
from starlette.middleware.cors import CORSMiddleware
|
28
26
|
from starlette.requests import ClientDisconnect
|
29
27
|
from starlette.responses import HTMLResponse
|
30
28
|
|
31
|
-
from nucliadb.common.context.fastapi import get_app_context, set_app_context
|
32
29
|
from nucliadb.writer import API_PREFIX
|
33
30
|
from nucliadb.writer.api.v1.router import api as api_v1
|
34
|
-
from nucliadb.writer.lifecycle import
|
31
|
+
from nucliadb.writer.lifecycle import lifespan
|
35
32
|
from nucliadb_telemetry import errors
|
36
33
|
from nucliadb_telemetry.fastapi.utils import (
|
37
34
|
client_disconnect_handler,
|
38
35
|
global_exception_handler,
|
39
36
|
)
|
40
|
-
from nucliadb_utils import const
|
41
37
|
from nucliadb_utils.authentication import NucliaCloudAuthenticationBackend
|
42
38
|
from nucliadb_utils.fastapi.openapi import extend_openapi
|
43
39
|
from nucliadb_utils.fastapi.versioning import VersionedFastAPI
|
44
|
-
from nucliadb_utils.settings import
|
45
|
-
from nucliadb_utils.utilities import has_feature
|
40
|
+
from nucliadb_utils.settings import running_settings
|
46
41
|
|
47
42
|
middleware = []
|
48
43
|
|
49
|
-
|
50
|
-
middleware.append(
|
51
|
-
Middleware(
|
52
|
-
CORSMiddleware,
|
53
|
-
allow_origins=http_settings.cors_origins,
|
54
|
-
allow_methods=["*"],
|
55
|
-
# Authorization will be exluded from * in the future, (CORS non-wildcard request-header).
|
56
|
-
# Browsers already showing deprecation notices, so it needs to be specified explicitly
|
57
|
-
allow_headers=["*", "Authorization"],
|
58
|
-
)
|
59
|
-
)
|
60
|
-
|
61
|
-
middleware.extend(
|
62
|
-
[Middleware(AuthenticationMiddleware, backend=NucliaCloudAuthenticationBackend())]
|
63
|
-
)
|
64
|
-
|
44
|
+
middleware.extend([Middleware(AuthenticationMiddleware, backend=NucliaCloudAuthenticationBackend())])
|
65
45
|
|
66
|
-
errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
|
67
46
|
|
68
|
-
|
69
|
-
on_shutdown = [finalize]
|
47
|
+
errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
|
70
48
|
|
71
49
|
fastapi_settings = dict(
|
72
50
|
debug=running_settings.debug,
|
73
51
|
middleware=middleware,
|
74
|
-
|
75
|
-
on_shutdown=on_shutdown,
|
52
|
+
lifespan=lifespan,
|
76
53
|
exception_handlers={
|
77
54
|
Exception: global_exception_handler,
|
78
55
|
ClientDisconnect: client_disconnect_handler,
|
@@ -102,18 +79,4 @@ def create_application() -> FastAPI:
|
|
102
79
|
# Use raw starlette routes to avoid unnecessary overhead
|
103
80
|
application.add_route("/", homepage)
|
104
81
|
|
105
|
-
set_app_context(application)
|
106
|
-
maybe_configure_back_pressure(application)
|
107
82
|
return application
|
108
|
-
|
109
|
-
|
110
|
-
def maybe_configure_back_pressure(application: FastAPI):
|
111
|
-
from nucliadb.writer.back_pressure import start_materializer, stop_materializer
|
112
|
-
from nucliadb.writer.settings import back_pressure_settings
|
113
|
-
from nucliadb_utils.settings import is_onprem_nucliadb
|
114
|
-
|
115
|
-
if back_pressure_settings.enabled and not is_onprem_nucliadb():
|
116
|
-
context = get_app_context(application)
|
117
|
-
start_materializer_with_context = functools.partial(start_materializer, context)
|
118
|
-
application.add_event_handler("startup", start_materializer_with_context)
|
119
|
-
application.add_event_handler("shutdown", stop_materializer)
|