nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/writer/tus/s3.py
CHANGED
@@ -19,21 +19,22 @@
|
|
19
19
|
#
|
20
20
|
from __future__ import annotations
|
21
21
|
|
22
|
+
import base64
|
22
23
|
import uuid
|
23
24
|
from contextlib import AsyncExitStack
|
24
|
-
from typing import
|
25
|
+
from typing import Optional
|
25
26
|
|
26
27
|
import aiobotocore # type: ignore
|
27
28
|
import aiohttp
|
28
|
-
import backoff
|
29
|
+
import backoff
|
29
30
|
import botocore # type: ignore
|
30
31
|
from aiobotocore.session import AioSession # type: ignore
|
31
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
32
32
|
|
33
33
|
from nucliadb.writer import logger
|
34
34
|
from nucliadb.writer.tus.dm import FileDataManager
|
35
|
-
from nucliadb.writer.tus.exceptions import
|
35
|
+
from nucliadb.writer.tus.exceptions import ResumableURINotAvailable
|
36
36
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
37
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
37
38
|
from nucliadb_utils.storages.s3 import (
|
38
39
|
CHUNK_SIZE,
|
39
40
|
MIN_UPLOAD_SIZE,
|
@@ -53,9 +54,7 @@ class S3FileStorageManager(FileStorageManager):
|
|
53
54
|
chunk_size = CHUNK_SIZE
|
54
55
|
min_upload_size = MIN_UPLOAD_SIZE
|
55
56
|
|
56
|
-
@backoff.on_exception(
|
57
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
58
|
-
)
|
57
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
59
58
|
async def _abort_multipart(self, dm: FileDataManager):
|
60
59
|
try:
|
61
60
|
mpu = dm.get("mpu")
|
@@ -72,21 +71,25 @@ class S3FileStorageManager(FileStorageManager):
|
|
72
71
|
if dm.get("mpu") is not None:
|
73
72
|
await self._abort_multipart(dm)
|
74
73
|
|
74
|
+
custom_metadata: dict[str, str] = {
|
75
|
+
"base64_filename": base64.b64encode((dm.filename or "").encode()).decode(),
|
76
|
+
"content_type": dm.content_type or "",
|
77
|
+
"size": str(dm.size),
|
78
|
+
}
|
79
|
+
|
75
80
|
await dm.update(
|
76
81
|
path=path,
|
77
82
|
upload_file_id=upload_file_id,
|
78
83
|
multipart={"Parts": []},
|
79
84
|
block=1,
|
80
|
-
mpu=await self._create_multipart(path, bucket),
|
85
|
+
mpu=await self._create_multipart(path, bucket, custom_metadata),
|
81
86
|
bucket=bucket,
|
82
87
|
)
|
83
88
|
|
84
|
-
@backoff.on_exception(
|
85
|
-
|
86
|
-
)
|
87
|
-
async def _create_multipart(self, path, bucket):
|
89
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
90
|
+
async def _create_multipart(self, path, bucket, custom_metadata: dict[str, str]):
|
88
91
|
return await self.storage._s3aioclient.create_multipart_upload(
|
89
|
-
Bucket=bucket, Key=path
|
92
|
+
Bucket=bucket, Key=path, Metadata=custom_metadata
|
90
93
|
)
|
91
94
|
|
92
95
|
async def append(self, dm: FileDataManager, iterable, offset) -> int:
|
@@ -96,16 +99,12 @@ class S3FileStorageManager(FileStorageManager):
|
|
96
99
|
size += len(chunk)
|
97
100
|
part = await self._upload_part(dm, chunk)
|
98
101
|
multipart = dm.get("multipart")
|
99
|
-
multipart["Parts"].append(
|
100
|
-
{"PartNumber": dm.get("block"), "ETag": part["ETag"]}
|
101
|
-
)
|
102
|
+
multipart["Parts"].append({"PartNumber": dm.get("block"), "ETag": part["ETag"]})
|
102
103
|
await dm.update(multipart=multipart, block=dm.get("block") + 1)
|
103
104
|
|
104
105
|
return size
|
105
106
|
|
106
|
-
@backoff.on_exception(
|
107
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
108
|
-
)
|
107
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
109
108
|
async def _upload_part(self, dm: FileDataManager, data):
|
110
109
|
mpu = dm.get("mpu")
|
111
110
|
if mpu is None:
|
@@ -128,18 +127,14 @@ class S3FileStorageManager(FileStorageManager):
|
|
128
127
|
await dm.finish()
|
129
128
|
return path
|
130
129
|
|
131
|
-
@backoff.on_exception(
|
132
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
133
|
-
)
|
130
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
134
131
|
async def _complete_multipart_upload(self, dm: FileDataManager):
|
135
132
|
# if blocks is 0, it means the file is of zero length so we need to
|
136
133
|
# trick it to finish a multiple part with no data.
|
137
134
|
if dm.get("block") == 1:
|
138
135
|
part = await self._upload_part(dm, b"")
|
139
136
|
multipart = dm.get("multipart")
|
140
|
-
multipart["Parts"].append(
|
141
|
-
{"PartNumber": dm.get("block"), "ETag": part["ETag"]}
|
142
|
-
)
|
137
|
+
multipart["Parts"].append({"PartNumber": dm.get("block"), "ETag": part["ETag"]})
|
143
138
|
await dm.update(multipart=multipart, block=dm.get("block") + 1)
|
144
139
|
await self.storage._s3aioclient.complete_multipart_upload(
|
145
140
|
Bucket=dm.get("bucket"),
|
@@ -148,45 +143,10 @@ class S3FileStorageManager(FileStorageManager):
|
|
148
143
|
MultipartUpload=dm.get("multipart"),
|
149
144
|
)
|
150
145
|
|
151
|
-
@backoff.on_exception(
|
152
|
-
backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
|
153
|
-
)
|
146
|
+
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
|
154
147
|
async def _download(self, uri: str, kbid: str, **kwargs):
|
155
148
|
bucket = self.storage.get_bucket_name(kbid)
|
156
|
-
return await self.storage._s3aioclient.get_object(
|
157
|
-
Bucket=bucket, Key=uri, **kwargs
|
158
|
-
)
|
159
|
-
|
160
|
-
async def iter_data(
|
161
|
-
self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
|
162
|
-
):
|
163
|
-
if headers is None:
|
164
|
-
headers = {}
|
165
|
-
try:
|
166
|
-
downloader = await self._download(uri, kbid, **headers)
|
167
|
-
except self.storage._s3aioclient.exceptions.NoSuchKey:
|
168
|
-
raise CloudFileNotFound()
|
169
|
-
|
170
|
-
# we do not want to timeout ever from this...
|
171
|
-
# downloader['Body'].set_socket_timeout(999999)
|
172
|
-
stream = downloader["Body"]
|
173
|
-
data = await stream.read(CHUNK_SIZE)
|
174
|
-
while True:
|
175
|
-
if not data:
|
176
|
-
break
|
177
|
-
yield data
|
178
|
-
data = await stream.read(CHUNK_SIZE)
|
179
|
-
|
180
|
-
async def read_range(
|
181
|
-
self, uri, kbid: str, start: int, end: int
|
182
|
-
) -> AsyncIterator[bytes]:
|
183
|
-
"""
|
184
|
-
Iterate through ranges of data
|
185
|
-
"""
|
186
|
-
async for chunk in self.iter_data(
|
187
|
-
uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
|
188
|
-
):
|
189
|
-
yield chunk
|
149
|
+
return await self.storage._s3aioclient.get_object(Bucket=bucket, Key=uri, **kwargs)
|
190
150
|
|
191
151
|
async def delete_upload(self, uri: str, kbid: str):
|
192
152
|
bucket = self.storage.get_bucket_name(kbid)
|
@@ -198,6 +158,10 @@ class S3FileStorageManager(FileStorageManager):
|
|
198
158
|
else:
|
199
159
|
raise AttributeError("No valid uri")
|
200
160
|
|
161
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
162
|
+
if uploaded_bytes % self.min_upload_size != 0:
|
163
|
+
raise ValueError(f"Intermediate chunks need to be multiples of {self.min_upload_size} bytes")
|
164
|
+
|
201
165
|
|
202
166
|
class S3BlobStore(BlobStore):
|
203
167
|
async def check_exists(self, bucket_name: str) -> bool:
|
@@ -213,9 +177,7 @@ class S3BlobStore(BlobStore):
|
|
213
177
|
async def create_bucket(self, bucket):
|
214
178
|
exists = await self.check_exists(bucket)
|
215
179
|
if not exists:
|
216
|
-
await create_bucket(
|
217
|
-
self._s3aioclient, bucket, self.bucket_tags, self.region_name
|
218
|
-
)
|
180
|
+
await create_bucket(self._s3aioclient, bucket, self.bucket_tags, self.region_name)
|
219
181
|
return exists
|
220
182
|
|
221
183
|
async def finalize(self):
|
@@ -247,9 +209,7 @@ class S3BlobStore(BlobStore):
|
|
247
209
|
verify=verify_ssl,
|
248
210
|
use_ssl=ssl,
|
249
211
|
region_name=region_name,
|
250
|
-
config=aiobotocore.config.AioConfig(
|
251
|
-
None, max_pool_connections=max_pool_connections
|
252
|
-
),
|
212
|
+
config=aiobotocore.config.AioConfig(None, max_pool_connections=max_pool_connections),
|
253
213
|
)
|
254
214
|
session = AioSession()
|
255
215
|
self._s3aioclient = await self._exit_stack.enter_async_context(
|
nucliadb/writer/tus/storage.py
CHANGED
@@ -21,15 +21,8 @@ from __future__ import annotations
|
|
21
21
|
|
22
22
|
from typing import AsyncIterator, Optional
|
23
23
|
|
24
|
-
from lru import LRU # type: ignore
|
25
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
26
|
-
from starlette.responses import StreamingResponse
|
27
|
-
|
28
|
-
from nucliadb.writer import logger
|
29
24
|
from nucliadb.writer.tus.dm import FileDataManager
|
30
|
-
from
|
31
|
-
|
32
|
-
CACHED_BUCKETS = LRU(50) # type: ignore
|
25
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
33
26
|
|
34
27
|
|
35
28
|
class BlobStore:
|
@@ -56,14 +49,9 @@ class FileStorageManager:
|
|
56
49
|
chunk_size: int
|
57
50
|
min_upload_size: Optional[int] = None
|
58
51
|
|
59
|
-
def __init__(self, storage):
|
52
|
+
def __init__(self, storage: BlobStore):
|
60
53
|
self.storage = storage
|
61
54
|
|
62
|
-
def read_range(
|
63
|
-
self, uri: str, kbid: str, start: int, end: int
|
64
|
-
) -> AsyncIterator[bytes]:
|
65
|
-
raise NotImplementedError()
|
66
|
-
|
67
55
|
def iter_data(
|
68
56
|
self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
|
69
57
|
) -> AsyncIterator[bytes]:
|
@@ -81,48 +69,6 @@ class FileStorageManager:
|
|
81
69
|
async def delete_upload(self, uri, kbid):
|
82
70
|
raise NotImplementedError()
|
83
71
|
|
84
|
-
async def full_download(self, content_length, content_type, upload_id):
|
85
|
-
return StreamingResponse(
|
86
|
-
self.iter_data(upload_id),
|
87
|
-
media_type=content_type,
|
88
|
-
headers={
|
89
|
-
"Content-Length": str(content_length),
|
90
|
-
"Content-Type": content_type,
|
91
|
-
},
|
92
|
-
)
|
93
|
-
|
94
|
-
async def range_download(
|
95
|
-
self, content_length, content_type, upload_id, range_header
|
96
|
-
):
|
97
|
-
try:
|
98
|
-
start, _, end = range_header.split("bytes=")[-1].partition("-")
|
99
|
-
start = int(start)
|
100
|
-
if len(end) == 0:
|
101
|
-
# bytes=0- is valid
|
102
|
-
end = content_length - 1
|
103
|
-
end = int(end) + 1 # python is inclusive, http is exclusive
|
104
|
-
except (IndexError, ValueError):
|
105
|
-
# range errors fallback to full download
|
106
|
-
raise HTTPRangeNotSatisfiable(detail=f"Range not parsable {range_header}")
|
107
|
-
if start > end or start < 0:
|
108
|
-
raise HTTPRangeNotSatisfiable(detail="Invalid range {start}-{end}")
|
109
|
-
if end > content_length:
|
110
|
-
raise HTTPRangeNotSatisfiable(
|
111
|
-
detail="Invalid range {start}-{end}, too large end value"
|
112
|
-
)
|
113
|
-
|
114
|
-
logger.debug(f"Range request: {range_header}")
|
115
|
-
headers = {
|
116
|
-
"Content-Range": f"bytes {start}-{end - 1}/{content_length}",
|
117
|
-
"Content-Type": content_type,
|
118
|
-
}
|
119
|
-
|
120
|
-
return StreamingResponse(
|
121
|
-
self.read_range(upload_id, start, end),
|
122
|
-
media_type=content_type,
|
123
|
-
headers=headers,
|
124
|
-
)
|
125
|
-
|
126
72
|
async def iterate_body_chunks(self, request, chunk_size):
|
127
73
|
partial = b""
|
128
74
|
remaining = b""
|
@@ -146,3 +92,6 @@ class FileStorageManager:
|
|
146
92
|
|
147
93
|
if partial or remaining:
|
148
94
|
yield partial + remaining
|
95
|
+
|
96
|
+
def validate_intermediate_chunk(self, uploaded_bytes: int):
    """
    Validate an intermediate chunk given the number of uploaded bytes.

    This implementation only raises NotImplementedError; presumably concrete
    storage managers override it (confirm against the subclasses).
    """
    raise NotImplementedError()
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
21
|
+
#
|
22
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
23
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
24
|
+
#
|
25
|
+
# AGPL:
|
26
|
+
# This program is free software: you can redistribute it and/or modify
|
27
|
+
# it under the terms of the GNU Affero General Public License as
|
28
|
+
# published by the Free Software Foundation, either version 3 of the
|
29
|
+
# License, or (at your option) any later version.
|
30
|
+
#
|
31
|
+
# This program is distributed in the hope that it will be useful,
|
32
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
33
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
34
|
+
# GNU Affero General Public License for more details.
|
35
|
+
#
|
36
|
+
# You should have received a copy of the GNU Affero General Public License
|
37
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
38
|
+
#
|
39
|
+
|
40
|
+
from nucliadb import learning_proxy
|
41
|
+
from nucliadb.common import datamanagers
|
42
|
+
from nucliadb.ingest.orm.exceptions import VectorSetConflict
|
43
|
+
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
44
|
+
from nucliadb.writer import logger
|
45
|
+
from nucliadb_protos import knowledgebox_pb2
|
46
|
+
from nucliadb_telemetry import errors
|
47
|
+
from nucliadb_utils.utilities import get_storage
|
48
|
+
|
49
|
+
|
50
|
+
async def add(kbid: str, vectorset_id: str) -> None:
    """
    Add a vectorset to a knowledge box.

    The vectorset id is first appended to the "semantic_models" list of the
    learning configuration (unless it is already listed), and then the
    vectorset is created in the index. If the index reports a
    VectorSetConflict the function returns silently.
    """
    # Step 1: make sure the learning configuration lists the vectorset
    learning_config = await learning_proxy.get_configuration(kbid)
    assert learning_config is not None
    models = learning_config.model_dump()["semantic_models"]
    if vectorset_id not in models:
        models.append(vectorset_id)
        await learning_proxy.update_configuration(kbid, {"semantic_models": models})
        # Fetch the configuration again after the update; it is the one
        # passed to get_vectorset_config below.
        learning_config = await learning_proxy.get_configuration(kbid)
        assert learning_config is not None

    # Step 2: create the vectorset in the index
    async with datamanagers.with_rw_transaction() as txn:
        kb = KnowledgeBox(txn, await get_storage(), kbid)
        config = get_vectorset_config(learning_config, vectorset_id)
        try:
            await kb.create_vectorset(config)
            await txn.commit()
        except VectorSetConflict:
            # The vectorset is already there: nothing left to do
            pass
|
71
|
+
|
72
|
+
|
73
|
+
async def delete(kbid: str, vectorset_id: str) -> None:
    """
    Remove a vectorset from a knowledge box.

    When a learning configuration exists and lists the vectorset in
    "semantic_models", it is removed from that list. The vectorset is then
    deleted from the index; any exception raised while doing so is captured
    and logged instead of being propagated.
    """
    learning_config = await learning_proxy.get_configuration(kbid)
    if learning_config is not None:
        models = learning_config.model_dump()["semantic_models"]
        if vectorset_id in models:
            models.remove(vectorset_id)
            await learning_proxy.update_configuration(kbid, {"semantic_models": models})

    try:
        async with datamanagers.with_rw_transaction() as txn:
            kb = KnowledgeBox(txn, await get_storage(), kbid)
            await kb.delete_vectorset(vectorset_id=vectorset_id)
            await txn.commit()
    except Exception as exc:
        # Best effort: report the failure but do not raise to the caller
        errors.capture_exception(exc)
        log_extra = {"kbid": kbid, "vectorset_id": vectorset_id}
        logger.exception("Could not delete vectorset from index", extra=log_extra)
|
90
|
+
|
91
|
+
|
92
|
+
def get_vectorset_config(
    learning_config: learning_proxy.LearningConfiguration, vectorset_id: str
) -> knowledgebox_pb2.VectorSetConfig:
    """
    Build the VectorSetConfig of `vectorset_id` out of a LearningConfiguration.

    Similarity, vector dimension and matryoshka dimensions are read from the
    semantic model config stored under `vectorset_id`. The vector type is
    always DENSE_F32, and vectors are normalized only when the model declares
    matryoshka dimensions.

    Raises ValueError when the parsed similarity is neither COSINE nor DOT.
    """
    model = learning_config.semantic_model_configs[vectorset_id]

    index_config = knowledgebox_pb2.VectorIndexConfig(
        vector_type=knowledgebox_pb2.VectorType.DENSE_F32,
    )

    # Similarity function
    similarity = learning_proxy.SimilarityFunction(model.similarity)
    if similarity == learning_proxy.SimilarityFunction.COSINE.value:
        index_config.similarity = knowledgebox_pb2.VectorSimilarity.COSINE
    elif similarity == learning_proxy.SimilarityFunction.DOT.value:
        index_config.similarity = knowledgebox_pb2.VectorSimilarity.DOT
    else:
        raise ValueError(
            f"Unknown similarity function {model.similarity}, parsed as {similarity}"
        )

    # Vector dimension
    index_config.vector_dimension = model.size

    # Matryoshka dimensions: normalization is enabled only when there are any
    result = knowledgebox_pb2.VectorSetConfig(vectorset_id=vectorset_id)
    has_matryoshka = len(model.matryoshka_dims) > 0
    index_config.normalize_vectors = has_matryoshka
    if has_matryoshka:
        result.matryoshka_dimensions.extend(model.matryoshka_dims)

    result.vectorset_index_config.CopyFrom(index_config)
    return result
|
@@ -0,0 +1,148 @@
|
|
1
|
+
Metadata-Version: 2.2
|
2
|
+
Name: nucliadb
|
3
|
+
Version: 6.2.1.post2777
|
4
|
+
Home-page: https://docs.nuclia.dev/docs/management/nucliadb/intro
|
5
|
+
Author: NucliaDB Community
|
6
|
+
Author-email: nucliadb@nuclia.com
|
7
|
+
License: BSD
|
8
|
+
Project-URL: Nuclia, https://nuclia.com
|
9
|
+
Project-URL: Github, https://github.com/nuclia/nucliadb
|
10
|
+
Project-URL: Slack, https://nuclia-community.slack.com
|
11
|
+
Project-URL: API Reference, https://docs.nuclia.dev/docs/api
|
12
|
+
Keywords: search,semantic,AI
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
14
|
+
Classifier: Intended Audience :: Developers
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
16
|
+
Classifier: License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)
|
17
|
+
Classifier: Programming Language :: Python
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
22
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
23
|
+
Requires-Python: >=3.9, <4
|
24
|
+
Description-Content-Type: text/markdown
|
25
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.2.1.post2777
|
26
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.2.1.post2777
|
27
|
+
Requires-Dist: nucliadb-protos>=6.2.1.post2777
|
28
|
+
Requires-Dist: nucliadb-models>=6.2.1.post2777
|
29
|
+
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
30
|
+
Requires-Dist: nucliadb-node-binding>=2.26.0
|
31
|
+
Requires-Dist: nuclia-models>=0.24.2
|
32
|
+
Requires-Dist: uvicorn
|
33
|
+
Requires-Dist: argdantic
|
34
|
+
Requires-Dist: aiohttp>=3.11.11
|
35
|
+
Requires-Dist: lru-dict>=1.1.7
|
36
|
+
Requires-Dist: backoff
|
37
|
+
Requires-Dist: aiofiles>=0.8.0
|
38
|
+
Requires-Dist: psutil>=5.9.7
|
39
|
+
Requires-Dist: types-psutil>=5.9.5.17
|
40
|
+
Requires-Dist: types-aiofiles>=0.8.3
|
41
|
+
Requires-Dist: protobuf>=4.22.3
|
42
|
+
Requires-Dist: types-protobuf<5,>=4.24
|
43
|
+
Requires-Dist: grpcio<1.63.0,>=1.44.0
|
44
|
+
Requires-Dist: grpcio-health-checking<1.63.0,>=1.44.0
|
45
|
+
Requires-Dist: grpcio-channelz<1.63.0,>=1.44.0
|
46
|
+
Requires-Dist: grpcio-status<1.63.0,>=1.44.0
|
47
|
+
Requires-Dist: grpcio-tools<1.63.0,>=1.44.0
|
48
|
+
Requires-Dist: grpcio-testing<1.63.0,>=1.44.0
|
49
|
+
Requires-Dist: grpcio-reflection<1.63.0,>=1.44.0
|
50
|
+
Requires-Dist: orjson>=3.6.7
|
51
|
+
Requires-Dist: types-setuptools
|
52
|
+
Requires-Dist: pydantic>=2.6
|
53
|
+
Requires-Dist: pydantic-settings>=2.2
|
54
|
+
Requires-Dist: aiobotocore>=2.9.0
|
55
|
+
Requires-Dist: botocore>=1.34.0
|
56
|
+
Requires-Dist: google-cloud-storage
|
57
|
+
Requires-Dist: gcloud
|
58
|
+
Requires-Dist: oauth2client
|
59
|
+
Requires-Dist: jwcrypto>=1.5.6
|
60
|
+
Requires-Dist: pyyaml>=5.1
|
61
|
+
Requires-Dist: fastapi-versioning>=0.10.0
|
62
|
+
Requires-Dist: fastapi>=0.95.2
|
63
|
+
Requires-Dist: sentry-sdk>=2.8.0
|
64
|
+
Requires-Dist: pyjwt>=2.4.0
|
65
|
+
Requires-Dist: mmh3>=3.0.0
|
66
|
+
Requires-Dist: httpx>=0.23.0
|
67
|
+
Requires-Dist: grpc-stubs>=1.44.0
|
68
|
+
Requires-Dist: aiodns>=3.0.0
|
69
|
+
Requires-Dist: types-orjson
|
70
|
+
Requires-Dist: psycopg[binary,pool]
|
71
|
+
Requires-Dist: multidict>=6.0.4
|
72
|
+
Requires-Dist: deprecated>=1.2.12
|
73
|
+
Requires-Dist: asgiref>=3.3.2
|
74
|
+
Requires-Dist: jmespath>=1.0.0
|
75
|
+
Requires-Dist: idna>=3.3
|
76
|
+
Requires-Dist: sniffio>=1.2.0
|
77
|
+
Requires-Dist: async_lru>=2.0.4
|
78
|
+
Requires-Dist: async-timeout>=4.0.3
|
79
|
+
Requires-Dist: cachetools>=5.3.2
|
80
|
+
Requires-Dist: types-cachetools>=5.3.0.5
|
81
|
+
Requires-Dist: kubernetes_asyncio<30.0.0
|
82
|
+
Provides-Extra: redis
|
83
|
+
Requires-Dist: redis>=4.3.4; extra == "redis"
|
84
|
+
Dynamic: author
|
85
|
+
Dynamic: author-email
|
86
|
+
Dynamic: classifier
|
87
|
+
Dynamic: description
|
88
|
+
Dynamic: description-content-type
|
89
|
+
Dynamic: home-page
|
90
|
+
Dynamic: keywords
|
91
|
+
Dynamic: license
|
92
|
+
Dynamic: project-url
|
93
|
+
Dynamic: provides-extra
|
94
|
+
Dynamic: requires-dist
|
95
|
+
Dynamic: requires-python
|
96
|
+
|
97
|
+
# nucliadb
|
98
|
+
|
99
|
+
This module contains most of the Python components for NucliaDB:
|
100
|
+
|
101
|
+
- ingest
|
102
|
+
- reader
|
103
|
+
- writer
|
104
|
+
- search
|
105
|
+
- train
|
106
|
+
|
107
|
+
# NucliaDB Migrations
|
108
|
+
|
109
|
+
This module is used to manage NucliaDB Migrations.
|
110
|
+
|
111
|
+
All migrations will be provided in the `migrations` folder and have a filename
|
112
|
+
that follows the structure: `[sequence]_[migration name].py`.
|
113
|
+
Where `sequence` is the order the migration should be run in with zero padding.
|
114
|
+
Example: `0001_migrate_data.py`.
|
115
|
+
|
116
|
+
Each migration should have the following:
|
117
|
+
|
118
|
+
```python
|
119
|
+
from nucliadb.migrator.context import ExecutionContext
|
120
|
+
|
121
|
+
|
122
|
+
async def migrate(context: ExecutionContext) -> None:
|
123
|
+
"""
|
124
|
+
Non-kb type of migration. Migrate global data.
|
125
|
+
"""
|
126
|
+
|
127
|
+
|
128
|
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
|
129
|
+
"""
|
130
|
+
Migrate kb.
|
131
|
+
|
132
|
+
Must have both types of migrations.
|
133
|
+
"""
|
134
|
+
```
|
135
|
+
|
136
|
+
|
137
|
+
## How migrations are managed
|
138
|
+
|
139
|
+
- All migrations utilize a distributed lock to prevent simultaneously running jobs
|
140
|
+
- Global migration state:
|
141
|
+
- current version
|
142
|
+
- target version
|
143
|
+
- KBs to migrate
|
144
|
+
- KB Migration State:
|
145
|
+
- current version
|
146
|
+
|
147
|
+
- Migrations are currently run with a deployment and will be continuously retried on failure.
|
148
|
+
- Migrations are run in a deployment to make sure that a migration does not prevent code deployment.
|