nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/utils.py
CHANGED
@@ -18,7 +18,10 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
|
20
20
|
import urllib.parse
|
21
|
+
from typing import Sequence
|
21
22
|
|
23
|
+
from nucliadb.ingest.processing import PushPayload
|
24
|
+
from nucliadb_models.text import PushTextFormat, Text
|
22
25
|
from nucliadb_protos.resources_pb2 import (
|
23
26
|
ExtractedTextWrapper,
|
24
27
|
FieldComputedMetadataWrapper,
|
@@ -28,9 +31,6 @@ from nucliadb_protos.resources_pb2 import (
|
|
28
31
|
)
|
29
32
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
30
33
|
|
31
|
-
from nucliadb.ingest.processing import PushPayload
|
32
|
-
from nucliadb_models.text import PushTextFormat, Text
|
33
|
-
|
34
34
|
|
35
35
|
def set_title(writer: BrokerMessage, toprocess: PushPayload, title: str):
|
36
36
|
title = urllib.parse.unquote(title)
|
@@ -56,7 +56,7 @@ def compute_paragraph_key(rid: str, paragraph_key: str) -> str:
|
|
56
56
|
return paragraph_key.replace("N_RID", rid)
|
57
57
|
|
58
58
|
|
59
|
-
def choose_matryoshka_dimension(dimensions:
|
59
|
+
def choose_matryoshka_dimension(dimensions: Sequence[int]) -> int:
|
60
60
|
"""Given a list of matryoshka embedding available dimensions, choose one to
|
61
61
|
set the vector dimension.
|
62
62
|
"""
|
nucliadb/ingest/partitions.py
CHANGED
@@ -38,9 +38,7 @@ def assign_partitions(settings: Settings):
|
|
38
38
|
try:
|
39
39
|
settings.replica_number = int(sts_values[-1])
|
40
40
|
except Exception:
|
41
|
-
logger.error(
|
42
|
-
f"Could not extract replica number from hostname: {hostname}"
|
43
|
-
)
|
41
|
+
logger.error(f"Could not extract replica number from hostname: {hostname}")
|
44
42
|
pass
|
45
43
|
|
46
44
|
if settings.replica_number == -1:
|
@@ -53,9 +51,5 @@ def assign_partitions(settings: Settings):
|
|
53
51
|
# update settings AND Environment Varialbe (for this process and its childs) with partition list
|
54
52
|
settings.partitions = partitions_list
|
55
53
|
os.environ["PARTITIONS"] = json.dumps(partitions_list)
|
56
|
-
logger.info(
|
57
|
-
|
58
|
-
)
|
59
|
-
logger.info(
|
60
|
-
f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}"
|
61
|
-
)
|
54
|
+
logger.info(f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}")
|
55
|
+
logger.info(f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}")
|
nucliadb/ingest/processing.py
CHANGED
@@ -29,17 +29,22 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar
|
|
29
29
|
import aiohttp
|
30
30
|
import backoff
|
31
31
|
import jwt
|
32
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
33
|
-
from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
|
34
32
|
from pydantic import BaseModel, Field
|
35
33
|
|
36
34
|
import nucliadb_models as models
|
37
35
|
from nucliadb_models.resource import QueueType
|
36
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
37
|
+
from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
|
38
38
|
from nucliadb_telemetry import metrics
|
39
39
|
from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
|
40
|
-
from nucliadb_utils.settings import
|
40
|
+
from nucliadb_utils.settings import (
|
41
|
+
FileBackendConfig,
|
42
|
+
is_onprem_nucliadb,
|
43
|
+
nuclia_settings,
|
44
|
+
storage_settings,
|
45
|
+
)
|
41
46
|
from nucliadb_utils.storages.storage import Storage
|
42
|
-
from nucliadb_utils.utilities import Utility, set_utility
|
47
|
+
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
43
48
|
|
44
49
|
logger = logging.getLogger(__name__)
|
45
50
|
|
@@ -96,9 +101,6 @@ class PushPayload(BaseModel):
|
|
96
101
|
# Diff on Text Field
|
97
102
|
textfield: dict[str, models.Text] = {}
|
98
103
|
|
99
|
-
# Diff on a Layout Field
|
100
|
-
layoutfield: dict[str, models.LayoutDiff] = {}
|
101
|
-
|
102
104
|
# New conversations to process
|
103
105
|
conversationfield: dict[str, models.PushConversation] = {}
|
104
106
|
|
@@ -112,6 +114,10 @@ class PushPayload(BaseModel):
|
|
112
114
|
|
113
115
|
|
114
116
|
async def start_processing_engine():
|
117
|
+
processing_engine = get_utility(Utility.PROCESSING)
|
118
|
+
if processing_engine is not None:
|
119
|
+
return
|
120
|
+
|
115
121
|
if nuclia_settings.dummy_processing:
|
116
122
|
processing_engine = DummyProcessingEngine()
|
117
123
|
else:
|
@@ -129,22 +135,41 @@ async def start_processing_engine():
|
|
129
135
|
set_utility(Utility.PROCESSING, processing_engine)
|
130
136
|
|
131
137
|
|
132
|
-
def
|
138
|
+
async def stop_processing_engine():
|
139
|
+
utility = get_utility(Utility.PROCESSING)
|
140
|
+
if utility is not None:
|
141
|
+
await utility.finalize()
|
142
|
+
clean_utility(Utility.PROCESSING)
|
143
|
+
|
144
|
+
|
145
|
+
class ProcessingDriverType(Enum):
|
146
|
+
# XXX IMPORTANT XXX: Make sure the values are in sync with
|
147
|
+
# the ones defined in nuclia/learning/processing repository
|
148
|
+
GCS = 0
|
149
|
+
S3 = 1
|
150
|
+
LOCAL = 2
|
151
|
+
|
152
|
+
|
153
|
+
def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> ProcessingDriverType:
|
133
154
|
"""
|
134
155
|
Outputs a nuclia-internal backend driver identifier that is used by processing
|
135
156
|
to store the blobs of processed metadata in the right bucket folder.
|
136
157
|
"""
|
137
|
-
if
|
138
|
-
|
139
|
-
|
140
|
-
return
|
141
|
-
|
142
|
-
|
143
|
-
|
158
|
+
if is_onprem_nucliadb():
|
159
|
+
# On-prem installations are always regarded as local storage from the processing perspective,
|
160
|
+
# as Nuclia processing engine will not have direct access to the storage.
|
161
|
+
return ProcessingDriverType.LOCAL
|
162
|
+
|
163
|
+
try:
|
164
|
+
return {
|
165
|
+
FileBackendConfig.GCS: ProcessingDriverType.GCS,
|
166
|
+
FileBackendConfig.S3: ProcessingDriverType.S3,
|
167
|
+
}[file_backend_driver]
|
168
|
+
except KeyError:
|
144
169
|
logger.error(
|
145
170
|
f"Not a valid file backend driver to processing, fallback to local: {file_backend_driver}"
|
146
171
|
)
|
147
|
-
return
|
172
|
+
return ProcessingDriverType.LOCAL
|
148
173
|
|
149
174
|
|
150
175
|
class ProcessingEngine:
|
@@ -162,37 +187,25 @@ class ProcessingEngine:
|
|
162
187
|
self.nuclia_service_account = nuclia_service_account
|
163
188
|
self.nuclia_zone = nuclia_zone
|
164
189
|
if nuclia_public_url is not None:
|
165
|
-
self.nuclia_public_url: Optional[str] = nuclia_public_url.format(
|
166
|
-
zone=nuclia_zone
|
167
|
-
)
|
190
|
+
self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
|
168
191
|
else:
|
169
192
|
self.nuclia_public_url = None
|
170
193
|
|
171
194
|
self.onprem = onprem
|
172
195
|
if self.onprem:
|
173
|
-
self.nuclia_upload_url =
|
174
|
-
f"{self.nuclia_public_url}/api/v1/processing/upload"
|
175
|
-
)
|
196
|
+
self.nuclia_upload_url = f"{self.nuclia_public_url}/api/v1/processing/upload"
|
176
197
|
else:
|
177
|
-
self.nuclia_upload_url =
|
178
|
-
|
179
|
-
)
|
180
|
-
self.nuclia_internal_push = (
|
181
|
-
f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
|
182
|
-
)
|
198
|
+
self.nuclia_upload_url = f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
|
199
|
+
self.nuclia_internal_push = f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
|
183
200
|
self.nuclia_internal_delete = (
|
184
201
|
f"{nuclia_processing_cluster_url}/api/v1/internal/processing/requests"
|
185
202
|
)
|
186
|
-
self.nuclia_external_push_v2 =
|
187
|
-
|
188
|
-
)
|
189
|
-
self.nuclia_external_delete = (
|
190
|
-
f"{self.nuclia_public_url}/api/v1/processing/requests"
|
191
|
-
)
|
203
|
+
self.nuclia_external_push_v2 = f"{self.nuclia_public_url}/api/v1/processing/push"
|
204
|
+
self.nuclia_external_delete = f"{self.nuclia_public_url}/api/v1/processing/requests"
|
192
205
|
|
193
206
|
self.nuclia_jwt_key = nuclia_jwt_key
|
194
207
|
self.days_to_keep = days_to_keep
|
195
|
-
self.driver = to_processing_driver_type(driver)
|
208
|
+
self.driver: ProcessingDriverType = to_processing_driver_type(driver)
|
196
209
|
self._exit_stack = AsyncExitStack()
|
197
210
|
|
198
211
|
async def initialize(self):
|
@@ -215,7 +228,7 @@ class ProcessingEngine:
|
|
215
228
|
"iat": now,
|
216
229
|
"md5": cf.md5,
|
217
230
|
"source": 1, # To indicate that this files comes internally
|
218
|
-
"driver": self.driver,
|
231
|
+
"driver": self.driver.value,
|
219
232
|
"jti": uuid.uuid4().hex,
|
220
233
|
"bucket_name": cf.bucket_name,
|
221
234
|
"filename": cf.filename,
|
@@ -239,7 +252,7 @@ class ProcessingEngine:
|
|
239
252
|
"iat": now,
|
240
253
|
"md5": file.file.md5,
|
241
254
|
"source": 1, # To indicate that this files comes internally
|
242
|
-
"driver": self.driver,
|
255
|
+
"driver": self.driver.value,
|
243
256
|
"jti": uuid.uuid4().hex,
|
244
257
|
"bucket_name": file.file.bucket_name,
|
245
258
|
"filename": file.file.filename,
|
@@ -314,9 +327,7 @@ class ProcessingEngine:
|
|
314
327
|
max_tries=MAX_TRIES,
|
315
328
|
)
|
316
329
|
@processing_observer.wrap({"type": "file_field_upload_internal"})
|
317
|
-
async def convert_internal_filefield_to_str(
|
318
|
-
self, file: FieldFilePB, storage: Storage
|
319
|
-
) -> str:
|
330
|
+
async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
|
320
331
|
"""It's already an internal file that needs to be uploaded"""
|
321
332
|
if self.onprem is False:
|
322
333
|
# Upload the file to processing upload
|
@@ -325,9 +336,7 @@ class ProcessingEngine:
|
|
325
336
|
headers = {}
|
326
337
|
headers["X-PASSWORD"] = file.password
|
327
338
|
headers["X-LANGUAGE"] = file.language
|
328
|
-
headers["X-FILENAME"] = base64.b64encode(
|
329
|
-
file.file.filename.encode()
|
330
|
-
).decode()
|
339
|
+
headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()
|
331
340
|
headers["X-MD5"] = file.file.md5
|
332
341
|
headers["CONTENT-TYPE"] = file.file.content_type
|
333
342
|
if file.file.size:
|
@@ -335,9 +344,7 @@ class ProcessingEngine:
|
|
335
344
|
headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
|
336
345
|
|
337
346
|
iterator = storage.downloadbytescf_iterator(file.file)
|
338
|
-
async with self.session.post(
|
339
|
-
self.nuclia_upload_url, data=iterator, headers=headers
|
340
|
-
) as resp:
|
347
|
+
async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
|
341
348
|
if resp.status == 200:
|
342
349
|
jwttoken = await resp.text()
|
343
350
|
elif resp.status == 402:
|
@@ -371,9 +378,7 @@ class ProcessingEngine:
|
|
371
378
|
headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
|
372
379
|
|
373
380
|
iterator = storage.downloadbytescf_iterator(cf)
|
374
|
-
async with self.session.post(
|
375
|
-
self.nuclia_upload_url, data=iterator, headers=headers
|
376
|
-
) as resp:
|
381
|
+
async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
|
377
382
|
if resp.status == 200:
|
378
383
|
jwttoken = await resp.text()
|
379
384
|
elif resp.status == 402:
|
@@ -393,9 +398,7 @@ class ProcessingEngine:
|
|
393
398
|
jitter=backoff.random_jitter,
|
394
399
|
max_tries=MAX_TRIES,
|
395
400
|
)
|
396
|
-
async def send_to_process(
|
397
|
-
self, item: PushPayload, partition: int
|
398
|
-
) -> ProcessingInfo:
|
401
|
+
async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
|
399
402
|
op_type = "process_external" if self.onprem else "process_internal"
|
400
403
|
with processing_observer({"type": op_type}):
|
401
404
|
headers = {"CONTENT-TYPE": "application/json"}
|
@@ -403,15 +406,13 @@ class ProcessingEngine:
|
|
403
406
|
# Upload the payload
|
404
407
|
item.partition = partition
|
405
408
|
resp = await self.session.post(
|
406
|
-
url=self.nuclia_internal_push, data=item.
|
409
|
+
url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
|
407
410
|
)
|
408
411
|
else:
|
409
|
-
headers.update(
|
410
|
-
{"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"}
|
411
|
-
)
|
412
|
+
headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
|
412
413
|
# Upload the payload
|
413
414
|
resp = await self.session.post(
|
414
|
-
url=self.nuclia_external_push_v2, data=item.
|
415
|
+
url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
|
415
416
|
)
|
416
417
|
if resp.status == 200:
|
417
418
|
data = await resp.json()
|
@@ -441,9 +442,7 @@ class ProcessingEngine:
|
|
441
442
|
queue=QueueType(queue_type) if queue_type is not None else None,
|
442
443
|
)
|
443
444
|
|
444
|
-
async def delete_from_processing(
|
445
|
-
self, *, kbid: str, resource_id: Optional[str] = None
|
446
|
-
) -> None:
|
445
|
+
async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
|
447
446
|
"""
|
448
447
|
Delete a resource from processing. This prevents inflight resources from being processed
|
449
448
|
and wasting resources.
|
@@ -473,7 +472,7 @@ class ProcessingEngine:
|
|
473
472
|
|
474
473
|
class DummyProcessingEngine(ProcessingEngine):
|
475
474
|
def __init__(self):
|
476
|
-
self.calls: list[list[Any]] = []
|
475
|
+
self.calls: list[list[Any]] = []
|
477
476
|
self.values = defaultdict(list)
|
478
477
|
self.onprem = True
|
479
478
|
|
@@ -495,9 +494,7 @@ class DummyProcessingEngine(ProcessingEngine):
|
|
495
494
|
self.values["convert_external_filefield_to_str"].append(file_field)
|
496
495
|
return f"convert_external_filefield_to_str,{index}"
|
497
496
|
|
498
|
-
async def convert_internal_filefield_to_str(
|
499
|
-
self, file: FieldFilePB, storage: Storage
|
500
|
-
) -> str:
|
497
|
+
async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
|
501
498
|
self.calls.append([file, storage])
|
502
499
|
index = len(self.values["convert_internal_filefield_to_str"])
|
503
500
|
self.values["convert_internal_filefield_to_str"].append([file, storage])
|
@@ -509,16 +506,10 @@ class DummyProcessingEngine(ProcessingEngine):
|
|
509
506
|
self.values["convert_internal_cf_to_str"].append([cf, storage])
|
510
507
|
return f"convert_internal_cf_to_str,{index}"
|
511
508
|
|
512
|
-
async def send_to_process(
|
513
|
-
self, item: PushPayload, partition: int
|
514
|
-
) -> ProcessingInfo:
|
509
|
+
async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
|
515
510
|
self.calls.append([item, partition])
|
516
511
|
self.values["send_to_process"].append([item, partition])
|
517
|
-
return ProcessingInfo(
|
518
|
-
seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED
|
519
|
-
)
|
512
|
+
return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
|
520
513
|
|
521
|
-
async def delete_from_processing(
|
522
|
-
self, *, kbid: str, resource_id: Optional[str] = None
|
523
|
-
) -> None:
|
514
|
+
async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
|
524
515
|
self.calls.append([kbid, resource_id])
|
nucliadb/ingest/py.typed
ADDED
File without changes
|