nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/utils.py
CHANGED
@@ -18,7 +18,10 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
|
20
20
|
import urllib.parse
|
21
|
+
from typing import Sequence
|
21
22
|
|
23
|
+
from nucliadb.ingest.processing import PushPayload
|
24
|
+
from nucliadb_models.text import PushTextFormat, Text
|
22
25
|
from nucliadb_protos.resources_pb2 import (
|
23
26
|
ExtractedTextWrapper,
|
24
27
|
FieldComputedMetadataWrapper,
|
@@ -28,9 +31,6 @@ from nucliadb_protos.resources_pb2 import (
|
|
28
31
|
)
|
29
32
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
30
33
|
|
31
|
-
from nucliadb.ingest.processing import PushPayload
|
32
|
-
from nucliadb_models.text import PushTextFormat, Text
|
33
|
-
|
34
34
|
|
35
35
|
def set_title(writer: BrokerMessage, toprocess: PushPayload, title: str):
|
36
36
|
title = urllib.parse.unquote(title)
|
@@ -56,7 +56,7 @@ def compute_paragraph_key(rid: str, paragraph_key: str) -> str:
|
|
56
56
|
return paragraph_key.replace("N_RID", rid)
|
57
57
|
|
58
58
|
|
59
|
-
def choose_matryoshka_dimension(dimensions:
|
59
|
+
def choose_matryoshka_dimension(dimensions: Sequence[int]) -> int:
|
60
60
|
"""Given a list of matryoshka embedding available dimensions, choose one to
|
61
61
|
set the vector dimension.
|
62
62
|
"""
|
nucliadb/ingest/partitions.py
CHANGED
@@ -38,9 +38,7 @@ def assign_partitions(settings: Settings):
|
|
38
38
|
try:
|
39
39
|
settings.replica_number = int(sts_values[-1])
|
40
40
|
except Exception:
|
41
|
-
logger.error(
|
42
|
-
f"Could not extract replica number from hostname: {hostname}"
|
43
|
-
)
|
41
|
+
logger.error(f"Could not extract replica number from hostname: {hostname}")
|
44
42
|
pass
|
45
43
|
|
46
44
|
if settings.replica_number == -1:
|
@@ -53,9 +51,5 @@ def assign_partitions(settings: Settings):
|
|
53
51
|
# update settings AND Environment Varialbe (for this process and its childs) with partition list
|
54
52
|
settings.partitions = partitions_list
|
55
53
|
os.environ["PARTITIONS"] = json.dumps(partitions_list)
|
56
|
-
logger.info(
|
57
|
-
|
58
|
-
)
|
59
|
-
logger.info(
|
60
|
-
f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}"
|
61
|
-
)
|
54
|
+
logger.info(f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}")
|
55
|
+
logger.info(f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}")
|
nucliadb/ingest/processing.py
CHANGED
@@ -29,17 +29,22 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar
|
|
29
29
|
import aiohttp
|
30
30
|
import backoff
|
31
31
|
import jwt
|
32
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
33
|
-
from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
|
34
32
|
from pydantic import BaseModel, Field
|
35
33
|
|
36
34
|
import nucliadb_models as models
|
37
35
|
from nucliadb_models.resource import QueueType
|
36
|
+
from nucliadb_protos.resources_pb2 import CloudFile
|
37
|
+
from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
|
38
38
|
from nucliadb_telemetry import metrics
|
39
39
|
from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
|
40
|
-
from nucliadb_utils.settings import
|
40
|
+
from nucliadb_utils.settings import (
|
41
|
+
FileBackendConfig,
|
42
|
+
is_onprem_nucliadb,
|
43
|
+
nuclia_settings,
|
44
|
+
storage_settings,
|
45
|
+
)
|
41
46
|
from nucliadb_utils.storages.storage import Storage
|
42
|
-
from nucliadb_utils.utilities import Utility, set_utility
|
47
|
+
from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
|
43
48
|
|
44
49
|
logger = logging.getLogger(__name__)
|
45
50
|
|
@@ -96,9 +101,6 @@ class PushPayload(BaseModel):
|
|
96
101
|
# Diff on Text Field
|
97
102
|
textfield: dict[str, models.Text] = {}
|
98
103
|
|
99
|
-
# Diff on a Layout Field
|
100
|
-
layoutfield: dict[str, models.LayoutDiff] = {}
|
101
|
-
|
102
104
|
# New conversations to process
|
103
105
|
conversationfield: dict[str, models.PushConversation] = {}
|
104
106
|
|
@@ -112,6 +114,10 @@ class PushPayload(BaseModel):
|
|
112
114
|
|
113
115
|
|
114
116
|
async def start_processing_engine():
|
117
|
+
processing_engine = get_utility(Utility.PROCESSING)
|
118
|
+
if processing_engine is not None:
|
119
|
+
return
|
120
|
+
|
115
121
|
if nuclia_settings.dummy_processing:
|
116
122
|
processing_engine = DummyProcessingEngine()
|
117
123
|
else:
|
@@ -129,22 +135,41 @@ async def start_processing_engine():
|
|
129
135
|
set_utility(Utility.PROCESSING, processing_engine)
|
130
136
|
|
131
137
|
|
132
|
-
def
|
138
|
+
async def stop_processing_engine():
|
139
|
+
utility = get_utility(Utility.PROCESSING)
|
140
|
+
if utility is not None:
|
141
|
+
await utility.finalize()
|
142
|
+
clean_utility(Utility.PROCESSING)
|
143
|
+
|
144
|
+
|
145
|
+
class ProcessingDriverType(Enum):
|
146
|
+
# XXX IMPORTANT XXX: Make sure the values are in sync with
|
147
|
+
# the ones defined in nuclia/learning/processing repository
|
148
|
+
GCS = 0
|
149
|
+
S3 = 1
|
150
|
+
LOCAL = 2
|
151
|
+
|
152
|
+
|
153
|
+
def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> ProcessingDriverType:
|
133
154
|
"""
|
134
155
|
Outputs a nuclia-internal backend driver identifier that is used by processing
|
135
156
|
to store the blobs of processed metadata in the right bucket folder.
|
136
157
|
"""
|
137
|
-
if
|
138
|
-
|
139
|
-
|
140
|
-
return
|
141
|
-
|
142
|
-
|
143
|
-
|
158
|
+
if is_onprem_nucliadb():
|
159
|
+
# On-prem installations are always regarded as local storage from the processing perspective,
|
160
|
+
# as Nuclia processing engine will not have direct access to the storage.
|
161
|
+
return ProcessingDriverType.LOCAL
|
162
|
+
|
163
|
+
try:
|
164
|
+
return {
|
165
|
+
FileBackendConfig.GCS: ProcessingDriverType.GCS,
|
166
|
+
FileBackendConfig.S3: ProcessingDriverType.S3,
|
167
|
+
}[file_backend_driver]
|
168
|
+
except KeyError:
|
144
169
|
logger.error(
|
145
170
|
f"Not a valid file backend driver to processing, fallback to local: {file_backend_driver}"
|
146
171
|
)
|
147
|
-
return
|
172
|
+
return ProcessingDriverType.LOCAL
|
148
173
|
|
149
174
|
|
150
175
|
class ProcessingEngine:
|
@@ -162,37 +187,25 @@ class ProcessingEngine:
|
|
162
187
|
self.nuclia_service_account = nuclia_service_account
|
163
188
|
self.nuclia_zone = nuclia_zone
|
164
189
|
if nuclia_public_url is not None:
|
165
|
-
self.nuclia_public_url: Optional[str] = nuclia_public_url.format(
|
166
|
-
zone=nuclia_zone
|
167
|
-
)
|
190
|
+
self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
|
168
191
|
else:
|
169
192
|
self.nuclia_public_url = None
|
170
193
|
|
171
194
|
self.onprem = onprem
|
172
195
|
if self.onprem:
|
173
|
-
self.nuclia_upload_url =
|
174
|
-
f"{self.nuclia_public_url}/api/v1/processing/upload"
|
175
|
-
)
|
196
|
+
self.nuclia_upload_url = f"{self.nuclia_public_url}/api/v1/processing/upload"
|
176
197
|
else:
|
177
|
-
self.nuclia_upload_url =
|
178
|
-
|
179
|
-
)
|
180
|
-
self.nuclia_internal_push = (
|
181
|
-
f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
|
182
|
-
)
|
198
|
+
self.nuclia_upload_url = f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
|
199
|
+
self.nuclia_internal_push = f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
|
183
200
|
self.nuclia_internal_delete = (
|
184
201
|
f"{nuclia_processing_cluster_url}/api/v1/internal/processing/requests"
|
185
202
|
)
|
186
|
-
self.nuclia_external_push_v2 =
|
187
|
-
|
188
|
-
)
|
189
|
-
self.nuclia_external_delete = (
|
190
|
-
f"{self.nuclia_public_url}/api/v1/processing/requests"
|
191
|
-
)
|
203
|
+
self.nuclia_external_push_v2 = f"{self.nuclia_public_url}/api/v1/processing/push"
|
204
|
+
self.nuclia_external_delete = f"{self.nuclia_public_url}/api/v1/processing/requests"
|
192
205
|
|
193
206
|
self.nuclia_jwt_key = nuclia_jwt_key
|
194
207
|
self.days_to_keep = days_to_keep
|
195
|
-
self.driver = to_processing_driver_type(driver)
|
208
|
+
self.driver: ProcessingDriverType = to_processing_driver_type(driver)
|
196
209
|
self._exit_stack = AsyncExitStack()
|
197
210
|
|
198
211
|
async def initialize(self):
|
@@ -215,7 +228,7 @@ class ProcessingEngine:
|
|
215
228
|
"iat": now,
|
216
229
|
"md5": cf.md5,
|
217
230
|
"source": 1, # To indicate that this files comes internally
|
218
|
-
"driver": self.driver,
|
231
|
+
"driver": self.driver.value,
|
219
232
|
"jti": uuid.uuid4().hex,
|
220
233
|
"bucket_name": cf.bucket_name,
|
221
234
|
"filename": cf.filename,
|
@@ -239,7 +252,7 @@ class ProcessingEngine:
|
|
239
252
|
"iat": now,
|
240
253
|
"md5": file.file.md5,
|
241
254
|
"source": 1, # To indicate that this files comes internally
|
242
|
-
"driver": self.driver,
|
255
|
+
"driver": self.driver.value,
|
243
256
|
"jti": uuid.uuid4().hex,
|
244
257
|
"bucket_name": file.file.bucket_name,
|
245
258
|
"filename": file.file.filename,
|
@@ -248,6 +261,7 @@ class ProcessingEngine:
|
|
248
261
|
"content_type": file.file.content_type,
|
249
262
|
"password": file.password,
|
250
263
|
"language": file.language,
|
264
|
+
"extract_strategy": file.extract_strategy,
|
251
265
|
}
|
252
266
|
return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
|
253
267
|
|
@@ -265,6 +279,8 @@ class ProcessingEngine:
|
|
265
279
|
headers["X-LANGUAGE"] = file.language
|
266
280
|
headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode() # type: ignore
|
267
281
|
headers["X-MD5"] = file.file.md5
|
282
|
+
if file.extract_strategy is not None:
|
283
|
+
headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
|
268
284
|
headers["CONTENT_TYPE"] = file.file.content_type
|
269
285
|
headers["CONTENT-LENGTH"] = str(len(file.file.payload)) # type: ignore
|
270
286
|
headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
|
@@ -304,6 +320,7 @@ class ProcessingEngine:
|
|
304
320
|
"content_type": file_field.file.content_type,
|
305
321
|
"language": file_field.language,
|
306
322
|
"password": file_field.password,
|
323
|
+
"extract_strategy": file_field.extract_strategy,
|
307
324
|
}
|
308
325
|
return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
|
309
326
|
|
@@ -314,9 +331,7 @@ class ProcessingEngine:
|
|
314
331
|
max_tries=MAX_TRIES,
|
315
332
|
)
|
316
333
|
@processing_observer.wrap({"type": "file_field_upload_internal"})
|
317
|
-
async def convert_internal_filefield_to_str(
|
318
|
-
self, file: FieldFilePB, storage: Storage
|
319
|
-
) -> str:
|
334
|
+
async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
|
320
335
|
"""It's already an internal file that needs to be uploaded"""
|
321
336
|
if self.onprem is False:
|
322
337
|
# Upload the file to processing upload
|
@@ -325,19 +340,17 @@ class ProcessingEngine:
|
|
325
340
|
headers = {}
|
326
341
|
headers["X-PASSWORD"] = file.password
|
327
342
|
headers["X-LANGUAGE"] = file.language
|
328
|
-
headers["X-FILENAME"] = base64.b64encode(
|
329
|
-
file.file.filename.encode()
|
330
|
-
).decode()
|
343
|
+
headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()
|
331
344
|
headers["X-MD5"] = file.file.md5
|
332
345
|
headers["CONTENT-TYPE"] = file.file.content_type
|
333
346
|
if file.file.size:
|
334
347
|
headers["CONTENT-LENGTH"] = str(file.file.size)
|
348
|
+
if file.extract_strategy != "":
|
349
|
+
headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
|
335
350
|
headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
|
336
351
|
|
337
352
|
iterator = storage.downloadbytescf_iterator(file.file)
|
338
|
-
async with self.session.post(
|
339
|
-
self.nuclia_upload_url, data=iterator, headers=headers
|
340
|
-
) as resp:
|
353
|
+
async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
|
341
354
|
if resp.status == 200:
|
342
355
|
jwttoken = await resp.text()
|
343
356
|
elif resp.status == 402:
|
@@ -371,9 +384,7 @@ class ProcessingEngine:
|
|
371
384
|
headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
|
372
385
|
|
373
386
|
iterator = storage.downloadbytescf_iterator(cf)
|
374
|
-
async with self.session.post(
|
375
|
-
self.nuclia_upload_url, data=iterator, headers=headers
|
376
|
-
) as resp:
|
387
|
+
async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
|
377
388
|
if resp.status == 200:
|
378
389
|
jwttoken = await resp.text()
|
379
390
|
elif resp.status == 402:
|
@@ -393,9 +404,7 @@ class ProcessingEngine:
|
|
393
404
|
jitter=backoff.random_jitter,
|
394
405
|
max_tries=MAX_TRIES,
|
395
406
|
)
|
396
|
-
async def send_to_process(
|
397
|
-
self, item: PushPayload, partition: int
|
398
|
-
) -> ProcessingInfo:
|
407
|
+
async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
|
399
408
|
op_type = "process_external" if self.onprem else "process_internal"
|
400
409
|
with processing_observer({"type": op_type}):
|
401
410
|
headers = {"CONTENT-TYPE": "application/json"}
|
@@ -403,15 +412,13 @@ class ProcessingEngine:
|
|
403
412
|
# Upload the payload
|
404
413
|
item.partition = partition
|
405
414
|
resp = await self.session.post(
|
406
|
-
url=self.nuclia_internal_push, data=item.
|
415
|
+
url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
|
407
416
|
)
|
408
417
|
else:
|
409
|
-
headers.update(
|
410
|
-
{"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"}
|
411
|
-
)
|
418
|
+
headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
|
412
419
|
# Upload the payload
|
413
420
|
resp = await self.session.post(
|
414
|
-
url=self.nuclia_external_push_v2, data=item.
|
421
|
+
url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
|
415
422
|
)
|
416
423
|
if resp.status == 200:
|
417
424
|
data = await resp.json()
|
@@ -441,9 +448,7 @@ class ProcessingEngine:
|
|
441
448
|
queue=QueueType(queue_type) if queue_type is not None else None,
|
442
449
|
)
|
443
450
|
|
444
|
-
async def delete_from_processing(
|
445
|
-
self, *, kbid: str, resource_id: Optional[str] = None
|
446
|
-
) -> None:
|
451
|
+
async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
|
447
452
|
"""
|
448
453
|
Delete a resource from processing. This prevents inflight resources from being processed
|
449
454
|
and wasting resources.
|
@@ -473,7 +478,7 @@ class ProcessingEngine:
|
|
473
478
|
|
474
479
|
class DummyProcessingEngine(ProcessingEngine):
|
475
480
|
def __init__(self):
|
476
|
-
self.calls: list[list[Any]] = []
|
481
|
+
self.calls: list[list[Any]] = []
|
477
482
|
self.values = defaultdict(list)
|
478
483
|
self.onprem = True
|
479
484
|
|
@@ -495,9 +500,7 @@ class DummyProcessingEngine(ProcessingEngine):
|
|
495
500
|
self.values["convert_external_filefield_to_str"].append(file_field)
|
496
501
|
return f"convert_external_filefield_to_str,{index}"
|
497
502
|
|
498
|
-
async def convert_internal_filefield_to_str(
|
499
|
-
self, file: FieldFilePB, storage: Storage
|
500
|
-
) -> str:
|
503
|
+
async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
|
501
504
|
self.calls.append([file, storage])
|
502
505
|
index = len(self.values["convert_internal_filefield_to_str"])
|
503
506
|
self.values["convert_internal_filefield_to_str"].append([file, storage])
|
@@ -509,16 +512,10 @@ class DummyProcessingEngine(ProcessingEngine):
|
|
509
512
|
self.values["convert_internal_cf_to_str"].append([cf, storage])
|
510
513
|
return f"convert_internal_cf_to_str,{index}"
|
511
514
|
|
512
|
-
async def send_to_process(
|
513
|
-
self, item: PushPayload, partition: int
|
514
|
-
) -> ProcessingInfo:
|
515
|
+
async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
|
515
516
|
self.calls.append([item, partition])
|
516
517
|
self.values["send_to_process"].append([item, partition])
|
517
|
-
return ProcessingInfo(
|
518
|
-
seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED
|
519
|
-
)
|
518
|
+
return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
|
520
519
|
|
521
|
-
async def delete_from_processing(
|
522
|
-
self, *, kbid: str, resource_id: Optional[str] = None
|
523
|
-
) -> None:
|
520
|
+
async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
|
524
521
|
self.calls.append([kbid, resource_id])
|
nucliadb/ingest/py.typed
ADDED
File without changes
|