nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -45,9 +45,7 @@ from .shard_creator import ShardCreatorHandler
|
|
45
45
|
def _handle_task_result(task: asyncio.Task) -> None:
|
46
46
|
e = task.exception()
|
47
47
|
if e:
|
48
|
-
logger.exception(
|
49
|
-
"Loop stopped by exception. This should not happen. Exiting.", exc_info=e
|
50
|
-
)
|
48
|
+
logger.exception("Loop stopped by exception. This should not happen. Exiting.", exc_info=e)
|
51
49
|
sys.exit(1)
|
52
50
|
|
53
51
|
|
@@ -87,9 +85,7 @@ async def start_ingest_consumers(
|
|
87
85
|
if transaction_settings.transaction_local:
|
88
86
|
raise ConfigurationError("Can not start ingest consumers in local mode")
|
89
87
|
|
90
|
-
while len(
|
91
|
-
manager.get_index_nodes()
|
92
|
-
) == 0 and running_settings.running_environment not in (
|
88
|
+
while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
|
93
89
|
"local",
|
94
90
|
"test",
|
95
91
|
):
|
@@ -101,9 +97,9 @@ async def start_ingest_consumers(
|
|
101
97
|
storage = await get_storage(service_name=service_name or SERVICE_NAME)
|
102
98
|
nats_connection_manager = get_nats_manager()
|
103
99
|
|
104
|
-
max_concurrent_processing = asyncio.Semaphore(
|
105
|
-
|
106
|
-
|
100
|
+
max_concurrent_processing = asyncio.Semaphore(settings.max_concurrent_ingest_processing)
|
101
|
+
|
102
|
+
consumer_finalizers = []
|
107
103
|
|
108
104
|
for partition in settings.partitions:
|
109
105
|
consumer = IngestConsumer(
|
@@ -115,8 +111,15 @@ async def start_ingest_consumers(
|
|
115
111
|
lock=max_concurrent_processing,
|
116
112
|
)
|
117
113
|
await consumer.initialize()
|
114
|
+
consumer_finalizers.append(consumer.finalize)
|
118
115
|
|
119
|
-
|
116
|
+
async def _finalize():
|
117
|
+
# Finalize all the consumers and the nats connection manager
|
118
|
+
for consumer_finalize in consumer_finalizers:
|
119
|
+
await consumer_finalize()
|
120
|
+
await nats_connection_manager.finalize()
|
121
|
+
|
122
|
+
return _finalize
|
120
123
|
|
121
124
|
|
122
125
|
async def start_ingest_processed_consumer(
|
@@ -132,9 +135,7 @@ async def start_ingest_processed_consumer(
|
|
132
135
|
if transaction_settings.transaction_local:
|
133
136
|
raise ConfigurationError("Can not start ingest consumers in local mode")
|
134
137
|
|
135
|
-
while len(
|
136
|
-
manager.get_index_nodes()
|
137
|
-
) == 0 and running_settings.running_environment not in (
|
138
|
+
while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
|
138
139
|
"local",
|
139
140
|
"test",
|
140
141
|
):
|
@@ -161,19 +162,20 @@ async def start_ingest_processed_consumer(
|
|
161
162
|
async def start_auditor() -> Callable[[], Awaitable[None]]:
|
162
163
|
audit = get_audit()
|
163
164
|
assert audit is not None
|
165
|
+
|
164
166
|
pubsub = await get_pubsub()
|
165
167
|
assert pubsub is not None, "Pubsub is not configured"
|
166
168
|
storage = await get_storage(service_name=SERVICE_NAME)
|
167
169
|
index_auditor = IndexAuditHandler(audit=audit, pubsub=pubsub)
|
168
|
-
resource_writes_auditor = ResourceWritesAuditHandler(
|
169
|
-
storage=storage, audit=audit, pubsub=pubsub
|
170
|
-
)
|
170
|
+
resource_writes_auditor = ResourceWritesAuditHandler(storage=storage, audit=audit, pubsub=pubsub)
|
171
171
|
|
172
172
|
await index_auditor.initialize()
|
173
173
|
await resource_writes_auditor.initialize()
|
174
174
|
|
175
175
|
return partial(
|
176
|
-
asyncio.gather,
|
176
|
+
asyncio.gather,
|
177
|
+
index_auditor.finalize(),
|
178
|
+
resource_writes_auditor.finalize(), # type: ignore
|
177
179
|
)
|
178
180
|
|
179
181
|
|
@@ -82,9 +82,7 @@ class ShardCreatorHandler:
|
|
82
82
|
metrics.total_messages.inc({"type": "shard_creator", "action": "ignored"})
|
83
83
|
return
|
84
84
|
|
85
|
-
self.task_handler.schedule(
|
86
|
-
notification.kbid, partial(self.process_kb, notification.kbid)
|
87
|
-
)
|
85
|
+
self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
|
88
86
|
metrics.total_messages.inc({"type": "shard_creator", "action": "scheduled"})
|
89
87
|
|
90
88
|
@metrics.handler_histo.wrap({"type": "shard_creator"})
|
@@ -105,7 +103,7 @@ class ShardCreatorHandler:
|
|
105
103
|
async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
|
106
104
|
# remember, a lock will do at least 1+ reads and 1 write.
|
107
105
|
# with heavy writes, this adds some simple k/v pressure
|
108
|
-
node, shard_id = choose_node(current_shard)
|
106
|
+
node, shard_id = choose_node(current_shard, use_nidx=True)
|
109
107
|
shard: nodereader_pb2.Shard = await node.reader.GetShard(
|
110
108
|
nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
|
111
109
|
)
|
@@ -48,9 +48,7 @@ class DelayedTaskHandler:
|
|
48
48
|
for task in list(self.outstanding_tasks.values()):
|
49
49
|
await task
|
50
50
|
|
51
|
-
def schedule(
|
52
|
-
self, key: str, handler: Callable[[], Coroutine[None, None, None]]
|
53
|
-
) -> None:
|
51
|
+
def schedule(self, key: str, handler: Callable[[], Coroutine[None, None, None]]) -> None:
|
54
52
|
if key in self.to_process:
|
55
53
|
# already waiting to process this key, ignore
|
56
54
|
return
|
nucliadb/ingest/fields/base.py
CHANGED
@@ -21,14 +21,20 @@ from __future__ import annotations
|
|
21
21
|
|
22
22
|
import enum
|
23
23
|
from datetime import datetime
|
24
|
-
from typing import Any, Optional, Type
|
24
|
+
from typing import Any, Generic, Optional, Type, TypeVar
|
25
25
|
|
26
|
+
from google.protobuf.message import DecodeError, Message
|
27
|
+
|
28
|
+
from nucliadb.common import datamanagers
|
29
|
+
from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
|
26
30
|
from nucliadb_protos.resources_pb2 import (
|
27
31
|
CloudFile,
|
28
32
|
ExtractedTextWrapper,
|
29
33
|
ExtractedVectorsWrapper,
|
34
|
+
FieldAuthor,
|
30
35
|
FieldComputedMetadata,
|
31
36
|
FieldComputedMetadataWrapper,
|
37
|
+
FieldQuestionAnswers,
|
32
38
|
FieldQuestionAnswerWrapper,
|
33
39
|
LargeComputedMetadata,
|
34
40
|
LargeComputedMetadataWrapper,
|
@@ -36,34 +42,33 @@ from nucliadb_protos.resources_pb2 import (
|
|
36
42
|
)
|
37
43
|
from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
|
38
44
|
from nucliadb_protos.writer_pb2 import Error
|
39
|
-
|
40
|
-
from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
|
41
45
|
from nucliadb_utils.storages.storage import Storage, StorageField
|
42
46
|
|
43
|
-
|
44
|
-
KB_RESOURCE_ERROR = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/error"
|
45
|
-
|
46
|
-
SUBFIELDFIELDS = ["l", "c"]
|
47
|
+
SUBFIELDFIELDS = ("c",)
|
47
48
|
|
48
49
|
|
49
50
|
class FieldTypes(str, enum.Enum):
|
50
51
|
FIELD_TEXT = "extracted_text"
|
51
52
|
FIELD_VECTORS = "extracted_vectors"
|
53
|
+
FIELD_VECTORSET = "{vectorset}/extracted_vectors"
|
52
54
|
FIELD_METADATA = "metadata"
|
53
55
|
FIELD_LARGE_METADATA = "large_metadata"
|
54
56
|
THUMBNAIL = "thumbnail"
|
55
57
|
QUESTION_ANSWERS = "question_answers"
|
56
58
|
|
57
59
|
|
58
|
-
|
59
|
-
|
60
|
+
PbType = TypeVar("PbType", bound=Message)
|
61
|
+
|
62
|
+
|
63
|
+
class Field(Generic[PbType]):
|
64
|
+
pbklass: Type[PbType]
|
60
65
|
type: str = "x"
|
61
66
|
value: Optional[Any]
|
62
67
|
extracted_text: Optional[ExtractedText]
|
63
|
-
extracted_vectors: Optional[VectorObject]
|
68
|
+
extracted_vectors: dict[Optional[str], VectorObject]
|
64
69
|
computed_metadata: Optional[FieldComputedMetadata]
|
65
70
|
large_computed_metadata: Optional[LargeComputedMetadata]
|
66
|
-
question_answers: Optional[
|
71
|
+
question_answers: Optional[FieldQuestionAnswers]
|
67
72
|
|
68
73
|
def __init__(
|
69
74
|
self,
|
@@ -77,7 +82,7 @@ class Field:
|
|
77
82
|
|
78
83
|
self.value = None
|
79
84
|
self.extracted_text: Optional[ExtractedText] = None
|
80
|
-
self.extracted_vectors =
|
85
|
+
self.extracted_vectors = {}
|
81
86
|
self.computed_metadata = None
|
82
87
|
self.large_computed_metadata = None
|
83
88
|
self.question_answers = None
|
@@ -112,44 +117,51 @@ class Field:
|
|
112
117
|
return f"{self.uuid}/{self.type}/{self.id}"
|
113
118
|
|
114
119
|
def get_storage_field(self, field_type: FieldTypes) -> StorageField:
|
115
|
-
return self.storage.file_extracted(
|
116
|
-
|
117
|
-
|
120
|
+
return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)
|
121
|
+
|
122
|
+
def _get_extracted_vectors_storage_field(self, vectorset: Optional[str] = None) -> StorageField:
|
123
|
+
if vectorset:
|
124
|
+
key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
|
125
|
+
else:
|
126
|
+
key = FieldTypes.FIELD_VECTORS.value
|
127
|
+
return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
|
118
128
|
|
119
|
-
async def db_get_value(self):
|
129
|
+
async def db_get_value(self) -> Optional[PbType]:
|
120
130
|
if self.value is None:
|
121
|
-
payload = await
|
122
|
-
|
123
|
-
|
124
|
-
|
131
|
+
payload = await datamanagers.fields.get_raw(
|
132
|
+
self.resource.txn,
|
133
|
+
kbid=self.kbid,
|
134
|
+
rid=self.uuid,
|
135
|
+
field_type=self.type,
|
136
|
+
field_id=self.id,
|
125
137
|
)
|
126
138
|
if payload is None:
|
127
|
-
return
|
139
|
+
return None
|
128
140
|
|
129
141
|
self.value = self.pbklass()
|
130
142
|
self.value.ParseFromString(payload)
|
131
143
|
return self.value
|
132
144
|
|
133
145
|
async def db_set_value(self, payload: Any):
|
134
|
-
await
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
146
|
+
await datamanagers.fields.set(
|
147
|
+
self.resource.txn,
|
148
|
+
kbid=self.kbid,
|
149
|
+
rid=self.uuid,
|
150
|
+
field_type=self.type,
|
151
|
+
field_id=self.id,
|
152
|
+
value=payload,
|
139
153
|
)
|
140
154
|
self.value = payload
|
141
155
|
self.resource.modified = True
|
142
156
|
|
143
157
|
async def delete(self):
|
144
|
-
|
145
|
-
|
158
|
+
await datamanagers.fields.delete(
|
159
|
+
self.resource.txn,
|
160
|
+
kbid=self.kbid,
|
161
|
+
rid=self.uuid,
|
162
|
+
field_type=self.type,
|
163
|
+
field_id=self.id,
|
146
164
|
)
|
147
|
-
# Make sure we explicitly delete the field and any nested key
|
148
|
-
keys_to_delete = []
|
149
|
-
async for key in self.resource.txn.keys(field_base_key):
|
150
|
-
keys_to_delete.append(key)
|
151
|
-
for key in keys_to_delete:
|
152
|
-
await self.resource.txn.delete(key)
|
153
165
|
await self.delete_extracted_text()
|
154
166
|
await self.delete_vectors()
|
155
167
|
await self.delete_metadata()
|
@@ -169,9 +181,9 @@ class Field:
|
|
169
181
|
except KeyError:
|
170
182
|
pass
|
171
183
|
|
172
|
-
async def delete_vectors(self) -> None:
|
184
|
+
async def delete_vectors(self, vectorset: Optional[str] = None) -> None:
|
173
185
|
# Try delete vectors
|
174
|
-
sf = self.
|
186
|
+
sf = self._get_extracted_vectors_storage_field(vectorset)
|
175
187
|
try:
|
176
188
|
await self.storage.delete_upload(sf.key, sf.bucket)
|
177
189
|
except KeyError:
|
@@ -185,53 +197,79 @@ class Field:
|
|
185
197
|
pass
|
186
198
|
|
187
199
|
async def get_error(self) -> Optional[Error]:
    """Fetch the error recorded for this field via ``datamanagers.fields.get_error``.

    Returns:
        Whatever the datamanager returns for this (kbid, rid, field type,
        field id) within the resource transaction; the annotation allows
        ``None``.
    """
    stored_error = await datamanagers.fields.get_error(
        self.resource.txn,
        kbid=self.kbid,
        rid=self.uuid,
        field_type=self.type,
        field_id=self.id,
    )
    return stored_error
|
193
|
-
if payload is None:
|
194
|
-
return None
|
195
|
-
pberror = Error()
|
196
|
-
pberror.ParseFromString(payload)
|
197
|
-
return pberror
|
198
207
|
|
199
208
|
async def set_error(self, error: Error) -> None:
    """Record ``error`` for this field via ``datamanagers.fields.set_error``.

    The call is made on the resource transaction and identifies the field by
    kbid, resource id, field type and field id.
    """
    txn = self.resource.txn
    await datamanagers.fields.set_error(
        txn,
        kbid=self.kbid,
        rid=self.uuid,
        field_type=self.type,
        field_id=self.id,
        error=error,
    )
|
206
217
|
|
207
|
-
async def get_question_answers(self) -> Optional[
|
208
|
-
if self.question_answers is None:
|
218
|
+
async def get_question_answers(self, force: bool = False) -> Optional[FieldQuestionAnswers]:
    """Return the question/answers of this field, downloading them when needed.

    The cached ``self.question_answers`` is reused unless it is ``None`` or
    ``force`` is true, in which case the QUESTION_ANSWERS storage field is
    downloaded again.

    Args:
        force: When true, bypass the cached value and download again.

    Returns:
        The cached or freshly downloaded ``FieldQuestionAnswers``, or ``None``
        when nothing could be downloaded and nothing was cached.
    """
    if self.question_answers is None or force:
        sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
        # Bind up front: the fallback branch below may leave nothing to
        # assign, and the check after the try block must not hit an unbound name.
        payload: Optional[FieldQuestionAnswers] = None
        try:
            payload = await self.storage.download_pb(sf, FieldQuestionAnswers)
        except DecodeError:
            # The stored blob did not decode as FieldQuestionAnswers; retry as
            # a bare QuestionAnswers message and wrap it in the current type.
            deprecated_payload = await self.storage.download_pb(sf, QuestionAnswers)
            if deprecated_payload is not None:
                payload = FieldQuestionAnswers()
                payload.question_answers.CopyFrom(deprecated_payload)
        if payload is not None:
            self.question_answers = payload
    return self.question_answers
|
214
231
|
|
215
232
|
async def set_question_answers(self, payload: FieldQuestionAnswerWrapper) -> None:
    """Store the question/answers carried by ``payload`` for this field.

    For field types listed in ``SUBFIELDFIELDS`` the currently stored value is
    loaded first (forcing a fresh download) and the incoming data is merged
    into it; for every other type, or when nothing is stored yet, the incoming
    data is written as is.

    Args:
        payload: Wrapper holding either a ``file`` reference to the serialized
            question/answers or the ``question_answers`` message itself.
    """
    if self.type in SUBFIELDFIELDS:
        try:
            actual_payload: Optional[FieldQuestionAnswers] = await self.get_question_answers(
                force=True
            )
        except KeyError:
            actual_payload = None
    else:
        actual_payload = None
    sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)

    if actual_payload is None:
        # It's the first question/answers payload for this field
        if payload.HasField("file"):
            await self.storage.normalize_binary(payload.file, sf)
        else:
            await self.storage.upload_pb(sf, payload.question_answers)
            self.question_answers = payload.question_answers
    else:
        if payload.HasField("file"):
            # Inline the referenced file so the merge below only has to deal
            # with payload.question_answers.
            raw_payload = await self.storage.downloadbytescf(payload.file)
            pb = FieldQuestionAnswers()
            pb.ParseFromString(raw_payload.read())
            raw_payload.flush()
            payload.question_answers.CopyFrom(pb)
        # From here on the data is known to live in payload.question_answers
        for key, value in payload.question_answers.split_question_answers.items():
            # Copy in place, as set_vectors does for split_vectors: protobuf
            # rejects direct assignment of message-valued map entries.
            # NOTE(review): assumes the map values are messages -- confirm
            # against the FieldQuestionAnswers proto definition.
            actual_payload.split_question_answers[key].CopyFrom(value)
        for key in payload.question_answers.deleted_splits:
            if key in actual_payload.split_question_answers:
                del actual_payload.split_question_answers[key]
        # HasField returns a bool; the previous comparison against "" was
        # always true and overwrote the stored answers even when the update
        # carried none.
        if payload.question_answers.HasField("question_answers"):
            actual_payload.question_answers.CopyFrom(payload.question_answers.question_answers)
        await self.storage.upload_pb(sf, actual_payload)
        self.question_answers = actual_payload
|
228
268
|
|
229
269
|
async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
|
230
270
|
if self.type in SUBFIELDFIELDS:
|
231
271
|
try:
|
232
|
-
actual_payload: Optional[ExtractedText] = await self.get_extracted_text(
|
233
|
-
force=True
|
234
|
-
)
|
272
|
+
actual_payload: Optional[ExtractedText] = await self.get_extracted_text(force=True)
|
235
273
|
except KeyError:
|
236
274
|
actual_payload = None
|
237
275
|
else:
|
@@ -271,23 +309,21 @@ class Field:
|
|
271
309
|
self.extracted_text = payload
|
272
310
|
return self.extracted_text
|
273
311
|
|
274
|
-
async def set_vectors(
|
275
|
-
|
276
|
-
) -> tuple[Optional[VectorObject], bool, list[str]]:
|
312
|
+
async def set_vectors(self, payload: ExtractedVectorsWrapper) -> Optional[VectorObject]:
|
313
|
+
vectorset = payload.vectorset_id or None
|
277
314
|
if self.type in SUBFIELDFIELDS:
|
278
315
|
try:
|
279
316
|
actual_payload: Optional[VectorObject] = await self.get_vectors(
|
280
|
-
|
317
|
+
vectorset=vectorset,
|
318
|
+
force=True,
|
281
319
|
)
|
282
320
|
except KeyError:
|
283
321
|
actual_payload = None
|
284
322
|
else:
|
285
323
|
actual_payload = None
|
286
324
|
|
287
|
-
sf = self.
|
325
|
+
sf = self._get_extracted_vectors_storage_field(vectorset)
|
288
326
|
vo: Optional[VectorObject] = None
|
289
|
-
replace_field: bool = True
|
290
|
-
replace_splits = []
|
291
327
|
if actual_payload is None:
|
292
328
|
# Its first extracted text
|
293
329
|
if payload.HasField("file"):
|
@@ -296,7 +332,7 @@ class Field:
|
|
296
332
|
else:
|
297
333
|
await self.storage.upload_pb(sf, payload.vectors)
|
298
334
|
vo = payload.vectors
|
299
|
-
self.extracted_vectors = payload.vectors
|
335
|
+
self.extracted_vectors[vectorset] = payload.vectors
|
300
336
|
else:
|
301
337
|
if payload.HasField("file"):
|
302
338
|
raw_payload = await self.storage.downloadbytescf(payload.file)
|
@@ -304,36 +340,38 @@ class Field:
|
|
304
340
|
pb.ParseFromString(raw_payload.read())
|
305
341
|
raw_payload.flush()
|
306
342
|
payload.vectors.CopyFrom(pb)
|
307
|
-
vo =
|
343
|
+
vo = actual_payload
|
308
344
|
# We know its payload.body
|
309
345
|
for key, value in payload.vectors.split_vectors.items():
|
310
346
|
actual_payload.split_vectors[key].CopyFrom(value)
|
311
347
|
for key in payload.vectors.deleted_splits:
|
312
348
|
if key in actual_payload.split_vectors:
|
313
|
-
replace_splits.append(key)
|
314
349
|
del actual_payload.split_vectors[key]
|
315
350
|
if len(payload.vectors.vectors.vectors) > 0:
|
316
|
-
replace_field = True
|
317
351
|
actual_payload.vectors.CopyFrom(payload.vectors.vectors)
|
318
352
|
await self.storage.upload_pb(sf, actual_payload)
|
319
|
-
self.extracted_vectors = actual_payload
|
320
|
-
return vo
|
321
|
-
|
322
|
-
async def get_vectors(
|
323
|
-
|
324
|
-
|
353
|
+
self.extracted_vectors[vectorset] = actual_payload
|
354
|
+
return vo
|
355
|
+
|
356
|
+
async def get_vectors(
    self, vectorset: Optional[str] = None, force: bool = False
) -> Optional[VectorObject]:
    """Return the extracted vectors of this field for ``vectorset``.

    Vectors are cached per vectorset in ``self.extracted_vectors``. The
    storage is only hit when there is no cached entry for the vectorset or
    ``force`` is true.

    Args:
        vectorset: Vectorset id; an empty string is treated like ``None``.
        force: When true, download again even if a cached entry exists.

    Returns:
        The cached ``VectorObject`` for the vectorset, or ``None`` if there is none.
    """
    # Vectorset ids coming from protobuf use "" for "no value"; normalize to
    # None so both spellings resolve to the same cache entry and storage field.
    vectorset = vectorset or None
    cached = self.extracted_vectors.get(vectorset, None)
    if force or cached is None:
        storage_field = self._get_extracted_vectors_storage_field(vectorset)
        downloaded = await self.storage.download_pb(storage_field, VectorObject)
        if downloaded is not None:
            self.extracted_vectors[vectorset] = downloaded
    return self.extracted_vectors.get(vectorset, None)
|
329
369
|
|
330
|
-
async def set_field_metadata(
|
331
|
-
self, payload: FieldComputedMetadataWrapper
|
332
|
-
) -> tuple[FieldComputedMetadata, list[str], dict[str, list[str]]]:
|
370
|
+
async def set_field_metadata(self, payload: FieldComputedMetadataWrapper) -> FieldComputedMetadata:
|
333
371
|
if self.type in SUBFIELDFIELDS:
|
334
372
|
try:
|
335
|
-
actual_payload: Optional[FieldComputedMetadata] = (
|
336
|
-
|
373
|
+
actual_payload: Optional[FieldComputedMetadata] = await self.get_field_metadata(
|
374
|
+
force=True
|
337
375
|
)
|
338
376
|
except KeyError:
|
339
377
|
actual_payload = None
|
@@ -359,8 +397,6 @@ class Field:
|
|
359
397
|
metadata.thumbnail.CopyFrom(cf_split)
|
360
398
|
metadata.last_index.FromDatetime(datetime.now())
|
361
399
|
|
362
|
-
replace_field = []
|
363
|
-
replace_splits = {}
|
364
400
|
if actual_payload is None:
|
365
401
|
# Its first metadata
|
366
402
|
await self.storage.upload_pb(sf, payload.metadata)
|
@@ -371,22 +407,15 @@ class Field:
|
|
371
407
|
actual_payload.split_metadata[key].CopyFrom(value)
|
372
408
|
for key in payload.metadata.deleted_splits:
|
373
409
|
if key in actual_payload.split_metadata:
|
374
|
-
replace_splits[key] = [
|
375
|
-
f"{x.start}-{x.end}"
|
376
|
-
for x in actual_payload.split_metadata[key].paragraphs
|
377
|
-
]
|
378
410
|
del actual_payload.split_metadata[key]
|
379
411
|
if payload.metadata.metadata:
|
380
412
|
actual_payload.metadata.CopyFrom(payload.metadata.metadata)
|
381
|
-
replace_field = [f"{x.start}-{x.end}" for x in metadata.paragraphs]
|
382
413
|
await self.storage.upload_pb(sf, actual_payload)
|
383
414
|
self.computed_metadata = actual_payload
|
384
415
|
|
385
|
-
return self.computed_metadata
|
416
|
+
return self.computed_metadata
|
386
417
|
|
387
|
-
async def get_field_metadata(
|
388
|
-
self, force: bool = False
|
389
|
-
) -> Optional[FieldComputedMetadata]:
|
418
|
+
async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
|
390
419
|
if self.computed_metadata is None or force:
|
391
420
|
sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
|
392
421
|
payload = await self.storage.download_pb(sf, FieldComputedMetadata)
|
@@ -397,8 +426,8 @@ class Field:
|
|
397
426
|
async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
|
398
427
|
if self.type in SUBFIELDFIELDS:
|
399
428
|
try:
|
400
|
-
actual_payload: Optional[LargeComputedMetadata] = (
|
401
|
-
|
429
|
+
actual_payload: Optional[LargeComputedMetadata] = await self.get_large_field_metadata(
|
430
|
+
force=True
|
402
431
|
)
|
403
432
|
except KeyError:
|
404
433
|
actual_payload = None
|
@@ -434,9 +463,7 @@ class Field:
|
|
434
463
|
|
435
464
|
return self.large_computed_metadata
|
436
465
|
|
437
|
-
async def get_large_field_metadata(
|
438
|
-
self, force: bool = False
|
439
|
-
) -> Optional[LargeComputedMetadata]:
|
466
|
+
async def get_large_field_metadata(self, force: bool = False) -> Optional[LargeComputedMetadata]:
|
440
467
|
if self.large_computed_metadata is None or force:
|
441
468
|
sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
|
442
469
|
payload = await self.storage.download_pb(
|
@@ -447,6 +474,11 @@ class Field:
|
|
447
474
|
self.large_computed_metadata = payload
|
448
475
|
return self.large_computed_metadata
|
449
476
|
|
477
|
+
async def generated_by(self) -> FieldAuthor:
    """Build the ``FieldAuthor`` for this field with its ``user`` member set."""
    result = FieldAuthor()
    # SetInParent marks the (empty) ``user`` sub-message as present.
    result.user.SetInParent()
    return result
|
481
|
+
|
450
482
|
def serialize(self):
    """Return ``self.value`` serialized through its ``SerializeToString`` method."""
    current = self.value
    return current.SerializeToString()
|
452
484
|
|
@@ -20,11 +20,9 @@
|
|
20
20
|
import uuid
|
21
21
|
from typing import Any, Optional
|
22
22
|
|
23
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
24
|
-
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
25
|
-
from nucliadb_protos.resources_pb2 import FieldConversation
|
26
|
-
|
27
23
|
from nucliadb.ingest.fields.base import Field
|
24
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
|
25
|
+
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
28
26
|
from nucliadb_utils.storages.storage import StorageField
|
29
27
|
|
30
28
|
PAGE_SIZE = 200
|
@@ -36,7 +34,7 @@ class PageNotFound(Exception):
|
|
36
34
|
pass
|
37
35
|
|
38
36
|
|
39
|
-
class Conversation(Field):
|
37
|
+
class Conversation(Field[PBConversation]):
|
40
38
|
pbklass = PBConversation
|
41
39
|
type: str = "c"
|
42
40
|
value: dict[int, PBConversation]
|
@@ -120,6 +118,21 @@ class Conversation(Field):
|
|
120
118
|
except PageNotFound:
|
121
119
|
return None
|
122
120
|
|
121
|
+
async def get_full_conversation(self) -> Optional[PBConversation]:
    """
    The messages of a conversation may be stored across several pages.
    This method walks the pages starting at page 1, until ``get_value``
    returns ``None``, and returns a single conversation holding every message.
    """
    merged = PBConversation()
    page_number = 1
    while (current_page := await self.get_value(page=page_number)) is not None:
        merged.messages.extend(current_page.messages)
        page_number += 1
    return merged
|
135
|
+
|
123
136
|
async def get_metadata(self) -> FieldConversation:
|
124
137
|
if self.metadata is None:
|
125
138
|
payload = await self.resource.txn.get(
|
@@ -28,7 +28,4 @@ class InvalidPBClass(Exception):
|
|
28
28
|
def __init__(self, source: Type, destination: Type):
    """Keep both classes on the exception and build its message from them.

    Args:
        source: The class that was provided.
        destination: The class that was expected to match ``source``.
    """
    self.source = source
    self.destination = destination
    message = f"Source and destination does not match {self.source} - {self.destination}"
    super().__init__(message)
|
nucliadb/ingest/fields/file.py
CHANGED
@@ -19,15 +19,14 @@
|
|
19
19
|
#
|
20
20
|
from typing import Any, Optional
|
21
21
|
|
22
|
-
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
|
23
|
-
|
24
22
|
from nucliadb.ingest.fields.base import Field
|
23
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
|
25
24
|
from nucliadb_utils.storages.storage import StorageField
|
26
25
|
|
27
26
|
FILE_METADATA = "file_metadata"
|
28
27
|
|
29
28
|
|
30
|
-
class File(Field):
|
29
|
+
class File(Field[FieldFile]):
|
31
30
|
pbklass = FieldFile
|
32
31
|
value: FieldFile
|
33
32
|
type: str = "f"
|
@@ -52,15 +51,13 @@ class File(Field):
|
|
52
51
|
|
53
52
|
is_external_file = payload.file.source == CloudFile.Source.EXTERNAL
|
54
53
|
if not is_external_file:
|
55
|
-
sf: StorageField = self.storage.file_field(
|
56
|
-
self.kbid, self.uuid, self.id, old_cf
|
57
|
-
)
|
54
|
+
sf: StorageField = self.storage.file_field(self.kbid, self.uuid, self.id, old_cf)
|
58
55
|
cf: CloudFile = await self.storage.normalize_binary(payload.file, sf)
|
59
56
|
payload.file.CopyFrom(cf)
|
60
57
|
|
61
58
|
await self.db_set_value(payload)
|
62
59
|
|
63
|
-
async def get_value(self) -> FieldFile:
|
60
|
+
async def get_value(self) -> Optional[FieldFile]:
    """Return this file field's value as provided by ``db_get_value``."""
    stored = await self.db_get_value()
    return stored
|
65
62
|
|
66
63
|
async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
|
@@ -80,17 +77,13 @@ class File(Field):
|
|
80
77
|
cf_file_page_preview: CloudFile = await self.storage.normalize_binary(
|
81
78
|
preview, sf_file_page_preview
|
82
79
|
)
|
83
|
-
file_extracted_data.file_pages_previews.pages[page].CopyFrom(
|
84
|
-
cf_file_page_preview
|
85
|
-
)
|
80
|
+
file_extracted_data.file_pages_previews.pages[page].CopyFrom(cf_file_page_preview)
|
86
81
|
|
87
82
|
for fileid, origincf in file_extracted_data.file_generated.items():
|
88
83
|
sf_generated: StorageField = self.storage.file_extracted(
|
89
84
|
self.kbid, self.uuid, self.type, self.id, f"generated/{fileid}"
|
90
85
|
)
|
91
|
-
cf_generated: CloudFile = await self.storage.normalize_binary(
|
92
|
-
origincf, sf_generated
|
93
|
-
)
|
86
|
+
cf_generated: CloudFile = await self.storage.normalize_binary(origincf, sf_generated)
|
94
87
|
file_extracted_data.file_generated[fileid].CopyFrom(cf_generated)
|
95
88
|
|
96
89
|
if file_extracted_data.HasField("file_thumbnail"):
|
@@ -113,7 +106,5 @@ class File(Field):
|
|
113
106
|
sf: StorageField = self.storage.file_extracted(
|
114
107
|
self.kbid, self.uuid, self.type, self.id, FILE_METADATA
|
115
108
|
)
|
116
|
-
self.file_extracted_data = await self.storage.download_pb(
|
117
|
-
sf, FileExtractedData
|
118
|
-
)
|
109
|
+
self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
|
119
110
|
return self.file_extracted_data
|