nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/fields/base.py
CHANGED
@@ -21,57 +21,54 @@ from __future__ import annotations
|
|
21
21
|
|
22
22
|
import enum
|
23
23
|
from datetime import datetime
|
24
|
-
from typing import Any, Optional, Type
|
24
|
+
from typing import Any, Generic, Optional, Type, TypeVar
|
25
25
|
|
26
|
+
from google.protobuf.message import DecodeError, Message
|
27
|
+
|
28
|
+
from nucliadb.common import datamanagers
|
29
|
+
from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
|
26
30
|
from nucliadb_protos.resources_pb2 import (
|
27
31
|
CloudFile,
|
28
32
|
ExtractedTextWrapper,
|
29
33
|
ExtractedVectorsWrapper,
|
34
|
+
FieldAuthor,
|
30
35
|
FieldComputedMetadata,
|
31
36
|
FieldComputedMetadataWrapper,
|
37
|
+
FieldQuestionAnswers,
|
32
38
|
FieldQuestionAnswerWrapper,
|
33
39
|
LargeComputedMetadata,
|
34
40
|
LargeComputedMetadataWrapper,
|
35
41
|
QuestionAnswers,
|
36
|
-
UserVectorsWrapper,
|
37
|
-
)
|
38
|
-
from nucliadb_protos.utils_pb2 import (
|
39
|
-
ExtractedText,
|
40
|
-
UserVectorSet,
|
41
|
-
UserVectorsList,
|
42
|
-
VectorObject,
|
43
42
|
)
|
43
|
+
from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
|
44
44
|
from nucliadb_protos.writer_pb2 import Error
|
45
|
-
|
46
|
-
from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
|
47
45
|
from nucliadb_utils.storages.storage import Storage, StorageField
|
48
46
|
|
49
|
-
|
50
|
-
KB_RESOURCE_ERROR = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/error"
|
51
|
-
|
52
|
-
SUBFIELDFIELDS = ["l", "c"]
|
47
|
+
SUBFIELDFIELDS = ("c",)
|
53
48
|
|
54
49
|
|
55
50
|
class FieldTypes(str, enum.Enum):
|
56
51
|
FIELD_TEXT = "extracted_text"
|
57
52
|
FIELD_VECTORS = "extracted_vectors"
|
58
|
-
|
53
|
+
FIELD_VECTORSET = "{vectorset}/extracted_vectors"
|
59
54
|
FIELD_METADATA = "metadata"
|
60
55
|
FIELD_LARGE_METADATA = "large_metadata"
|
61
56
|
THUMBNAIL = "thumbnail"
|
62
57
|
QUESTION_ANSWERS = "question_answers"
|
63
58
|
|
64
59
|
|
65
|
-
|
66
|
-
|
60
|
+
PbType = TypeVar("PbType", bound=Message)
|
61
|
+
|
62
|
+
|
63
|
+
class Field(Generic[PbType]):
|
64
|
+
pbklass: Type[PbType]
|
67
65
|
type: str = "x"
|
68
66
|
value: Optional[Any]
|
69
67
|
extracted_text: Optional[ExtractedText]
|
70
|
-
extracted_vectors: Optional[VectorObject]
|
68
|
+
extracted_vectors: dict[Optional[str], VectorObject]
|
71
69
|
computed_metadata: Optional[FieldComputedMetadata]
|
72
70
|
large_computed_metadata: Optional[LargeComputedMetadata]
|
73
|
-
|
74
|
-
question_answers: Optional[QuestionAnswers]
|
71
|
+
question_answers: Optional[FieldQuestionAnswers]
|
75
72
|
|
76
73
|
def __init__(
|
77
74
|
self,
|
@@ -85,10 +82,9 @@ class Field:
|
|
85
82
|
|
86
83
|
self.value = None
|
87
84
|
self.extracted_text: Optional[ExtractedText] = None
|
88
|
-
self.extracted_vectors =
|
85
|
+
self.extracted_vectors = {}
|
89
86
|
self.computed_metadata = None
|
90
87
|
self.large_computed_metadata = None
|
91
|
-
self.extracted_user_vectors = None
|
92
88
|
self.question_answers = None
|
93
89
|
|
94
90
|
self.id: str = id
|
@@ -121,44 +117,51 @@ class Field:
|
|
121
117
|
return f"{self.uuid}/{self.type}/{self.id}"
|
122
118
|
|
123
119
|
def get_storage_field(self, field_type: FieldTypes) -> StorageField:
|
124
|
-
return self.storage.file_extracted(
|
125
|
-
|
126
|
-
|
120
|
+
return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)
|
121
|
+
|
122
|
+
def _get_extracted_vectors_storage_field(self, vectorset: Optional[str] = None) -> StorageField:
|
123
|
+
if vectorset:
|
124
|
+
key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
|
125
|
+
else:
|
126
|
+
key = FieldTypes.FIELD_VECTORS.value
|
127
|
+
return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
|
127
128
|
|
128
|
-
async def db_get_value(self):
|
129
|
+
async def db_get_value(self) -> Optional[PbType]:
|
129
130
|
if self.value is None:
|
130
|
-
payload = await
|
131
|
-
|
132
|
-
|
133
|
-
|
131
|
+
payload = await datamanagers.fields.get_raw(
|
132
|
+
self.resource.txn,
|
133
|
+
kbid=self.kbid,
|
134
|
+
rid=self.uuid,
|
135
|
+
field_type=self.type,
|
136
|
+
field_id=self.id,
|
134
137
|
)
|
135
138
|
if payload is None:
|
136
|
-
return
|
139
|
+
return None
|
137
140
|
|
138
141
|
self.value = self.pbklass()
|
139
142
|
self.value.ParseFromString(payload)
|
140
143
|
return self.value
|
141
144
|
|
142
145
|
async def db_set_value(self, payload: Any):
|
143
|
-
await
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
146
|
+
await datamanagers.fields.set(
|
147
|
+
self.resource.txn,
|
148
|
+
kbid=self.kbid,
|
149
|
+
rid=self.uuid,
|
150
|
+
field_type=self.type,
|
151
|
+
field_id=self.id,
|
152
|
+
value=payload,
|
148
153
|
)
|
149
154
|
self.value = payload
|
150
155
|
self.resource.modified = True
|
151
156
|
|
152
157
|
async def delete(self):
|
153
|
-
|
154
|
-
|
158
|
+
await datamanagers.fields.delete(
|
159
|
+
self.resource.txn,
|
160
|
+
kbid=self.kbid,
|
161
|
+
rid=self.uuid,
|
162
|
+
field_type=self.type,
|
163
|
+
field_id=self.id,
|
155
164
|
)
|
156
|
-
# Make sure we explicitly delete the field and any nested key
|
157
|
-
keys_to_delete = []
|
158
|
-
async for key in self.resource.txn.keys(field_base_key):
|
159
|
-
keys_to_delete.append(key)
|
160
|
-
for key in keys_to_delete:
|
161
|
-
await self.resource.txn.delete(key)
|
162
165
|
await self.delete_extracted_text()
|
163
166
|
await self.delete_vectors()
|
164
167
|
await self.delete_metadata()
|
@@ -178,9 +181,9 @@ class Field:
|
|
178
181
|
except KeyError:
|
179
182
|
pass
|
180
183
|
|
181
|
-
async def delete_vectors(self) -> None:
|
184
|
+
async def delete_vectors(self, vectorset: Optional[str] = None) -> None:
|
182
185
|
# Try delete vectors
|
183
|
-
sf = self.
|
186
|
+
sf = self._get_extracted_vectors_storage_field(vectorset)
|
184
187
|
try:
|
185
188
|
await self.storage.delete_upload(sf.key, sf.bucket)
|
186
189
|
except KeyError:
|
@@ -194,53 +197,79 @@ class Field:
|
|
194
197
|
pass
|
195
198
|
|
196
199
|
async def get_error(self) -> Optional[Error]:
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
200
|
+
return await datamanagers.fields.get_error(
|
201
|
+
self.resource.txn,
|
202
|
+
kbid=self.kbid,
|
203
|
+
rid=self.uuid,
|
204
|
+
field_type=self.type,
|
205
|
+
field_id=self.id,
|
201
206
|
)
|
202
|
-
if payload is None:
|
203
|
-
return None
|
204
|
-
pberror = Error()
|
205
|
-
pberror.ParseFromString(payload)
|
206
|
-
return pberror
|
207
207
|
|
208
208
|
async def set_error(self, error: Error) -> None:
|
209
|
-
await
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
209
|
+
await datamanagers.fields.set_error(
|
210
|
+
self.resource.txn,
|
211
|
+
kbid=self.kbid,
|
212
|
+
rid=self.uuid,
|
213
|
+
field_type=self.type,
|
214
|
+
field_id=self.id,
|
215
|
+
error=error,
|
214
216
|
)
|
215
217
|
|
216
|
-
async def get_question_answers(self) -> Optional[
|
217
|
-
if self.question_answers is None:
|
218
|
+
async def get_question_answers(self, force=False) -> Optional[FieldQuestionAnswers]:
|
219
|
+
if self.question_answers is None or force:
|
218
220
|
sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
|
219
|
-
|
221
|
+
try:
|
222
|
+
payload = await self.storage.download_pb(sf, FieldQuestionAnswers)
|
223
|
+
except DecodeError:
|
224
|
+
deprecated_payload = await self.storage.download_pb(sf, QuestionAnswers)
|
225
|
+
if deprecated_payload is not None:
|
226
|
+
payload = FieldQuestionAnswers()
|
227
|
+
payload.question_answers.CopyFrom(deprecated_payload)
|
220
228
|
if payload is not None:
|
221
229
|
self.question_answers = payload
|
222
230
|
return self.question_answers
|
223
231
|
|
224
232
|
async def set_question_answers(self, payload: FieldQuestionAnswerWrapper) -> None:
|
233
|
+
if self.type in SUBFIELDFIELDS:
|
234
|
+
try:
|
235
|
+
actual_payload: Optional[FieldQuestionAnswers] = await self.get_question_answers(
|
236
|
+
force=True
|
237
|
+
)
|
238
|
+
except KeyError:
|
239
|
+
actual_payload = None
|
240
|
+
else:
|
241
|
+
actual_payload = None
|
225
242
|
sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
|
226
243
|
|
227
|
-
if
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
244
|
+
if actual_payload is None:
|
245
|
+
# Its first question answer
|
246
|
+
if payload.HasField("file"):
|
247
|
+
await self.storage.normalize_binary(payload.file, sf)
|
248
|
+
else:
|
249
|
+
await self.storage.upload_pb(sf, payload.question_answers)
|
250
|
+
self.question_answers = payload.question_answers
|
233
251
|
else:
|
234
|
-
|
235
|
-
|
236
|
-
|
252
|
+
if payload.HasField("file"):
|
253
|
+
raw_payload = await self.storage.downloadbytescf(payload.file)
|
254
|
+
pb = FieldQuestionAnswers()
|
255
|
+
pb.ParseFromString(raw_payload.read())
|
256
|
+
raw_payload.flush()
|
257
|
+
payload.question_answers.CopyFrom(pb)
|
258
|
+
# We know its payload.question_answers
|
259
|
+
for key, value in payload.question_answers.split_question_answers.items():
|
260
|
+
actual_payload.split_question_answers[key] = value
|
261
|
+
for key in payload.question_answers.deleted_splits:
|
262
|
+
if key in actual_payload.split_question_answers:
|
263
|
+
del actual_payload.split_question_answers[key]
|
264
|
+
if payload.question_answers.HasField("question_answers") != "":
|
265
|
+
actual_payload.question_answers.CopyFrom(payload.question_answers.question_answers)
|
266
|
+
await self.storage.upload_pb(sf, actual_payload)
|
267
|
+
self.question_answers = actual_payload
|
237
268
|
|
238
269
|
async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
|
239
270
|
if self.type in SUBFIELDFIELDS:
|
240
271
|
try:
|
241
|
-
actual_payload: Optional[ExtractedText] = await self.get_extracted_text(
|
242
|
-
force=True
|
243
|
-
)
|
272
|
+
actual_payload: Optional[ExtractedText] = await self.get_extracted_text(force=True)
|
244
273
|
except KeyError:
|
245
274
|
actual_payload = None
|
246
275
|
else:
|
@@ -280,30 +309,21 @@ class Field:
|
|
280
309
|
self.extracted_text = payload
|
281
310
|
return self.extracted_text
|
282
311
|
|
283
|
-
async def
|
284
|
-
|
285
|
-
if await sf.exists() is not None:
|
286
|
-
return sf.build_cf()
|
287
|
-
else:
|
288
|
-
return None
|
289
|
-
|
290
|
-
async def set_vectors(
|
291
|
-
self, payload: ExtractedVectorsWrapper
|
292
|
-
) -> tuple[Optional[VectorObject], bool, list[str]]:
|
312
|
+
async def set_vectors(self, payload: ExtractedVectorsWrapper) -> Optional[VectorObject]:
|
313
|
+
vectorset = payload.vectorset_id or None
|
293
314
|
if self.type in SUBFIELDFIELDS:
|
294
315
|
try:
|
295
316
|
actual_payload: Optional[VectorObject] = await self.get_vectors(
|
296
|
-
|
317
|
+
vectorset=vectorset,
|
318
|
+
force=True,
|
297
319
|
)
|
298
320
|
except KeyError:
|
299
321
|
actual_payload = None
|
300
322
|
else:
|
301
323
|
actual_payload = None
|
302
324
|
|
303
|
-
sf = self.
|
325
|
+
sf = self._get_extracted_vectors_storage_field(vectorset)
|
304
326
|
vo: Optional[VectorObject] = None
|
305
|
-
replace_field: bool = True
|
306
|
-
replace_splits = []
|
307
327
|
if actual_payload is None:
|
308
328
|
# Its first extracted text
|
309
329
|
if payload.HasField("file"):
|
@@ -312,7 +332,7 @@ class Field:
|
|
312
332
|
else:
|
313
333
|
await self.storage.upload_pb(sf, payload.vectors)
|
314
334
|
vo = payload.vectors
|
315
|
-
self.extracted_vectors = payload.vectors
|
335
|
+
self.extracted_vectors[vectorset] = payload.vectors
|
316
336
|
else:
|
317
337
|
if payload.HasField("file"):
|
318
338
|
raw_payload = await self.storage.downloadbytescf(payload.file)
|
@@ -320,88 +340,39 @@ class Field:
|
|
320
340
|
pb.ParseFromString(raw_payload.read())
|
321
341
|
raw_payload.flush()
|
322
342
|
payload.vectors.CopyFrom(pb)
|
323
|
-
vo =
|
343
|
+
vo = actual_payload
|
324
344
|
# We know its payload.body
|
325
345
|
for key, value in payload.vectors.split_vectors.items():
|
326
346
|
actual_payload.split_vectors[key].CopyFrom(value)
|
327
347
|
for key in payload.vectors.deleted_splits:
|
328
348
|
if key in actual_payload.split_vectors:
|
329
|
-
replace_splits.append(key)
|
330
349
|
del actual_payload.split_vectors[key]
|
331
350
|
if len(payload.vectors.vectors.vectors) > 0:
|
332
|
-
replace_field = True
|
333
351
|
actual_payload.vectors.CopyFrom(payload.vectors.vectors)
|
334
352
|
await self.storage.upload_pb(sf, actual_payload)
|
335
|
-
self.extracted_vectors = actual_payload
|
336
|
-
return vo
|
337
|
-
|
338
|
-
async def get_vectors(
|
339
|
-
|
340
|
-
|
353
|
+
self.extracted_vectors[vectorset] = actual_payload
|
354
|
+
return vo
|
355
|
+
|
356
|
+
async def get_vectors(
|
357
|
+
self, vectorset: Optional[str] = None, force: bool = False
|
358
|
+
) -> Optional[VectorObject]:
|
359
|
+
# compat with vectorsets coming from protobuffers where no value is
|
360
|
+
# empty string instead of None. This shouldn't be handled here but we
|
361
|
+
# have to make sure it gets the correct vectorset
|
362
|
+
vectorset = vectorset or None
|
363
|
+
if self.extracted_vectors.get(vectorset, None) is None or force:
|
364
|
+
sf = self._get_extracted_vectors_storage_field(vectorset)
|
341
365
|
payload = await self.storage.download_pb(sf, VectorObject)
|
342
366
|
if payload is not None:
|
343
|
-
self.extracted_vectors = payload
|
344
|
-
return self.extracted_vectors
|
345
|
-
|
346
|
-
async def set_user_vectors(
|
347
|
-
self, user_vectors: UserVectorsWrapper
|
348
|
-
) -> tuple[UserVectorSet, dict[str, UserVectorsList]]:
|
349
|
-
try:
|
350
|
-
actual_payload: Optional[UserVectorSet] = await self.get_user_vectors(
|
351
|
-
force=True
|
352
|
-
)
|
353
|
-
except KeyError:
|
354
|
-
actual_payload = None
|
355
|
-
|
356
|
-
sf = self.get_storage_field(FieldTypes.USER_FIELD_VECTORS)
|
357
|
-
|
358
|
-
vectors_to_delete: dict[str, UserVectorsList] = {}
|
359
|
-
if actual_payload is not None:
|
360
|
-
for vectorset, user_vector in user_vectors.vectors.vectors.items():
|
361
|
-
for key, vector in user_vector.vectors.items():
|
362
|
-
if key in actual_payload.vectors[vectorset].vectors.keys():
|
363
|
-
if vectorset not in vectors_to_delete:
|
364
|
-
vectors_to_delete[vectorset] = UserVectorsList()
|
365
|
-
vectors_to_delete[vectorset].vectors.append(key)
|
366
|
-
actual_payload.vectors[vectorset].vectors[key].CopyFrom(vector)
|
367
|
-
for vectorset, delete_vectors in user_vectors.vectors_to_delete.items():
|
368
|
-
for vector_to_delete in delete_vectors.vectors:
|
369
|
-
if (
|
370
|
-
actual_payload.vectors.get(vectorset).vectors.get(
|
371
|
-
vector_to_delete
|
372
|
-
)
|
373
|
-
is not None
|
374
|
-
):
|
375
|
-
del actual_payload.vectors[vectorset].vectors[vector_to_delete]
|
376
|
-
else:
|
377
|
-
actual_payload = user_vectors.vectors
|
378
|
-
await self.storage.upload_pb(sf, actual_payload)
|
379
|
-
self.extracted_user_vectors = actual_payload
|
380
|
-
return actual_payload, vectors_to_delete
|
381
|
-
|
382
|
-
async def get_user_vectors(self, force=False) -> Optional[UserVectorSet]:
|
383
|
-
if self.extracted_user_vectors is None or force:
|
384
|
-
sf = self.get_storage_field(FieldTypes.USER_FIELD_VECTORS)
|
385
|
-
payload = await self.storage.download_pb(sf, UserVectorSet)
|
386
|
-
if payload is not None:
|
387
|
-
self.extracted_user_vectors = payload
|
388
|
-
return self.extracted_user_vectors
|
389
|
-
|
390
|
-
async def get_vectors_cf(self) -> Optional[CloudFile]:
|
391
|
-
sf = self.get_storage_field(FieldTypes.FIELD_VECTORS)
|
392
|
-
if await sf.exists() is not None:
|
393
|
-
return sf.build_cf()
|
394
|
-
else:
|
395
|
-
return None
|
367
|
+
self.extracted_vectors[vectorset] = payload
|
368
|
+
return self.extracted_vectors.get(vectorset, None)
|
396
369
|
|
397
|
-
async def set_field_metadata(
|
398
|
-
self, payload: FieldComputedMetadataWrapper
|
399
|
-
) -> tuple[FieldComputedMetadata, list[str], dict[str, list[str]]]:
|
370
|
+
async def set_field_metadata(self, payload: FieldComputedMetadataWrapper) -> FieldComputedMetadata:
|
400
371
|
if self.type in SUBFIELDFIELDS:
|
401
372
|
try:
|
402
|
-
actual_payload: Optional[
|
403
|
-
|
404
|
-
|
373
|
+
actual_payload: Optional[FieldComputedMetadata] = await self.get_field_metadata(
|
374
|
+
force=True
|
375
|
+
)
|
405
376
|
except KeyError:
|
406
377
|
actual_payload = None
|
407
378
|
else:
|
@@ -426,8 +397,6 @@ class Field:
|
|
426
397
|
metadata.thumbnail.CopyFrom(cf_split)
|
427
398
|
metadata.last_index.FromDatetime(datetime.now())
|
428
399
|
|
429
|
-
replace_field = []
|
430
|
-
replace_splits = {}
|
431
400
|
if actual_payload is None:
|
432
401
|
# Its first metadata
|
433
402
|
await self.storage.upload_pb(sf, payload.metadata)
|
@@ -438,22 +407,15 @@ class Field:
|
|
438
407
|
actual_payload.split_metadata[key].CopyFrom(value)
|
439
408
|
for key in payload.metadata.deleted_splits:
|
440
409
|
if key in actual_payload.split_metadata:
|
441
|
-
replace_splits[key] = [
|
442
|
-
f"{x.start}-{x.end}"
|
443
|
-
for x in actual_payload.split_metadata[key].paragraphs
|
444
|
-
]
|
445
410
|
del actual_payload.split_metadata[key]
|
446
411
|
if payload.metadata.metadata:
|
447
412
|
actual_payload.metadata.CopyFrom(payload.metadata.metadata)
|
448
|
-
replace_field = [f"{x.start}-{x.end}" for x in metadata.paragraphs]
|
449
413
|
await self.storage.upload_pb(sf, actual_payload)
|
450
414
|
self.computed_metadata = actual_payload
|
451
415
|
|
452
|
-
return self.computed_metadata
|
416
|
+
return self.computed_metadata
|
453
417
|
|
454
|
-
async def get_field_metadata(
|
455
|
-
self, force: bool = False
|
456
|
-
) -> Optional[FieldComputedMetadata]:
|
418
|
+
async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
|
457
419
|
if self.computed_metadata is None or force:
|
458
420
|
sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
|
459
421
|
payload = await self.storage.download_pb(sf, FieldComputedMetadata)
|
@@ -461,19 +423,12 @@ class Field:
|
|
461
423
|
self.computed_metadata = payload
|
462
424
|
return self.computed_metadata
|
463
425
|
|
464
|
-
async def get_field_metadata_cf(self) -> Optional[CloudFile]:
|
465
|
-
sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
|
466
|
-
if await sf.exists() is not None:
|
467
|
-
return sf.build_cf()
|
468
|
-
else:
|
469
|
-
return None
|
470
|
-
|
471
426
|
async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
|
472
427
|
if self.type in SUBFIELDFIELDS:
|
473
428
|
try:
|
474
|
-
actual_payload: Optional[
|
475
|
-
|
476
|
-
|
429
|
+
actual_payload: Optional[LargeComputedMetadata] = await self.get_large_field_metadata(
|
430
|
+
force=True
|
431
|
+
)
|
477
432
|
except KeyError:
|
478
433
|
actual_payload = None
|
479
434
|
else:
|
@@ -508,9 +463,7 @@ class Field:
|
|
508
463
|
|
509
464
|
return self.large_computed_metadata
|
510
465
|
|
511
|
-
async def get_large_field_metadata(
|
512
|
-
self, force: bool = False
|
513
|
-
) -> Optional[LargeComputedMetadata]:
|
466
|
+
async def get_large_field_metadata(self, force: bool = False) -> Optional[LargeComputedMetadata]:
|
514
467
|
if self.large_computed_metadata is None or force:
|
515
468
|
sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
|
516
469
|
payload = await self.storage.download_pb(
|
@@ -521,12 +474,10 @@ class Field:
|
|
521
474
|
self.large_computed_metadata = payload
|
522
475
|
return self.large_computed_metadata
|
523
476
|
|
524
|
-
async def
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
else:
|
529
|
-
return None
|
477
|
+
async def generated_by(self) -> FieldAuthor:
|
478
|
+
author = FieldAuthor()
|
479
|
+
author.user.SetInParent()
|
480
|
+
return author
|
530
481
|
|
531
482
|
def serialize(self):
|
532
483
|
return self.value.SerializeToString()
|
@@ -20,11 +20,9 @@
|
|
20
20
|
import uuid
|
21
21
|
from typing import Any, Optional
|
22
22
|
|
23
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
24
|
-
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
25
|
-
from nucliadb_protos.resources_pb2 import FieldConversation
|
26
|
-
|
27
23
|
from nucliadb.ingest.fields.base import Field
|
24
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
|
25
|
+
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
28
26
|
from nucliadb_utils.storages.storage import StorageField
|
29
27
|
|
30
28
|
PAGE_SIZE = 200
|
@@ -36,7 +34,7 @@ class PageNotFound(Exception):
|
|
36
34
|
pass
|
37
35
|
|
38
36
|
|
39
|
-
class Conversation(Field):
|
37
|
+
class Conversation(Field[PBConversation]):
|
40
38
|
pbklass = PBConversation
|
41
39
|
type: str = "c"
|
42
40
|
value: dict[int, PBConversation]
|
@@ -120,6 +118,21 @@ class Conversation(Field):
|
|
120
118
|
except PageNotFound:
|
121
119
|
return None
|
122
120
|
|
121
|
+
async def get_full_conversation(self) -> Optional[PBConversation]:
|
122
|
+
"""
|
123
|
+
Messages of a conversations may be stored across several pages.
|
124
|
+
This method fetches them all and returns a single complete conversation.
|
125
|
+
"""
|
126
|
+
full_conv = PBConversation()
|
127
|
+
n_page = 1
|
128
|
+
while True:
|
129
|
+
page = await self.get_value(page=n_page)
|
130
|
+
if page is None:
|
131
|
+
break
|
132
|
+
full_conv.messages.extend(page.messages)
|
133
|
+
n_page += 1
|
134
|
+
return full_conv
|
135
|
+
|
123
136
|
async def get_metadata(self) -> FieldConversation:
|
124
137
|
if self.metadata is None:
|
125
138
|
payload = await self.resource.txn.get(
|
@@ -28,7 +28,4 @@ class InvalidPBClass(Exception):
|
|
28
28
|
def __init__(self, source: Type, destination: Type):
|
29
29
|
self.source = source
|
30
30
|
self.destination = destination
|
31
|
-
super().__init__(
|
32
|
-
"Source and destination does not match "
|
33
|
-
f"{self.source} - {self.destination}"
|
34
|
-
)
|
31
|
+
super().__init__("Source and destination does not match " f"{self.source} - {self.destination}")
|
nucliadb/ingest/fields/file.py
CHANGED
@@ -19,15 +19,14 @@
|
|
19
19
|
#
|
20
20
|
from typing import Any, Optional
|
21
21
|
|
22
|
-
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
|
23
|
-
|
24
22
|
from nucliadb.ingest.fields.base import Field
|
23
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
|
25
24
|
from nucliadb_utils.storages.storage import StorageField
|
26
25
|
|
27
26
|
FILE_METADATA = "file_metadata"
|
28
27
|
|
29
28
|
|
30
|
-
class File(Field):
|
29
|
+
class File(Field[FieldFile]):
|
31
30
|
pbklass = FieldFile
|
32
31
|
value: FieldFile
|
33
32
|
type: str = "f"
|
@@ -52,15 +51,13 @@ class File(Field):
|
|
52
51
|
|
53
52
|
is_external_file = payload.file.source == CloudFile.Source.EXTERNAL
|
54
53
|
if not is_external_file:
|
55
|
-
sf: StorageField = self.storage.file_field(
|
56
|
-
self.kbid, self.uuid, self.id, old_cf
|
57
|
-
)
|
54
|
+
sf: StorageField = self.storage.file_field(self.kbid, self.uuid, self.id, old_cf)
|
58
55
|
cf: CloudFile = await self.storage.normalize_binary(payload.file, sf)
|
59
56
|
payload.file.CopyFrom(cf)
|
60
57
|
|
61
58
|
await self.db_set_value(payload)
|
62
59
|
|
63
|
-
async def get_value(self) -> FieldFile:
|
60
|
+
async def get_value(self) -> Optional[FieldFile]:
|
64
61
|
return await self.db_get_value()
|
65
62
|
|
66
63
|
async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
|
@@ -80,17 +77,13 @@ class File(Field):
|
|
80
77
|
cf_file_page_preview: CloudFile = await self.storage.normalize_binary(
|
81
78
|
preview, sf_file_page_preview
|
82
79
|
)
|
83
|
-
file_extracted_data.file_pages_previews.pages[page].CopyFrom(
|
84
|
-
cf_file_page_preview
|
85
|
-
)
|
80
|
+
file_extracted_data.file_pages_previews.pages[page].CopyFrom(cf_file_page_preview)
|
86
81
|
|
87
82
|
for fileid, origincf in file_extracted_data.file_generated.items():
|
88
83
|
sf_generated: StorageField = self.storage.file_extracted(
|
89
84
|
self.kbid, self.uuid, self.type, self.id, f"generated/{fileid}"
|
90
85
|
)
|
91
|
-
cf_generated: CloudFile = await self.storage.normalize_binary(
|
92
|
-
origincf, sf_generated
|
93
|
-
)
|
86
|
+
cf_generated: CloudFile = await self.storage.normalize_binary(origincf, sf_generated)
|
94
87
|
file_extracted_data.file_generated[fileid].CopyFrom(cf_generated)
|
95
88
|
|
96
89
|
if file_extracted_data.HasField("file_thumbnail"):
|
@@ -113,16 +106,5 @@ class File(Field):
|
|
113
106
|
sf: StorageField = self.storage.file_extracted(
|
114
107
|
self.kbid, self.uuid, self.type, self.id, FILE_METADATA
|
115
108
|
)
|
116
|
-
self.file_extracted_data = await self.storage.download_pb(
|
117
|
-
sf, FileExtractedData
|
118
|
-
)
|
109
|
+
self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
|
119
110
|
return self.file_extracted_data
|
120
|
-
|
121
|
-
async def get_file_extracted_data_cf(self) -> Optional[CloudFile]:
|
122
|
-
sf: StorageField = self.storage.file_extracted(
|
123
|
-
self.kbid, self.uuid, self.type, self.id, FILE_METADATA
|
124
|
-
)
|
125
|
-
if await sf.exists() is not None:
|
126
|
-
return sf.build_cf()
|
127
|
-
else:
|
128
|
-
return None
|