nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
@@ -23,19 +23,33 @@ import asyncio
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from typing import TYPE_CHECKING, Any, AsyncIterator, Optional, Type
+from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
 
+from nucliadb.common import datamanagers
+from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
+from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.conversation import Conversation
+from nucliadb.ingest.fields.file import File
+from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
+from nucliadb.ingest.fields.link import Link
+from nucliadb.ingest.fields.text import Text
+from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
+from nucliadb.ingest.orm.metrics import processor_observer
+from nucliadb_models import content_types
+from nucliadb_models.common import CloudLink
+from nucliadb_models.content_types import GENERIC_MIME_TYPE
+from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
-from nucliadb_protos.resources_pb2 import Basic
-from nucliadb_protos.resources_pb2 import Basic as PBBasic
-from nucliadb_protos.resources_pb2 import CloudFile
-from nucliadb_protos.resources_pb2 import Conversation as PBConversation
-from nucliadb_protos.resources_pb2 import Extra as PBExtra
 from nucliadb_protos.resources_pb2 import (
+    Basic,
+    CloudFile,
     ExtractedTextWrapper,
     ExtractedVectorsWrapper,
     FieldClassifications,
     FieldComputedMetadataWrapper,
+    FieldFile,
     FieldID,
     FieldMetadata,
     FieldQuestionAnswerWrapper,
@@ -44,41 +58,27 @@ from nucliadb_protos.resources_pb2 import (
     FileExtractedData,
     LargeComputedMetadataWrapper,
     LinkExtractedData,
+    Metadata,
+    Paragraph,
+    ParagraphAnnotation,
 )
-from nucliadb_protos.resources_pb2 import
+from nucliadb_protos.resources_pb2 import Basic as PBBasic
+from nucliadb_protos.resources_pb2 import Conversation as PBConversation
+from nucliadb_protos.resources_pb2 import Extra as PBExtra
 from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
 from nucliadb_protos.resources_pb2 import Origin as PBOrigin
-from nucliadb_protos.resources_pb2 import Paragraph, ParagraphAnnotation
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
-from nucliadb_protos.resources_pb2 import UserVectorsWrapper
-from nucliadb_protos.train_pb2 import EnabledMetadata
-from nucliadb_protos.train_pb2 import Position as TrainPosition
 from nucliadb_protos.train_pb2 import (
+    EnabledMetadata,
     TrainField,
     TrainMetadata,
     TrainParagraph,
     TrainResource,
     TrainSentence,
 )
+from nucliadb_protos.train_pb2 import Position as TrainPosition
 from nucliadb_protos.utils_pb2 import Relation as PBRelation
 from nucliadb_protos.writer_pb2 import BrokerMessage
-
-from nucliadb.common.maindb.driver import Transaction
-from nucliadb.ingest.fields.base import Field
-from nucliadb.ingest.fields.conversation import Conversation
-from nucliadb.ingest.fields.date import Datetime
-from nucliadb.ingest.fields.file import File
-from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
-from nucliadb.ingest.fields.keywordset import Keywordset
-from nucliadb.ingest.fields.layout import Layout
-from nucliadb.ingest.fields.link import Link
-from nucliadb.ingest.fields.text import Text
-from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
-from nucliadb.ingest.orm.metrics import processor_observer
-from nucliadb.ingest.orm.utils import get_basic, set_basic
-from nucliadb_models.common import CloudLink
-from nucliadb_models.writer import GENERIC_MIME_TYPE
-from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_utils.storages.storage import Storage
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -86,41 +86,14 @@ if TYPE_CHECKING: # pragma: no cover
 
 logger = logging.getLogger(__name__)
 
-KB_RESOURCE_ORIGIN = "/kbs/{kbid}/r/{uuid}/origin"
-KB_RESOURCE_EXTRA = "/kbs/{kbid}/r/{uuid}/extra"
-KB_RESOURCE_SECURITY = "/kbs/{kbid}/r/{uuid}/security"
-KB_RESOURCE_METADATA = "/kbs/{kbid}/r/{uuid}/metadata"
-KB_RESOURCE_RELATIONS = "/kbs/{kbid}/r/{uuid}/relations"
-KB_RESOURCE_FIELDS = "/kbs/{kbid}/r/{uuid}/f/"
-KB_RESOURCE_ALL_FIELDS = "/kbs/{kbid}/r/{uuid}/allfields"
-KB_RESOURCE_SLUG_BASE = "/kbs/{kbid}/s/"
-KB_RESOURCE_SLUG = f"{KB_RESOURCE_SLUG_BASE}{{slug}}"
-KB_RESOURCE_CONVERSATION = "/kbs/{kbid}/r/{uuid}/c/{page}"
-GLOBAL_FIELD = "a"
 KB_FIELDS: dict[int, Type] = {
-    FieldType.LAYOUT: Layout,
     FieldType.TEXT: Text,
     FieldType.FILE: File,
     FieldType.LINK: Link,
-    FieldType.DATETIME: Datetime,
-    FieldType.KEYWORDSET: Keywordset,
     FieldType.GENERIC: Generic,
     FieldType.CONVERSATION: Conversation,
 }
 
-KB_REVERSE: dict[str, FieldType.ValueType] = {
-    "l": FieldType.LAYOUT,
-    "t": FieldType.TEXT,
-    "f": FieldType.FILE,
-    "u": FieldType.LINK,
-    "d": FieldType.DATETIME,
-    "k": FieldType.KEYWORDSET,
-    "a": FieldType.GENERIC,
-    "c": FieldType.CONVERSATION,
-}
-
-FIELD_TYPE_TO_ID = {v: k for k, v in KB_REVERSE.items()}
-
 _executor = ThreadPoolExecutor(10)
 
 
@@ -131,6 +104,8 @@ PB_TEXT_FORMAT_TO_MIMETYPE = {
     FieldText.Format.MARKDOWN: "text/markdown",
     FieldText.Format.JSON: "application/json",
     FieldText.Format.KEEP_MARKDOWN: "text/markdown",
+    FieldText.Format.JSONL: "application/x-ndjson",
+    FieldText.Format.PLAIN_BLANKLINE_SPLIT: "text/plain+blankline",
 }
 
 BASIC_IMMUTABLE_FIELDS = ("icon",)
@@ -179,32 +154,11 @@ class Resource:
         new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
         await self.txn.set(new_key, self.uuid.encode())
 
-    @staticmethod
-    def parse_basic(payload: bytes) -> PBBasic:
-        pb = PBBasic()
-        if payload is None:
-            return None
-
-        pb.ParseFromString(payload)
-        return pb
-
-    async def exists(self) -> bool:
-        exists = True
-        if self.basic is None:
-            payload = await get_basic(self.txn, self.kb.kbid, self.uuid)
-            if payload is not None:
-                pb = PBBasic()
-                pb.ParseFromString(payload)
-                self.basic = pb
-            else:
-                exists = False
-        return exists
-
     # Basic
     async def get_basic(self) -> Optional[PBBasic]:
         if self.basic is None:
-
-            self.basic =
+            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            self.basic = basic if basic is not None else PBBasic()
         return self.basic
 
     def set_processing_status(self, current_basic: PBBasic, basic_in_payload: PBBasic):
@@ -249,9 +203,7 @@ class Resource:
             fields.append(field_id)
             positions[field_id] = i
 
-        updated = [
-            self.basic.fieldmetadata[positions[field]] for field in fields
-        ]
+        updated = [self.basic.fieldmetadata[positions[field]] for field in fields]
 
         del self.basic.fieldmetadata[:]
         self.basic.fieldmetadata.extend(updated)
|
                 self.indexer.apply_field_metadata(
                     field_id,
                     field_metadata,
-                    replace_field=[],
-                    replace_splits={},
                     page_positions=page_positions,
                     extracted_text=await field_obj.get_extracted_text(),
                     basic_user_field_metadata=user_field_metadata,
+                    replace_field=True,
                 )
 
         # Some basic fields are computed off field metadata.
|
         if deleted_fields is not None and len(deleted_fields) > 0:
             remove_field_classifications(self.basic, deleted_fields=deleted_fields)
 
-        await set_basic(
+        await datamanagers.resources.set_basic(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, basic=self.basic
+        )
         self.modified = True
 
     # Origin
     async def get_origin(self) -> Optional[PBOrigin]:
         if self.origin is None:
-
-
-                KB_RESOURCE_ORIGIN.format(kbid=self.kb.kbid, uuid=self.uuid)
-            )
-            if payload is None:
-                return None
-
-            pb.ParseFromString(payload)
-            self.origin = pb
+            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            self.origin = origin
         return self.origin
 
     async def set_origin(self, payload: PBOrigin):
-        await
-
-            payload.SerializeToString(),
+        await datamanagers.resources.set_origin(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, origin=payload
         )
         self.modified = True
         self.origin = payload
|
     # Extra
     async def get_extra(self) -> Optional[PBExtra]:
         if self.extra is None:
-
-
-                KB_RESOURCE_EXTRA.format(kbid=self.kb.kbid, uuid=self.uuid)
-            )
-            if payload is None:
-                return None
-            pb.ParseFromString(payload)
-            self.extra = pb
+            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            self.extra = extra
         return self.extra
 
     async def set_extra(self, payload: PBExtra):
-
-        await self.txn.set(
-            key,
-            payload.SerializeToString(),
-        )
+        await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
         self.modified = True
         self.extra = payload
 
     # Security
     async def get_security(self) -> Optional[utils_pb2.Security]:
         if self.security is None:
-
-
-
-
-                return None
-            pb.ParseFromString(payload)
-            self.security = pb
+            security = await datamanagers.resources.get_security(
+                self.txn, kbid=self.kb.kbid, rid=self.uuid
+            )
+            self.security = security
         return self.security
 
     async def set_security(self, payload: utils_pb2.Security) -> None:
-
-
-            key,
-            payload.SerializeToString(),
+        await datamanagers.resources.set_security(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, security=payload
         )
         self.modified = True
         self.security = payload
@@ -355,29 +285,24 @@ class Resource:
     # Relations
     async def get_relations(self) -> Optional[PBRelations]:
         if self.relations is None:
-
-
-                KB_RESOURCE_RELATIONS.format(kbid=self.kb.kbid, uuid=self.uuid)
+            relations = await datamanagers.resources.get_relations(
+                self.txn, kbid=self.kb.kbid, rid=self.uuid
             )
-
-                return None
-            pb.ParseFromString(payload)
-            self.relations = pb
+            self.relations = relations
         return self.relations
 
     async def set_relations(self, payload: list[PBRelation]):
         relations = PBRelations()
         for relation in payload:
             relations.relations.append(relation)
-        await
-
-            relations.SerializeToString(),
+        await datamanagers.resources.set_relations(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, relations=relations
        )
         self.modified = True
         self.relations = relations
 
     @processor_observer.wrap({"type": "generate_index_message"})
-    async def generate_index_message(self) -> ResourceBrain:
+    async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
         brain = ResourceBrain(rid=self.uuid)
         origin = await self.get_origin()
         basic = await self.get_basic()
|
         await self.compute_global_tags(brain)
         fields = await self.get_fields(force=True)
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             await self.compute_global_text_field(fieldid, brain)
 
             field_metadata = await field.get_field_metadata()
@@ -403,251 +328,66 @@ class Resource:
                     (
                         fm
                         for fm in basic.fieldmetadata
-                        if fm.field.field == field_id
-                        and fm.field.field_type == type_id
+                        if fm.field.field == field_id and fm.field.field_type == type_id
                     ),
                     None,
                 )
                 brain.apply_field_metadata(
                     field_key,
                     field_metadata,
-                    replace_field=[],
-                    replace_splits={},
                     page_positions=page_positions,
                     extracted_text=await field.get_extracted_text(),
                     basic_user_field_metadata=user_field_metadata,
+                    replace_field=reindex,
                 )
 
             if self.disable_vectors is False:
+                # XXX: while we don't remove the "default" vectorset concept, we
+                # need to do use None as the default one
                 vo = await field.get_vectors()
                 if vo is not None:
-
+                    async with datamanagers.with_ro_transaction() as ro_txn:
+                        dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
+                            ro_txn, kbid=self.kb.kbid
+                        )
+                    brain.apply_field_vectors(
+                        field_key,
+                        vo,
+                        matryoshka_vector_dimension=dimension,
+                        replace_field=reindex,
+                    )
 
-
-
-
+                vectorset_configs = []
+                async with datamanagers.with_ro_transaction() as ro_txn:
+                    async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
+                        ro_txn, kbid=self.kb.kbid
+                    ):
+                        vectorset_configs.append(vectorset_config)
+                for vectorset_config in vectorset_configs:
+                    vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
+                    if vo is not None:
+                        dimension = vectorset_config.vectorset_index_config.vector_dimension
+                        brain.apply_field_vectors(
+                            field_key,
+                            vo,
+                            vectorset=vectorset_config.vectorset_id,
+                            matryoshka_vector_dimension=dimension,
+                            replace_field=reindex,
+                        )
         return brain
 
-    async def generate_field_vectors(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        vo = await field.get_vectors()
-        if vo is None:
-            return
-        evw = ExtractedVectorsWrapper()
-        evw.field.field = field_id
-        evw.field.field_type = type_id  # type: ignore
-        evw.vectors.CopyFrom(vo)
-        bm.field_vectors.append(evw)
-
-    async def generate_user_vectors(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        uv = await field.get_user_vectors()
-        if uv is None:
-            return
-        uvw = UserVectorsWrapper()
-        uvw.field.field = field_id
-        uvw.field.field_type = type_id  # type: ignore
-        uvw.vectors.CopyFrom(uv)
-        bm.user_vectors.append(uvw)
-
-    async def generate_field_large_computed_metadata(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        lcm = await field.get_large_field_metadata()
-        if lcm is None:
-            return
-        lcmw = LargeComputedMetadataWrapper()
-        lcmw.field.field = field_id
-        lcmw.field.field_type = type_id  # type: ignore
-        lcmw.real.CopyFrom(lcm)
-        bm.field_large_metadata.append(lcmw)
-
-    async def generate_field_computed_metadata(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        fcmw = FieldComputedMetadataWrapper()
-        fcmw.field.field = field_id
-        fcmw.field.field_type = type_id  # type: ignore
-
-        field_metadata = await field.get_field_metadata()
-        if field_metadata is not None:
-            fcmw.metadata.CopyFrom(field_metadata)
-            fcmw.field.field = field_id
-            fcmw.field.field_type = type_id  # type: ignore
-            bm.field_metadata.append(fcmw)
-            # Make sure cloud files are removed for exporting
-
-    async def generate_extracted_text(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        etw = ExtractedTextWrapper()
-        etw.field.field = field_id
-        etw.field.field_type = type_id  # type: ignore
-        extracted_text = await field.get_extracted_text()
-        if extracted_text is not None:
-            etw.body.CopyFrom(extracted_text)
-            bm.extracted_text.append(etw)
-
-    async def generate_field(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        # Used for exporting a field
-        if type_id == FieldType.TEXT:
-            value = await field.get_value()
-            bm.texts[field_id].CopyFrom(value)
-        elif type_id == FieldType.LINK:
-            value = await field.get_value()
-            bm.links[field_id].CopyFrom(value)
-        elif type_id == FieldType.FILE:
-            value = await field.get_value()
-            bm.files[field_id].CopyFrom(value)
-        elif type_id == FieldType.CONVERSATION:
-            value = await self.get_full_conversation(field)  # type: ignore
-            bm.conversations[field_id].CopyFrom(value)
-        elif type_id == FieldType.KEYWORDSET:
-            value = await field.get_value()
-            bm.keywordsets[field_id].CopyFrom(value)
-        elif type_id == FieldType.DATETIME:
-            value = await field.get_value()
-            bm.datetimes[field_id].CopyFrom(value)
-        elif type_id == FieldType.LAYOUT:
-            value = await field.get_value()
-            bm.layouts[field_id].CopyFrom(value)
-
-    async def get_full_conversation(
-        self,
-        conversation_field: Conversation,
-    ) -> Optional[PBConversation]:
-        """
-        Messages of a conversations may be stored across several pages.
-        This method fetches them all and returns a single complete conversation.
-        """
-        full_conv = PBConversation()
-        n_page = 1
-        while True:
-            page = await conversation_field.get_value(page=n_page)
-            if page is None:
-                break
-            full_conv.messages.extend(page.messages)
-            n_page += 1
-        return full_conv
-
-    async def generate_broker_message(self) -> BrokerMessage:
-        # full means downloading all the pointers
-        # minuts the ones to external files that are not PB
-        # Go for all fields and recreate brain
-        bm = BrokerMessage()
-        bm.kbid = self.kb.kbid
-        bm.uuid = self.uuid
-        basic = await self.get_basic()
-        if basic is not None:
-            bm.basic.CopyFrom(basic)
-        bm.slug = bm.basic.slug
-        origin = await self.get_origin()
-        if origin is not None:
-            bm.origin.CopyFrom(origin)
-        relations = await self.get_relations()
-        if relations is not None:
-            for relation in relations.relations:
-                bm.relations.append(relation)
-
-        fields = await self.get_fields(force=True)
-        for (type_id, field_id), field in fields.items():
-            # Value
-            await self.generate_field(bm, type_id, field_id, field)
-
-            # Extracted text
-            await self.generate_extracted_text(bm, type_id, field_id, field)
-
-            # Field Computed Metadata
-            await self.generate_field_computed_metadata(bm, type_id, field_id, field)
-
-            if type_id == FieldType.FILE and isinstance(field, File):
-                field_extracted_data = await field.get_file_extracted_data()
-                if field_extracted_data is not None:
-                    bm.file_extracted_data.append(field_extracted_data)
-
-            elif type_id == FieldType.LINK and isinstance(field, Link):
-                link_extracted_data = await field.get_link_extracted_data()
-                if link_extracted_data is not None:
-                    bm.link_extracted_data.append(link_extracted_data)
-
-            # Field vectors
-            await self.generate_field_vectors(bm, type_id, field_id, field)
-
-            # User vectors
-            await self.generate_user_vectors(bm, type_id, field_id, field)
-
-            # Large metadata
-            await self.generate_field_large_computed_metadata(
-                bm, type_id, field_id, field
-            )
-
-        return bm
-
     # Fields
-    async def get_fields(
-        self, force: bool = False
-    ) -> dict[tuple[FieldType.ValueType, str], Field]:
+    async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
         # Get all fields
         for type, field in await self.get_fields_ids(force=force):
             if (type, field) not in self.fields:
                 self.fields[(type, field)] = await self.get_field(field, type)
         return self.fields
 
-    async def _deprecated_scan_fields_ids(
-        self,
-    ) -> AsyncIterator[tuple[FieldType.ValueType, str]]:
-        logger.warning("Scanning fields ids. This is not optimal.")
-        prefix = KB_RESOURCE_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
-        allfields = set()
-        async for key in self.txn.keys(prefix, count=-1):
-            # The [6:8] slicing purpose is to match exactly the two
-            # splitted parts corresponding to type and field, and nothing else!
-            type, field = key.split("/")[6:8]
-            type_id = KB_REVERSE.get(type)
-            if type_id is None:
-                raise AttributeError("Invalid field type")
-            result = (type_id, field)
-            if result not in allfields:
-                # fields can have errors that are stored in a subkey:
-                # - field key -> kbs/kbid/r/ruuid/f/myfield
-                # - field error key -> kbs/kbid/r/ruuid/f/myfield/errors
-                # and that would return duplicates here.
-                yield result
-                allfields.add(result)
-
     async def _inner_get_fields_ids(self) -> list[tuple[FieldType.ValueType, str]]:
         # Use a set to make sure we don't have duplicate field ids
         result = set()
-        all_fields = await self.get_all_field_ids()
+        all_fields = await self.get_all_field_ids(for_update=False)
         if all_fields is not None:
             for f in all_fields.fields:
                 result.add((f.field_type, f.field))
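
Editor's note: `generate_index_message` gains a `reindex` flag and a second pass over every configured vectorset. The control flow added in the hunk above, condensed into a standalone sketch with a stubbed config type (an assumption for illustration; in the real code the default pass also resolves a matryoshka dimension, and configs come from `datamanagers.vectorsets.iter`):

```python
# Condensed sketch of the per-vectorset indexing pass (stubbed types; not the
# actual nucliadb code). Each configured vectorset contributes its own vectors
# and its own dimension to the index message being built.
from dataclasses import dataclass


@dataclass
class VectorsetConfig:
    vectorset_id: str
    vector_dimension: int


async def apply_all_vectorsets(field, brain, field_key: str,
                               configs: list[VectorsetConfig], reindex: bool) -> None:
    # Legacy "default" vectorset: vectors stored without a vectorset id.
    vo = await field.get_vectors()
    if vo is not None:
        brain.apply_field_vectors(field_key, vo, replace_field=reindex)

    # One additional pass per configured vectorset.
    for config in configs:
        vo = await field.get_vectors(vectorset=config.vectorset_id)
        if vo is not None:
            brain.apply_field_vectors(
                field_key,
                vo,
                vectorset=config.vectorset_id,
                matryoshka_vector_dimension=config.vector_dimension,
                replace_field=reindex,  # replace only when reindexing an existing field
            )
```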
@@ -664,9 +404,7 @@ class Resource:
             result.add((FieldType.GENERIC, generic))
         return list(result)
 
-    async def get_fields_ids(
-        self, force: bool = False
-    ) -> list[tuple[FieldType.ValueType, str]]:
+    async def get_fields_ids(self, force: bool = False) -> list[tuple[FieldType.ValueType, str]]:
         """
         Get all ids of the fields of the resource and cache them.
         """
@@ -710,32 +448,26 @@ class Resource:
         if field in self.all_fields_keys:
             self.all_fields_keys.remove(field)
 
-        field_key = self.generate_field_id(FieldID(field_type=type, field=key))
-        vo = await field_obj.get_vectors()
-        if vo is not None:
-            self.indexer.delete_vectors(field_key=field_key, vo=vo)
+        field_key = self.generate_field_id(FieldID(field_type=type, field=key))
 
         metadata = await field_obj.get_field_metadata()
         if metadata is not None:
-            self.indexer.
+            self.indexer.delete_field(field_key=field_key)
 
         await field_obj.delete()
 
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
         return (type, field) in self.fields
 
-    async def get_all_field_ids(self) -> Optional[PBAllFieldIDs]:
-
-
-
-            return None
-        all_fields = PBAllFieldIDs()
-        all_fields.ParseFromString(payload)
-        return all_fields
+    async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
+        return await datamanagers.resources.get_all_field_ids(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
+        )
 
     async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
-
-
+        return await datamanagers.resources.set_all_field_ids(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, allfields=all_fields
+        )
 
     async def update_all_field_ids(
         self,
@@ -745,7 +477,7 @@ class Resource:
         errors: Optional[list[writer_pb2.Error]] = None,
     ):
         needs_update = False
-        all_fields = await self.get_all_field_ids()
+        all_fields = await self.get_all_field_ids(for_update=True)
         if all_fields is None:
             needs_update = True
             all_fields = PBAllFieldIDs()
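
Editor's note: the new keyword-only `for_update` flag (threaded through `get_all_field_ids` above) distinguishes plain reads from reads that precede a write. Presumably it lets a transactional backend lock the row on read; the mapping below is a hypothetical illustration assuming a PostgreSQL-backed key-value maindb, with a made-up table name and driver choice (the actual nucliadb driver is not shown in this diff):

```python
# Hypothetical illustration of a for_update read on a PostgreSQL key-value
# store: lock the row so the read-modify-write in update_all_field_ids()
# serializes against concurrent writers instead of clobbering them.
from typing import Optional

import asyncpg


async def get_key(conn: asyncpg.Connection, key: str, for_update: bool = False) -> Optional[bytes]:
    query = "SELECT value FROM maindb WHERE key = $1"  # table name is made up
    if for_update:
        query += " FOR UPDATE"  # row lock held until the transaction ends
    row = await conn.fetchrow(query, key)
    return row["value"] if row is not None else None
```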
@@ -772,26 +504,12 @@ class Resource:
     @processor_observer.wrap({"type": "apply_fields"})
     async def apply_fields(self, message: BrokerMessage):
         message_updated_fields = []
-        for field, layout in message.layouts.items():
-            fid = FieldID(field_type=FieldType.LAYOUT, field=field)
-            await self.set_field(fid.field_type, fid.field, layout)
-            message_updated_fields.append(fid)
 
         for field, text in message.texts.items():
             fid = FieldID(field_type=FieldType.TEXT, field=field)
             await self.set_field(fid.field_type, fid.field, text)
             message_updated_fields.append(fid)
 
-        for field, keywordset in message.keywordsets.items():
-            fid = FieldID(field_type=FieldType.KEYWORDSET, field=field)
-            await self.set_field(fid.field_type, fid.field, keywordset)
-            message_updated_fields.append(fid)
-
-        for field, datetimeobj in message.datetimes.items():
-            fid = FieldID(field_type=FieldType.DATETIME, field=field)
-            await self.set_field(fid.field_type, fid.field, datetimeobj)
-            message_updated_fields.append(fid)
-
         for field, link in message.links.items():
             fid = FieldID(field_type=FieldType.LINK, field=field)
             await self.set_field(fid.field_type, fid.field, link)
@@ -810,13 +528,11 @@ class Resource:
         for fieldid in message.delete_fields:
             await self.delete_field(fieldid.field_type, fieldid.field)
 
-        if (
-            len(message_updated_fields)
-            or len(message.delete_fields)
-            or len(message.errors)
-        ):
+        if len(message_updated_fields) or len(message.delete_fields) or len(message.errors):
             await self.update_all_field_ids(
-                updated=message_updated_fields,
+                updated=message_updated_fields,
+                deleted=message.delete_fields,  # type: ignore
+                errors=message.errors,  # type: ignore
             )
 
     @processor_observer.wrap({"type": "apply_extracted"})
@@ -852,13 +568,15 @@ class Resource:
 
         for link_extracted_data in message.link_extracted_data:
             await self._apply_link_extracted_data(link_extracted_data)
-            await self.
+            await self.maybe_update_resource_title_from_link(link_extracted_data)
             extracted_languages.append(link_extracted_data.language)
 
         for file_extracted_data in message.file_extracted_data:
             await self._apply_file_extracted_data(file_extracted_data)
             extracted_languages.append(file_extracted_data.language)
 
+        await self.maybe_update_resource_title_from_file_extracted_data(message)
+
         # Metadata should go first
         for field_metadata in message.field_metadata:
             await self._apply_field_computed_metadata(field_metadata)
@@ -869,10 +587,9 @@ class Resource:
         # Upload to binary storage
         # Vector indexing
         if self.disable_vectors is False:
+            await self.get_fields(force=True)
             for field_vectors in message.field_vectors:
                 await self._apply_extracted_vectors(field_vectors)
-            for user_vectors in message.user_vectors:
-                await self._apply_user_vectors(user_vectors)
 
         # Only uploading to binary storage
         for field_large_metadata in message.field_large_metadata:
@@ -896,9 +613,7 @@ class Resource:
             extracted_text.field,
         )
 
-    async def _apply_question_answers(
-        self, question_answers: FieldQuestionAnswerWrapper
-    ):
+    async def _apply_question_answers(self, question_answers: FieldQuestionAnswerWrapper):
         field = question_answers.field
         field_obj = await self.get_field(field.field, field.field_type, load=False)
         await field_obj.set_question_answers(question_answers)
@@ -918,19 +633,27 @@ class Resource:
 
         maybe_update_basic_summary(self.basic, link_extracted_data.description)
 
-    async def
+    async def maybe_update_resource_title_from_link(self, link_extracted_data: LinkExtractedData):
+        """
+        When parsing link extracted data, we want to replace the resource title for the first link
+        that gets processed and has a title, and only if the current title is a URL, which we take
+        as a hint that the title was not set by the user.
+        """
         assert self.basic is not None
         if not link_extracted_data.title:
             return
         if not (self.basic.title.startswith("http") or self.basic.title == ""):
             return
-
         title = link_extracted_data.title
-        self.
+        await self.update_resource_title(title)
+
+    async def update_resource_title(self, computed_title: str) -> None:
+        assert self.basic is not None
+        self.basic.title = computed_title
         # Extracted text
         field = await self.get_field("title", FieldType.GENERIC, load=False)
         etw = ExtractedTextWrapper()
-        etw.body.text =
+        etw.body.text = computed_title
         await field.set_extracted_text(etw)
 
         # Field computed metadata
@@ -942,11 +665,8 @@ class Resource:
         fcm = await field.get_field_metadata(force=True)
         if fcm is not None:
             fcmw.metadata.CopyFrom(fcm)
-
         fcmw.metadata.metadata.ClearField("paragraphs")
-        paragraph = Paragraph(
-            start=0, end=len(title), kind=Paragraph.TypeParagraph.TITLE
-        )
+        paragraph = Paragraph(start=0, end=len(computed_title), kind=Paragraph.TypeParagraph.TITLE)
         fcmw.metadata.metadata.paragraphs.append(paragraph)
 
         await field.set_field_metadata(fcmw)
@@ -963,9 +683,54 @@ class Resource:
         maybe_update_basic_icon(self.basic, file_extracted_data.icon)
         maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)

-    async def _apply_field_computed_metadata(
-        self, field_metadata: FieldComputedMetadataWrapper
-    ):
+    async def _should_update_resource_title_from_file_metadata(self) -> bool:
+        """
+        We only want to update resource title from file metadata if the title is empty,
+        equal to the resource uuid or equal to any of the file filenames in the resource.
+        """
+        basic = await self.get_basic()
+        if basic is None:
+            return True
+        current_title = basic.title
+        if current_title == "":
+            # If the title is empty, we should update it
+            return True
+        if current_title == self.uuid:
+            # If the title is the same as the resource uuid, we should update it
+            return True
+        fields = await self.get_fields(force=True)
+        filenames = set()
+        for (field_type, _), field_obj in fields.items():
+            if field_type == FieldType.FILE:
+                field_value: Optional[FieldFile] = await field_obj.get_value()
+                if field_value is not None:
+                    if field_value.file.filename not in ("", None):
+                        filenames.add(field_value.file.filename)
+        if current_title in filenames:
+            # If the title is equal to any of the file filenames, we should update it
+            return True
+        return False
+
+    async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
+        """
+        Update the resource title with the first file that has a title extracted.
+        """
+        if not await self._should_update_resource_title_from_file_metadata():
+            return
+        for fed in message.file_extracted_data:
+            if fed.title == "":
+                # Skip if the extracted title is empty
+                continue
+            fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
+            logger.info(
+                "Updating resource title from file extracted data",
+                extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+            )
+            await self.update_resource_title(fed.title)
+            # Break after the first file with a title is found
+            break
+
+    async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
         assert self.basic is not None
         maybe_update_basic_summary(self.basic, field_metadata.metadata.metadata.summary)

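Note: `_should_update_resource_title_from_file_metadata` treats three title states as safe to overwrite: empty, equal to the resource uuid, or equal to one of the uploaded filenames. The same decision as a pure function (illustrative names, assuming the filename set has already been collected):

    def title_is_placeholder(current_title: str, resource_uuid: str, filenames: set[str]) -> bool:
        # True when the title was never set by a user and may be overwritten.
        return (
            current_title == ""
            or current_title == resource_uuid
            or current_title in filenames
        )

    assert title_is_placeholder("", "abc123", set())
    assert title_is_placeholder("abc123", "abc123", set())
    assert title_is_placeholder("report.pdf", "abc123", {"report.pdf"})
    assert not title_is_placeholder("Q3 results", "abc123", {"report.pdf"})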
@@ -974,17 +739,11 @@ class Resource:
             field_metadata.field.field_type,
             load=False,
         )
-        (
-            metadata,
-            replace_field,
-            replace_splits,
-        ) = await field_obj.set_field_metadata(field_metadata)
+        metadata = await field_obj.set_field_metadata(field_metadata)
         field_key = self.generate_field_id(field_metadata.field)

         page_positions: Optional[FilePagePositions] = None
-        if field_metadata.field.field_type == FieldType.FILE and isinstance(
-            field_obj, File
-        ):
+        if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
             page_positions = await get_file_page_positions(field_obj)

         user_field_metadata = next(
@@ -1002,29 +761,24 @@ class Resource:
             self.indexer.apply_field_metadata,
             field_key,
             metadata,
-            replace_field=replace_field,
-            replace_splits=replace_splits,
             page_positions=page_positions,
             extracted_text=extracted_text,
             basic_user_field_metadata=user_field_metadata,
+            replace_field=True,
         )
         loop = asyncio.get_running_loop()
         await loop.run_in_executor(_executor, apply_field_metadata)

-        maybe_update_basic_thumbnail(
-            self.basic, field_metadata.metadata.metadata.thumbnail
-        )
+        maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)

         add_field_classifications(self.basic, field_metadata)

     async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
-        if not self.has_field(
-            field_vectors.field.field_type, field_vectors.field.field
-        ):
+        # Store vectors in the resource
+
+        if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
             # skipping because field does not exist
-            logger.warning(
-                f'Field "{field_vectors.field.field}" does not exist, skipping vectors'
-            )
+            logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
             return

         field_obj = await self.get_field(
@@ -1032,49 +786,44 @@ class Resource:
             field_vectors.field.field_type,
             load=False,
         )
-        (
-            vo,
-            replace_field,
-            replace_splits,
-        ) = await field_obj.set_vectors(field_vectors)
+        vo = await field_obj.set_vectors(field_vectors)
+
+        # Prepare vectors to be indexed
+
         field_key = self.generate_field_id(field_vectors.field)
         if vo is not None:
-            apply_field_vectors = partial(
+            vectorset_id = field_vectors.vectorset_id or None
+            if vectorset_id is None:
+                dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
+                    self.txn, kbid=self.kb.kbid
+                )
+            else:
+                config = await datamanagers.vectorsets.get(
+                    self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
+                )
+                if config is None:
+                    logger.warning(
+                        f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
+                    )
+                    return
+                dimension = config.vectorset_index_config.vector_dimension
+                if not dimension:
+                    raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
+
+            apply_field_vectors_partial = partial(
                 self.indexer.apply_field_vectors,
                 field_key,
                 vo,
-                replace_field=replace_field,
-                replace_splits=replace_splits,
+                vectorset=vectorset_id,
+                replace_field=True,
+                matryoshka_vector_dimension=dimension,
             )
             loop = asyncio.get_running_loop()
-            await loop.run_in_executor(_executor, apply_field_vectors)
+            await loop.run_in_executor(_executor, apply_field_vectors_partial)
         else:
             raise AttributeError("VO not found on set")

-    async def _apply_user_vectors(self, user_vectors: UserVectorsWrapper):
-        field_obj = await self.get_field(
-            user_vectors.field.field,
-            user_vectors.field.field_type,
-            load=False,
-        )
-        uv, vectors_to_delete = await field_obj.set_user_vectors(user_vectors)
-        field_key = self.generate_field_id(user_vectors.field)
-        if uv is not None:
-            # We need to make sure that the vectors replaced are not on the new vectors
-            # So we extend the vectors to delete with the one replaced by the update
-            for vectorset, vectors in vectors_to_delete.items():
-                for vector in vectors.vectors:
-                    if vector not in user_vectors.vectors_to_delete[vectorset].vectors:
-                        user_vectors.vectors_to_delete[vectorset].vectors.append(vector)
-            self.indexer.apply_user_vectors(
-                field_key, uv, user_vectors.vectors_to_delete
-            )
-        else:
-            raise AttributeError("User Vectors not found on set")
-
-    async def _apply_field_large_metadata(
-        self, field_large_metadata: LargeComputedMetadataWrapper
-    ):
+    async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
         field_obj = await self.get_field(
             field_large_metadata.field.field,
             field_large_metadata.field.field_type,
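Note: the vectors hunk resolves the target dimension in two ways: without a `vectorset_id` it falls back to the knowledge box's matryoshka dimension, otherwise it requires the vectorset's index config to exist and carry a non-zero dimension. A simplified sketch of that resolution, with `VectorsetConfig` and the dict lookup standing in for the real datamanagers calls:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class VectorsetConfig:  # stand-in for the real vectorset index config
        vector_dimension: int

    def resolve_dimension(
        vectorset_id: Optional[str],
        kb_matryoshka_dimension: Optional[int],
        vectorsets: dict[str, VectorsetConfig],
    ) -> Optional[int]:
        # No vectorset: use the knowledge box's matryoshka dimension (may be None).
        if vectorset_id is None:
            return kb_matryoshka_dimension
        config = vectorsets.get(vectorset_id)
        if config is None:
            return None  # the real code logs a warning and skips indexing
        if not config.vector_dimension:
            raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
        return config.vector_dimension

    assert resolve_dimension(None, 256, {}) == 256
    assert resolve_dimension("en-2024", None, {"en-2024": VectorsetConfig(768)}) == 768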
@@ -1083,7 +832,7 @@ class Resource:
         await field_obj.set_large_field_metadata(field_large_metadata)

     def generate_field_id(self, field: FieldID) -> str:
-        return f"{
+        return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"

     async def compute_security(self, brain: ResourceBrain):
         security = await self.get_security()
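Note: `generate_field_id` now renders the field id inline as `<type>/<name>`. An illustrative reimplementation with a hypothetical subset of the `FIELD_TYPE_PB_TO_STR` mapping (the real one maps protobuf FieldType enum values, not strings):

    FIELD_TYPE_PB_TO_STR = {"FILE": "file", "LINK": "link", "TEXT": "text", "GENERIC": "generic"}

    def generate_field_id(field_type: str, field: str) -> str:
        return f"{FIELD_TYPE_PB_TO_STR[field_type]}/{field}"

    assert generate_field_id("FILE", "invoice") == "file/invoice"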
@@ -1102,7 +851,7 @@ class Resource:
         brain.set_resource_metadata(basic=basic, origin=origin)
         for type, field in await self.get_fields_ids(force=True):
             fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)
+            fieldid = FieldID(field_type=type, field=field)
             fieldkey = self.generate_field_id(fieldid)
             extracted_metadata = await fieldobj.get_field_metadata()
             valid_user_field_metadata = None
@@ -1113,16 +862,16 @@ class Resource:
                 ):
                     valid_user_field_metadata = user_field_metadata
                     break
+
+            generated_by = await fieldobj.generated_by()
             brain.apply_field_labels(
                 fieldkey,
                 extracted_metadata,
                 self.uuid,
+                generated_by,
                 basic.usermetadata,
                 valid_user_field_metadata,
             )
-            if type == FieldType.KEYWORDSET:
-                field_data = await fieldobj.db_get_value()
-                brain.process_keywordset_fields(fieldkey, field_data)

     @processor_observer.wrap({"type": "compute_global_text"})
     async def compute_global_text(self):
@@ -1159,12 +908,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[
-                    annotationparagraph.key
-                ] = annotationparagraph
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1179,9 +926,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
@@ -1192,7 +937,7 @@ class Resource:

             entities: dict[str, str] = {}
             if enabled_metadata.entities:
-                entities.update(field_metadata.ner)
+                _update_entities_dict(entities, field_metadata)

             precomputed_vectors = {}
             if vo is not None:
@@ -1203,9 +948,7 @@ class Resource:
                 vectors = vo.vectors
                 base_vector_key = f"{self.uuid}/{field_key}"
                 for index, vector in enumerate(vectors.vectors):
-                    vector_key = (
-                        f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                    )
+                    vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                     precomputed_vectors[vector_key] = vector.vector

             if extracted_text is not None:
@@ -1216,11 +959,11 @@ class Resource:

             for paragraph in field_metadata.paragraphs:
                 if subfield is not None:
-                    paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                else:
                     paragraph_key = (
-                        f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                        f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                     )
+                else:
+                    paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                 if enabled_metadata.labels:
                     metadata.labels.ClearField("field")
@@ -1234,7 +977,9 @@ class Resource:
                 if subfield is not None:
                     sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                 else:
-                    sentence_key = f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                    sentence_key = (
+                        f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                    )

                 if vo is not None:
                     metadata.ClearField("vector")
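Note: paragraph and sentence keys follow one path scheme — `uuid/field_key[/subfield][/index]/start-end`; the hunks above only change which branch of the key construction is wrapped. A sketch of the paragraph variant:

    from typing import Optional

    def paragraph_key(uuid: str, field_key: str, start: int, end: int, subfield: Optional[str] = None) -> str:
        # Subfields (splits) get an extra path segment between field and span.
        if subfield is not None:
            return f"{uuid}/{field_key}/{subfield}/{start}-{end}"
        return f"{uuid}/{field_key}/{start}-{end}"

    assert paragraph_key("r1", "file/doc", 0, 42) == "r1/file/doc/0-42"
    assert paragraph_key("r1", "file/doc", 0, 42, subfield="s1") == "r1/file/doc/s1/0-42"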
@@ -1273,12 +1018,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[
-                    annotationparagraph.key
-                ] = annotationparagraph
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1289,9 +1032,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
@@ -1302,7 +1043,7 @@ class Resource:

             entities: dict[str, str] = {}
             if enabled_metadata.entities:
-                entities.update(field_metadata.ner)
+                _update_entities_dict(entities, field_metadata)

             if extracted_text is not None:
                 if subfield is not None:
@@ -1312,11 +1053,11 @@ class Resource:

             for paragraph in field_metadata.paragraphs:
                 if subfield is not None:
-                    paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                else:
                     paragraph_key = (
-                        f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                        f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                     )
+                else:
+                    paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                 if enabled_metadata.labels:
                     metadata.labels.ClearField("paragraph")
@@ -1344,9 +1085,7 @@ class Resource:

             yield pb_paragraph

-    async def iterate_fields(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainField]:
+    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1356,7 +1095,7 @@ class Resource:
             metadata.labels.resource.extend(self.basic.usermetadata.classifications)

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             fm = await field.get_field_metadata()
             extracted_text = None

@@ -1366,9 +1105,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1385,7 +1122,7 @@ class Resource:

             if enabled_metadata.entities:
                 metadata.ClearField("entities")
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)

             pb_field = TrainField()
             pb_field.uuid = self.uuid
@@ -1393,9 +1130,7 @@ class Resource:
             pb_field.metadata.CopyFrom(metadata)
             yield pb_field

-    async def generate_train_resource(
-        self, enabled_metadata: EnabledMetadata
-    ) -> TrainResource:
+    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1422,9 +1157,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1433,7 +1166,7 @@ class Resource:
                 metadata.labels.field.extend(splitted_metadata.classifications)

             if enabled_metadata.entities:
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)

             pb_resource = TrainResource()
             pb_resource.uuid = self.uuid
@@ -1462,33 +1195,35 @@ def remove_field_classifications(basic: PBBasic, deleted_fields: list[FieldID]):
     Clean classifications of fields that have been deleted
     """
     field_classifications = [
-        fc
-        for fc in basic.computedmetadata.field_classifications
-        if fc.field not in deleted_fields
+        fc for fc in basic.computedmetadata.field_classifications if fc.field not in deleted_fields
     ]
     basic.computedmetadata.ClearField("field_classifications")
     basic.computedmetadata.field_classifications.extend(field_classifications)


-def add_field_classifications(
-    basic: PBBasic, fcmw: FieldComputedMetadataWrapper
-) -> bool:
+def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper) -> bool:
     """
     Returns whether some new field classifications were added
     """
-    if len(fcmw.metadata.metadata.classifications) == 0:
+    if len(fcmw.metadata.metadata.classifications) == 0 and all(
+        len(split.classifications) == 0 for split in fcmw.metadata.split_metadata.values()
+    ):
         return False
+
     remove_field_classifications(basic, [fcmw.field])
     fcfs = FieldClassifications()
     fcfs.field.CopyFrom(fcmw.field)
     fcfs.classifications.extend(fcmw.metadata.metadata.classifications)
+
+    for split_id, split in fcmw.metadata.split_metadata.items():
+        if split_id not in fcmw.metadata.deleted_splits:
+            fcfs.classifications.extend(split.classifications)
+
     basic.computedmetadata.field_classifications.append(fcfs)
     return True


-def add_entities_to_metadata(
-    entities: dict[str, str], local_text: str, metadata: TrainMetadata
-) -> None:
+def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
     for entity_key, entity_value in entities.items():
         if entity_key not in local_text:
             # Add the entity only if found in text
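Note: `add_field_classifications` previously looked only at field-level classifications; it now also counts and stores classifications from every split that is not marked as deleted. A toy model of the aggregation over plain lists (illustrative names only):

    def collect_classifications(
        field_labels: list[str],
        split_labels: dict[str, list[str]],
        deleted_splits: set[str],
    ) -> list[str]:
        # Field-level labels plus the labels of every split that was not deleted.
        result = list(field_labels)
        for split_id, labels in split_labels.items():
            if split_id not in deleted_splits:
                result.extend(labels)
        return result

    assert collect_classifications(["news"], {"s1": ["sports"], "s2": ["tech"]}, {"s2"}) == ["news", "sports"]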
@@ -1502,9 +1237,7 @@ def add_entities_to_metadata(
         for _ in range(local_text.count(entity_key)):
             start = local_text.index(entity_key, last_occurrence_end)
             end = start + len(entity_key)
-            metadata.entity_positions[poskey].positions.append(
-                TrainPosition(start=start, end=end)
-            )
+            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
             last_occurrence_end = end

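Note: the loop records a character span for every non-overlapping occurrence of an entity string, resuming each search after the previous match. The same scan as a standalone helper:

    def find_occurrences(text: str, needle: str) -> list[tuple[int, int]]:
        # (start, end) character spans for every non-overlapping occurrence.
        positions = []
        last_end = 0
        for _ in range(text.count(needle)):
            start = text.index(needle, last_end)
            end = start + len(needle)
            positions.append((start, end))
            last_end = end
        return positions

    assert find_occurrences("a cat and a cat", "cat") == [(2, 5), (12, 15)]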
@@ -1519,15 +1252,22 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True


-def maybe_update_basic_thumbnail(
-    basic: PBBasic, thumbnail: Optional[CloudFile]
-) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile]) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
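Note: `maybe_update_basic_icon` now validates the mimetype before persisting it. A minimal sketch of such a guard — the regex validator here is an assumption, not nucliadb's `content_types.valid`, which also knows the catalog of supported content types:

    import re

    # Loose type/subtype shape check (assumed stand-in for the real validator).
    MIMETYPE_RE = re.compile(r"^[\w.+-]+/[\w.+-]+$")

    def valid_mimetype(mimetype: str) -> bool:
        return bool(MIMETYPE_RE.match(mimetype))

    assert valid_mimetype("application/pdf")
    assert not valid_mimetype("not a mimetype")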
@@ -1569,3 +1309,23 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
+
+
+def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
+    """
+    Update the entities dict with the entities from the field metadata.
+    Method created to ease the transition from legacy ner field to new entities field.
+    """
+    # Data Augmentation + Processor entities
+    # This will overwrite entities detected from more than one data augmentation task
+    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
+    entity_map = {
+        entity.text: entity.label
+        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
+        for entity in entities_wrapper.entities
+    }
+    target_entites_dict.update(entity_map)
+
+    # Legacy processor entities
+    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    target_entites_dict.update(field_metadata.ner)