nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
@@ -23,19 +23,33 @@ import asyncio
|
|
23
23
|
import logging
|
24
24
|
from concurrent.futures import ThreadPoolExecutor
|
25
25
|
from functools import partial
|
26
|
-
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional, Type
|
26
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
|
27
27
|
|
28
|
+
from nucliadb.common import datamanagers
|
29
|
+
from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
|
30
|
+
from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
|
31
|
+
from nucliadb.common.maindb.driver import Transaction
|
32
|
+
from nucliadb.ingest.fields.base import Field
|
33
|
+
from nucliadb.ingest.fields.conversation import Conversation
|
34
|
+
from nucliadb.ingest.fields.file import File
|
35
|
+
from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
|
36
|
+
from nucliadb.ingest.fields.link import Link
|
37
|
+
from nucliadb.ingest.fields.text import Text
|
38
|
+
from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
|
39
|
+
from nucliadb.ingest.orm.metrics import processor_observer
|
40
|
+
from nucliadb_models import content_types
|
41
|
+
from nucliadb_models.common import CloudLink
|
42
|
+
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
43
|
+
from nucliadb_protos import utils_pb2, writer_pb2
|
28
44
|
from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
|
29
|
-
from nucliadb_protos.resources_pb2 import Basic
|
30
|
-
from nucliadb_protos.resources_pb2 import Basic as PBBasic
|
31
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
32
|
-
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
33
|
-
from nucliadb_protos.resources_pb2 import Extra as PBExtra
|
34
45
|
from nucliadb_protos.resources_pb2 import (
|
46
|
+
Basic,
|
47
|
+
CloudFile,
|
35
48
|
ExtractedTextWrapper,
|
36
49
|
ExtractedVectorsWrapper,
|
37
50
|
FieldClassifications,
|
38
51
|
FieldComputedMetadataWrapper,
|
52
|
+
FieldFile,
|
39
53
|
FieldID,
|
40
54
|
FieldMetadata,
|
41
55
|
FieldQuestionAnswerWrapper,
|
@@ -44,40 +58,27 @@ from nucliadb_protos.resources_pb2 import (
|
|
44
58
|
FileExtractedData,
|
45
59
|
LargeComputedMetadataWrapper,
|
46
60
|
LinkExtractedData,
|
61
|
+
Metadata,
|
62
|
+
Paragraph,
|
63
|
+
ParagraphAnnotation,
|
47
64
|
)
|
48
|
-
from nucliadb_protos.resources_pb2 import
|
65
|
+
from nucliadb_protos.resources_pb2 import Basic as PBBasic
|
66
|
+
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
67
|
+
from nucliadb_protos.resources_pb2 import Extra as PBExtra
|
49
68
|
from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
|
50
69
|
from nucliadb_protos.resources_pb2 import Origin as PBOrigin
|
51
|
-
from nucliadb_protos.resources_pb2 import Paragraph, ParagraphAnnotation
|
52
70
|
from nucliadb_protos.resources_pb2 import Relations as PBRelations
|
53
|
-
from nucliadb_protos.train_pb2 import EnabledMetadata
|
54
|
-
from nucliadb_protos.train_pb2 import Position as TrainPosition
|
55
71
|
from nucliadb_protos.train_pb2 import (
|
72
|
+
EnabledMetadata,
|
56
73
|
TrainField,
|
57
74
|
TrainMetadata,
|
58
75
|
TrainParagraph,
|
59
76
|
TrainResource,
|
60
77
|
TrainSentence,
|
61
78
|
)
|
79
|
+
from nucliadb_protos.train_pb2 import Position as TrainPosition
|
62
80
|
from nucliadb_protos.utils_pb2 import Relation as PBRelation
|
63
81
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
64
|
-
|
65
|
-
from nucliadb.common import datamanagers
|
66
|
-
from nucliadb.common.maindb.driver import Transaction
|
67
|
-
from nucliadb.ingest.fields.base import Field
|
68
|
-
from nucliadb.ingest.fields.conversation import Conversation
|
69
|
-
from nucliadb.ingest.fields.date import Datetime
|
70
|
-
from nucliadb.ingest.fields.file import File
|
71
|
-
from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
|
72
|
-
from nucliadb.ingest.fields.keywordset import Keywordset
|
73
|
-
from nucliadb.ingest.fields.layout import Layout
|
74
|
-
from nucliadb.ingest.fields.link import Link
|
75
|
-
from nucliadb.ingest.fields.text import Text
|
76
|
-
from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
|
77
|
-
from nucliadb.ingest.orm.metrics import processor_observer
|
78
|
-
from nucliadb_models.common import CloudLink
|
79
|
-
from nucliadb_models.writer import GENERIC_MIME_TYPE
|
80
|
-
from nucliadb_protos import utils_pb2, writer_pb2
|
81
82
|
from nucliadb_utils.storages.storage import Storage
|
82
83
|
|
83
84
|
if TYPE_CHECKING: # pragma: no cover
|
@@ -85,33 +86,14 @@ if TYPE_CHECKING: # pragma: no cover
|
|
85
86
|
|
86
87
|
logger = logging.getLogger(__name__)
|
87
88
|
|
88
|
-
KB_RESOURCE_FIELDS = "/kbs/{kbid}/r/{uuid}/f/"
|
89
|
-
KB_RESOURCE_SLUG_BASE = "/kbs/{kbid}/s/"
|
90
|
-
KB_RESOURCE_SLUG = f"{KB_RESOURCE_SLUG_BASE}{{slug}}"
|
91
89
|
KB_FIELDS: dict[int, Type] = {
|
92
|
-
FieldType.LAYOUT: Layout,
|
93
90
|
FieldType.TEXT: Text,
|
94
91
|
FieldType.FILE: File,
|
95
92
|
FieldType.LINK: Link,
|
96
|
-
FieldType.DATETIME: Datetime,
|
97
|
-
FieldType.KEYWORDSET: Keywordset,
|
98
93
|
FieldType.GENERIC: Generic,
|
99
94
|
FieldType.CONVERSATION: Conversation,
|
100
95
|
}
|
101
96
|
|
102
|
-
KB_REVERSE: dict[str, FieldType.ValueType] = {
|
103
|
-
"l": FieldType.LAYOUT,
|
104
|
-
"t": FieldType.TEXT,
|
105
|
-
"f": FieldType.FILE,
|
106
|
-
"u": FieldType.LINK,
|
107
|
-
"d": FieldType.DATETIME,
|
108
|
-
"k": FieldType.KEYWORDSET,
|
109
|
-
"a": FieldType.GENERIC,
|
110
|
-
"c": FieldType.CONVERSATION,
|
111
|
-
}
|
112
|
-
|
113
|
-
FIELD_TYPE_TO_ID = {v: k for k, v in KB_REVERSE.items()}
|
114
|
-
|
115
97
|
_executor = ThreadPoolExecutor(10)
|
116
98
|
|
117
99
|
|
@@ -122,6 +104,8 @@ PB_TEXT_FORMAT_TO_MIMETYPE = {
|
|
122
104
|
FieldText.Format.MARKDOWN: "text/markdown",
|
123
105
|
FieldText.Format.JSON: "application/json",
|
124
106
|
FieldText.Format.KEEP_MARKDOWN: "text/markdown",
|
107
|
+
FieldText.Format.JSONL: "application/x-ndjson",
|
108
|
+
FieldText.Format.PLAIN_BLANKLINE_SPLIT: "text/plain+blankline",
|
125
109
|
}
|
126
110
|
|
127
111
|
BASIC_IMMUTABLE_FIELDS = ("icon",)
|
@@ -173,9 +157,7 @@ class Resource:
|
|
173
157
|
# Basic
|
174
158
|
async def get_basic(self) -> Optional[PBBasic]:
|
175
159
|
if self.basic is None:
|
176
|
-
basic = await datamanagers.resources.get_basic(
|
177
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
178
|
-
)
|
160
|
+
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
179
161
|
self.basic = basic if basic is not None else PBBasic()
|
180
162
|
return self.basic
|
181
163
|
|
@@ -221,9 +203,7 @@ class Resource:
|
|
221
203
|
fields.append(field_id)
|
222
204
|
positions[field_id] = i
|
223
205
|
|
224
|
-
updated = [
|
225
|
-
self.basic.fieldmetadata[positions[field]] for field in fields
|
226
|
-
]
|
206
|
+
updated = [self.basic.fieldmetadata[positions[field]] for field in fields]
|
227
207
|
|
228
208
|
del self.basic.fieldmetadata[:]
|
229
209
|
self.basic.fieldmetadata.extend(updated)
|
@@ -244,11 +224,10 @@ class Resource:
|
|
244
224
|
self.indexer.apply_field_metadata(
|
245
225
|
field_id,
|
246
226
|
field_metadata,
|
247
|
-
replace_field=[],
|
248
|
-
replace_splits={},
|
249
227
|
page_positions=page_positions,
|
250
228
|
extracted_text=await field_obj.get_extracted_text(),
|
251
229
|
basic_user_field_metadata=user_field_metadata,
|
230
|
+
replace_field=True,
|
252
231
|
)
|
253
232
|
|
254
233
|
# Some basic fields are computed off field metadata.
|
@@ -264,9 +243,7 @@ class Resource:
|
|
264
243
|
# Origin
|
265
244
|
async def get_origin(self) -> Optional[PBOrigin]:
|
266
245
|
if self.origin is None:
|
267
|
-
origin = await datamanagers.resources.get_origin(
|
268
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
269
|
-
)
|
246
|
+
origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
270
247
|
self.origin = origin
|
271
248
|
return self.origin
|
272
249
|
|
@@ -280,16 +257,12 @@ class Resource:
|
|
280
257
|
# Extra
|
281
258
|
async def get_extra(self) -> Optional[PBExtra]:
|
282
259
|
if self.extra is None:
|
283
|
-
extra = await datamanagers.resources.get_extra(
|
284
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
285
|
-
)
|
260
|
+
extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
286
261
|
self.extra = extra
|
287
262
|
return self.extra
|
288
263
|
|
289
264
|
async def set_extra(self, payload: PBExtra):
|
290
|
-
await datamanagers.resources.set_extra(
|
291
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload
|
292
|
-
)
|
265
|
+
await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
|
293
266
|
self.modified = True
|
294
267
|
self.extra = payload
|
295
268
|
|
@@ -329,7 +302,7 @@ class Resource:
|
|
329
302
|
self.relations = relations
|
330
303
|
|
331
304
|
@processor_observer.wrap({"type": "generate_index_message"})
|
332
|
-
async def generate_index_message(self) -> ResourceBrain:
|
305
|
+
async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
|
333
306
|
brain = ResourceBrain(rid=self.uuid)
|
334
307
|
origin = await self.get_origin()
|
335
308
|
basic = await self.get_basic()
|
@@ -339,7 +312,7 @@ class Resource:
|
|
339
312
|
await self.compute_global_tags(brain)
|
340
313
|
fields = await self.get_fields(force=True)
|
341
314
|
for (type_id, field_id), field in fields.items():
|
342
|
-
fieldid = FieldID(field_type=type_id, field=field_id)
|
315
|
+
fieldid = FieldID(field_type=type_id, field=field_id)
|
343
316
|
await self.compute_global_text_field(fieldid, brain)
|
344
317
|
|
345
318
|
field_metadata = await field.get_field_metadata()
|
@@ -355,234 +328,66 @@ class Resource:
|
|
355
328
|
(
|
356
329
|
fm
|
357
330
|
for fm in basic.fieldmetadata
|
358
|
-
if fm.field.field == field_id
|
359
|
-
and fm.field.field_type == type_id
|
331
|
+
if fm.field.field == field_id and fm.field.field_type == type_id
|
360
332
|
),
|
361
333
|
None,
|
362
334
|
)
|
363
335
|
brain.apply_field_metadata(
|
364
336
|
field_key,
|
365
337
|
field_metadata,
|
366
|
-
replace_field=[],
|
367
|
-
replace_splits={},
|
368
338
|
page_positions=page_positions,
|
369
339
|
extracted_text=await field.get_extracted_text(),
|
370
340
|
basic_user_field_metadata=user_field_metadata,
|
341
|
+
replace_field=reindex,
|
371
342
|
)
|
372
343
|
|
373
344
|
if self.disable_vectors is False:
|
345
|
+
# XXX: while we don't remove the "default" vectorset concept, we
|
346
|
+
# need to do use None as the default one
|
374
347
|
vo = await field.get_vectors()
|
375
348
|
if vo is not None:
|
376
|
-
|
377
|
-
|
378
|
-
|
349
|
+
async with datamanagers.with_ro_transaction() as ro_txn:
|
350
|
+
dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
|
351
|
+
ro_txn, kbid=self.kb.kbid
|
352
|
+
)
|
379
353
|
brain.apply_field_vectors(
|
380
354
|
field_key,
|
381
355
|
vo,
|
382
356
|
matryoshka_vector_dimension=dimension,
|
357
|
+
replace_field=reindex,
|
383
358
|
)
|
384
|
-
return brain
|
385
|
-
|
386
|
-
async def generate_field_vectors(
|
387
|
-
self,
|
388
|
-
bm: BrokerMessage,
|
389
|
-
type_id: FieldType.ValueType,
|
390
|
-
field_id: str,
|
391
|
-
field: Field,
|
392
|
-
):
|
393
|
-
vo = await field.get_vectors()
|
394
|
-
if vo is None:
|
395
|
-
return
|
396
|
-
evw = ExtractedVectorsWrapper()
|
397
|
-
evw.field.field = field_id
|
398
|
-
evw.field.field_type = type_id # type: ignore
|
399
|
-
evw.vectors.CopyFrom(vo)
|
400
|
-
bm.field_vectors.append(evw)
|
401
|
-
|
402
|
-
async def generate_field_large_computed_metadata(
|
403
|
-
self,
|
404
|
-
bm: BrokerMessage,
|
405
|
-
type_id: FieldType.ValueType,
|
406
|
-
field_id: str,
|
407
|
-
field: Field,
|
408
|
-
):
|
409
|
-
lcm = await field.get_large_field_metadata()
|
410
|
-
if lcm is None:
|
411
|
-
return
|
412
|
-
lcmw = LargeComputedMetadataWrapper()
|
413
|
-
lcmw.field.field = field_id
|
414
|
-
lcmw.field.field_type = type_id # type: ignore
|
415
|
-
lcmw.real.CopyFrom(lcm)
|
416
|
-
bm.field_large_metadata.append(lcmw)
|
417
|
-
|
418
|
-
async def generate_field_computed_metadata(
|
419
|
-
self,
|
420
|
-
bm: BrokerMessage,
|
421
|
-
type_id: FieldType.ValueType,
|
422
|
-
field_id: str,
|
423
|
-
field: Field,
|
424
|
-
):
|
425
|
-
fcmw = FieldComputedMetadataWrapper()
|
426
|
-
fcmw.field.field = field_id
|
427
|
-
fcmw.field.field_type = type_id # type: ignore
|
428
|
-
|
429
|
-
field_metadata = await field.get_field_metadata()
|
430
|
-
if field_metadata is not None:
|
431
|
-
fcmw.metadata.CopyFrom(field_metadata)
|
432
|
-
fcmw.field.field = field_id
|
433
|
-
fcmw.field.field_type = type_id # type: ignore
|
434
|
-
bm.field_metadata.append(fcmw)
|
435
|
-
# Make sure cloud files are removed for exporting
|
436
|
-
|
437
|
-
async def generate_extracted_text(
|
438
|
-
self,
|
439
|
-
bm: BrokerMessage,
|
440
|
-
type_id: FieldType.ValueType,
|
441
|
-
field_id: str,
|
442
|
-
field: Field,
|
443
|
-
):
|
444
|
-
etw = ExtractedTextWrapper()
|
445
|
-
etw.field.field = field_id
|
446
|
-
etw.field.field_type = type_id # type: ignore
|
447
|
-
extracted_text = await field.get_extracted_text()
|
448
|
-
if extracted_text is not None:
|
449
|
-
etw.body.CopyFrom(extracted_text)
|
450
|
-
bm.extracted_text.append(etw)
|
451
|
-
|
452
|
-
async def generate_field(
|
453
|
-
self,
|
454
|
-
bm: BrokerMessage,
|
455
|
-
type_id: FieldType.ValueType,
|
456
|
-
field_id: str,
|
457
|
-
field: Field,
|
458
|
-
):
|
459
|
-
# Used for exporting a field
|
460
|
-
if type_id == FieldType.TEXT:
|
461
|
-
value = await field.get_value()
|
462
|
-
bm.texts[field_id].CopyFrom(value)
|
463
|
-
elif type_id == FieldType.LINK:
|
464
|
-
value = await field.get_value()
|
465
|
-
bm.links[field_id].CopyFrom(value)
|
466
|
-
elif type_id == FieldType.FILE:
|
467
|
-
value = await field.get_value()
|
468
|
-
bm.files[field_id].CopyFrom(value)
|
469
|
-
elif type_id == FieldType.CONVERSATION:
|
470
|
-
value = await self.get_full_conversation(field) # type: ignore
|
471
|
-
bm.conversations[field_id].CopyFrom(value)
|
472
|
-
elif type_id == FieldType.KEYWORDSET:
|
473
|
-
value = await field.get_value()
|
474
|
-
bm.keywordsets[field_id].CopyFrom(value)
|
475
|
-
elif type_id == FieldType.DATETIME:
|
476
|
-
value = await field.get_value()
|
477
|
-
bm.datetimes[field_id].CopyFrom(value)
|
478
|
-
elif type_id == FieldType.LAYOUT:
|
479
|
-
value = await field.get_value()
|
480
|
-
bm.layouts[field_id].CopyFrom(value)
|
481
|
-
|
482
|
-
async def get_full_conversation(
|
483
|
-
self,
|
484
|
-
conversation_field: Conversation,
|
485
|
-
) -> Optional[PBConversation]:
|
486
|
-
"""
|
487
|
-
Messages of a conversations may be stored across several pages.
|
488
|
-
This method fetches them all and returns a single complete conversation.
|
489
|
-
"""
|
490
|
-
full_conv = PBConversation()
|
491
|
-
n_page = 1
|
492
|
-
while True:
|
493
|
-
page = await conversation_field.get_value(page=n_page)
|
494
|
-
if page is None:
|
495
|
-
break
|
496
|
-
full_conv.messages.extend(page.messages)
|
497
|
-
n_page += 1
|
498
|
-
return full_conv
|
499
|
-
|
500
|
-
async def generate_broker_message(self) -> BrokerMessage:
|
501
|
-
# full means downloading all the pointers
|
502
|
-
# minuts the ones to external files that are not PB
|
503
|
-
# Go for all fields and recreate brain
|
504
|
-
bm = BrokerMessage()
|
505
|
-
bm.kbid = self.kb.kbid
|
506
|
-
bm.uuid = self.uuid
|
507
|
-
basic = await self.get_basic()
|
508
|
-
if basic is not None:
|
509
|
-
bm.basic.CopyFrom(basic)
|
510
|
-
bm.slug = bm.basic.slug
|
511
|
-
origin = await self.get_origin()
|
512
|
-
if origin is not None:
|
513
|
-
bm.origin.CopyFrom(origin)
|
514
|
-
relations = await self.get_relations()
|
515
|
-
if relations is not None:
|
516
|
-
for relation in relations.relations:
|
517
|
-
bm.relations.append(relation)
|
518
|
-
|
519
|
-
fields = await self.get_fields(force=True)
|
520
|
-
for (type_id, field_id), field in fields.items():
|
521
|
-
# Value
|
522
|
-
await self.generate_field(bm, type_id, field_id, field)
|
523
|
-
|
524
|
-
# Extracted text
|
525
|
-
await self.generate_extracted_text(bm, type_id, field_id, field)
|
526
|
-
|
527
|
-
# Field Computed Metadata
|
528
|
-
await self.generate_field_computed_metadata(bm, type_id, field_id, field)
|
529
|
-
|
530
|
-
if type_id == FieldType.FILE and isinstance(field, File):
|
531
|
-
field_extracted_data = await field.get_file_extracted_data()
|
532
|
-
if field_extracted_data is not None:
|
533
|
-
bm.file_extracted_data.append(field_extracted_data)
|
534
|
-
|
535
|
-
elif type_id == FieldType.LINK and isinstance(field, Link):
|
536
|
-
link_extracted_data = await field.get_link_extracted_data()
|
537
|
-
if link_extracted_data is not None:
|
538
|
-
bm.link_extracted_data.append(link_extracted_data)
|
539
359
|
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
360
|
+
vectorset_configs = []
|
361
|
+
async with datamanagers.with_ro_transaction() as ro_txn:
|
362
|
+
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
|
363
|
+
ro_txn, kbid=self.kb.kbid
|
364
|
+
):
|
365
|
+
vectorset_configs.append(vectorset_config)
|
366
|
+
for vectorset_config in vectorset_configs:
|
367
|
+
vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
|
368
|
+
if vo is not None:
|
369
|
+
dimension = vectorset_config.vectorset_index_config.vector_dimension
|
370
|
+
brain.apply_field_vectors(
|
371
|
+
field_key,
|
372
|
+
vo,
|
373
|
+
vectorset=vectorset_config.vectorset_id,
|
374
|
+
matryoshka_vector_dimension=dimension,
|
375
|
+
replace_field=reindex,
|
376
|
+
)
|
377
|
+
return brain
|
549
378
|
|
550
379
|
# Fields
|
551
|
-
async def get_fields(
|
552
|
-
self, force: bool = False
|
553
|
-
) -> dict[tuple[FieldType.ValueType, str], Field]:
|
380
|
+
async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
|
554
381
|
# Get all fields
|
555
382
|
for type, field in await self.get_fields_ids(force=force):
|
556
383
|
if (type, field) not in self.fields:
|
557
384
|
self.fields[(type, field)] = await self.get_field(field, type)
|
558
385
|
return self.fields
|
559
386
|
|
560
|
-
async def _deprecated_scan_fields_ids(
|
561
|
-
self,
|
562
|
-
) -> AsyncIterator[tuple[FieldType.ValueType, str]]:
|
563
|
-
logger.warning("Scanning fields ids. This is not optimal.")
|
564
|
-
prefix = KB_RESOURCE_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
|
565
|
-
allfields = set()
|
566
|
-
async for key in self.txn.keys(prefix, count=-1):
|
567
|
-
# The [6:8] `slicing purpose is to match exactly the two
|
568
|
-
# splitted parts corresponding to type and field, and nothing else!
|
569
|
-
type, field = key.split("/")[6:8]
|
570
|
-
type_id = KB_REVERSE.get(type)
|
571
|
-
if type_id is None:
|
572
|
-
raise AttributeError("Invalid field type")
|
573
|
-
result = (type_id, field)
|
574
|
-
if result not in allfields:
|
575
|
-
# fields can have errors that are stored in a subkey:
|
576
|
-
# - field key -> kbs/kbid/r/ruuid/f/myfield
|
577
|
-
# - field error key -> kbs/kbid/r/ruuid/f/myfield/errors
|
578
|
-
# and that would return duplicates here.
|
579
|
-
yield result
|
580
|
-
allfields.add(result)
|
581
|
-
|
582
387
|
async def _inner_get_fields_ids(self) -> list[tuple[FieldType.ValueType, str]]:
|
583
388
|
# Use a set to make sure we don't have duplicate field ids
|
584
389
|
result = set()
|
585
|
-
all_fields = await self.get_all_field_ids()
|
390
|
+
all_fields = await self.get_all_field_ids(for_update=False)
|
586
391
|
if all_fields is not None:
|
587
392
|
for f in all_fields.fields:
|
588
393
|
result.add((f.field_type, f.field))
|
@@ -599,9 +404,7 @@ class Resource:
|
|
599
404
|
result.add((FieldType.GENERIC, generic))
|
600
405
|
return list(result)
|
601
406
|
|
602
|
-
async def get_fields_ids(
|
603
|
-
self, force: bool = False
|
604
|
-
) -> list[tuple[FieldType.ValueType, str]]:
|
407
|
+
async def get_fields_ids(self, force: bool = False) -> list[tuple[FieldType.ValueType, str]]:
|
605
408
|
"""
|
606
409
|
Get all ids of the fields of the resource and cache them.
|
607
410
|
"""
|
@@ -645,23 +448,20 @@ class Resource:
|
|
645
448
|
if field in self.all_fields_keys:
|
646
449
|
self.all_fields_keys.remove(field)
|
647
450
|
|
648
|
-
field_key = self.generate_field_id(FieldID(field_type=type, field=key))
|
649
|
-
vo = await field_obj.get_vectors()
|
650
|
-
if vo is not None:
|
651
|
-
self.indexer.delete_vectors(field_key=field_key, vo=vo)
|
451
|
+
field_key = self.generate_field_id(FieldID(field_type=type, field=key))
|
652
452
|
|
653
453
|
metadata = await field_obj.get_field_metadata()
|
654
454
|
if metadata is not None:
|
655
|
-
self.indexer.
|
455
|
+
self.indexer.delete_field(field_key=field_key)
|
656
456
|
|
657
457
|
await field_obj.delete()
|
658
458
|
|
659
459
|
def has_field(self, type: FieldType.ValueType, field: str) -> bool:
|
660
460
|
return (type, field) in self.fields
|
661
461
|
|
662
|
-
async def get_all_field_ids(self) -> Optional[PBAllFieldIDs]:
|
462
|
+
async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
|
663
463
|
return await datamanagers.resources.get_all_field_ids(
|
664
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
464
|
+
self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
|
665
465
|
)
|
666
466
|
|
667
467
|
async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
|
@@ -677,7 +477,7 @@ class Resource:
|
|
677
477
|
errors: Optional[list[writer_pb2.Error]] = None,
|
678
478
|
):
|
679
479
|
needs_update = False
|
680
|
-
all_fields = await self.get_all_field_ids()
|
480
|
+
all_fields = await self.get_all_field_ids(for_update=True)
|
681
481
|
if all_fields is None:
|
682
482
|
needs_update = True
|
683
483
|
all_fields = PBAllFieldIDs()
|
@@ -704,26 +504,11 @@ class Resource:
|
|
704
504
|
@processor_observer.wrap({"type": "apply_fields"})
|
705
505
|
async def apply_fields(self, message: BrokerMessage):
|
706
506
|
message_updated_fields = []
|
707
|
-
for field, layout in message.layouts.items():
|
708
|
-
fid = FieldID(field_type=FieldType.LAYOUT, field=field)
|
709
|
-
await self.set_field(fid.field_type, fid.field, layout)
|
710
|
-
message_updated_fields.append(fid)
|
711
|
-
|
712
507
|
for field, text in message.texts.items():
|
713
508
|
fid = FieldID(field_type=FieldType.TEXT, field=field)
|
714
509
|
await self.set_field(fid.field_type, fid.field, text)
|
715
510
|
message_updated_fields.append(fid)
|
716
511
|
|
717
|
-
for field, keywordset in message.keywordsets.items():
|
718
|
-
fid = FieldID(field_type=FieldType.KEYWORDSET, field=field)
|
719
|
-
await self.set_field(fid.field_type, fid.field, keywordset)
|
720
|
-
message_updated_fields.append(fid)
|
721
|
-
|
722
|
-
for field, datetimeobj in message.datetimes.items():
|
723
|
-
fid = FieldID(field_type=FieldType.DATETIME, field=field)
|
724
|
-
await self.set_field(fid.field_type, fid.field, datetimeobj)
|
725
|
-
message_updated_fields.append(fid)
|
726
|
-
|
727
512
|
for field, link in message.links.items():
|
728
513
|
fid = FieldID(field_type=FieldType.LINK, field=field)
|
729
514
|
await self.set_field(fid.field_type, fid.field, link)
|
@@ -742,13 +527,11 @@ class Resource:
|
|
742
527
|
for fieldid in message.delete_fields:
|
743
528
|
await self.delete_field(fieldid.field_type, fieldid.field)
|
744
529
|
|
745
|
-
if (
|
746
|
-
len(message_updated_fields)
|
747
|
-
or len(message.delete_fields)
|
748
|
-
or len(message.errors)
|
749
|
-
):
|
530
|
+
if len(message_updated_fields) or len(message.delete_fields) or len(message.errors):
|
750
531
|
await self.update_all_field_ids(
|
751
|
-
updated=message_updated_fields,
|
532
|
+
updated=message_updated_fields,
|
533
|
+
deleted=message.delete_fields, # type: ignore
|
534
|
+
errors=message.errors, # type: ignore
|
752
535
|
)
|
753
536
|
|
754
537
|
@processor_observer.wrap({"type": "apply_extracted"})
|
@@ -784,13 +567,15 @@ class Resource:
|
|
784
567
|
|
785
568
|
for link_extracted_data in message.link_extracted_data:
|
786
569
|
await self._apply_link_extracted_data(link_extracted_data)
|
787
|
-
await self.
|
570
|
+
await self.maybe_update_resource_title_from_link(link_extracted_data)
|
788
571
|
extracted_languages.append(link_extracted_data.language)
|
789
572
|
|
790
573
|
for file_extracted_data in message.file_extracted_data:
|
791
574
|
await self._apply_file_extracted_data(file_extracted_data)
|
792
575
|
extracted_languages.append(file_extracted_data.language)
|
793
576
|
|
577
|
+
await self.maybe_update_resource_title_from_file_extracted_data(message)
|
578
|
+
|
794
579
|
# Metadata should go first
|
795
580
|
for field_metadata in message.field_metadata:
|
796
581
|
await self._apply_field_computed_metadata(field_metadata)
|
@@ -801,6 +586,7 @@ class Resource:
|
|
801
586
|
# Upload to binary storage
|
802
587
|
# Vector indexing
|
803
588
|
if self.disable_vectors is False:
|
589
|
+
await self.get_fields(force=True)
|
804
590
|
for field_vectors in message.field_vectors:
|
805
591
|
await self._apply_extracted_vectors(field_vectors)
|
806
592
|
|
@@ -826,9 +612,7 @@ class Resource:
|
|
826
612
|
extracted_text.field,
|
827
613
|
)
|
828
614
|
|
829
|
-
async def _apply_question_answers(
|
830
|
-
self, question_answers: FieldQuestionAnswerWrapper
|
831
|
-
):
|
615
|
+
async def _apply_question_answers(self, question_answers: FieldQuestionAnswerWrapper):
|
832
616
|
field = question_answers.field
|
833
617
|
field_obj = await self.get_field(field.field, field.field_type, load=False)
|
834
618
|
await field_obj.set_question_answers(question_answers)
|
@@ -848,19 +632,27 @@ class Resource:
|
|
848
632
|
|
849
633
|
maybe_update_basic_summary(self.basic, link_extracted_data.description)
|
850
634
|
|
851
|
-
async def
|
635
|
+
async def maybe_update_resource_title_from_link(self, link_extracted_data: LinkExtractedData):
|
636
|
+
"""
|
637
|
+
When parsing link extracted data, we want to replace the resource title for the first link
|
638
|
+
that gets processed and has a title, and only if the current title is a URL, which we take
|
639
|
+
as a hint that the title was not set by the user.
|
640
|
+
"""
|
852
641
|
assert self.basic is not None
|
853
642
|
if not link_extracted_data.title:
|
854
643
|
return
|
855
644
|
if not (self.basic.title.startswith("http") or self.basic.title == ""):
|
856
645
|
return
|
857
|
-
|
858
646
|
title = link_extracted_data.title
|
859
|
-
self.
|
647
|
+
await self.update_resource_title(title)
|
648
|
+
|
649
|
+
async def update_resource_title(self, computed_title: str) -> None:
|
650
|
+
assert self.basic is not None
|
651
|
+
self.basic.title = computed_title
|
860
652
|
# Extracted text
|
861
653
|
field = await self.get_field("title", FieldType.GENERIC, load=False)
|
862
654
|
etw = ExtractedTextWrapper()
|
863
|
-
etw.body.text =
|
655
|
+
etw.body.text = computed_title
|
864
656
|
await field.set_extracted_text(etw)
|
865
657
|
|
866
658
|
# Field computed metadata
|
@@ -872,11 +664,8 @@ class Resource:
|
|
872
664
|
fcm = await field.get_field_metadata(force=True)
|
873
665
|
if fcm is not None:
|
874
666
|
fcmw.metadata.CopyFrom(fcm)
|
875
|
-
|
876
667
|
fcmw.metadata.metadata.ClearField("paragraphs")
|
877
|
-
paragraph = Paragraph(
|
878
|
-
start=0, end=len(title), kind=Paragraph.TypeParagraph.TITLE
|
879
|
-
)
|
668
|
+
paragraph = Paragraph(start=0, end=len(computed_title), kind=Paragraph.TypeParagraph.TITLE)
|
880
669
|
fcmw.metadata.metadata.paragraphs.append(paragraph)
|
881
670
|
|
882
671
|
await field.set_field_metadata(fcmw)
|
@@ -893,9 +682,54 @@ class Resource:
|
|
893
682
|
maybe_update_basic_icon(self.basic, file_extracted_data.icon)
|
894
683
|
maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
|
895
684
|
|
896
|
-
async def
|
897
|
-
|
898
|
-
|
685
|
+
async def _should_update_resource_title_from_file_metadata(self) -> bool:
|
686
|
+
"""
|
687
|
+
We only want to update resource title from file metadata if the title is empty,
|
688
|
+
equal to the resource uuid or equal to any of the file filenames in the resource.
|
689
|
+
"""
|
690
|
+
basic = await self.get_basic()
|
691
|
+
if basic is None:
|
692
|
+
return True
|
693
|
+
current_title = basic.title
|
694
|
+
if current_title == "":
|
695
|
+
# If the title is empty, we should update it
|
696
|
+
return True
|
697
|
+
if current_title == self.uuid:
|
698
|
+
# If the title is the same as the resource uuid, we should update it
|
699
|
+
return True
|
700
|
+
fields = await self.get_fields(force=True)
|
701
|
+
filenames = set()
|
702
|
+
for (field_type, _), field_obj in fields.items():
|
703
|
+
if field_type == FieldType.FILE:
|
704
|
+
field_value: Optional[FieldFile] = await field_obj.get_value()
|
705
|
+
if field_value is not None:
|
706
|
+
if field_value.file.filename not in ("", None):
|
707
|
+
filenames.add(field_value.file.filename)
|
708
|
+
if current_title in filenames:
|
709
|
+
# If the title is equal to any of the file filenames, we should update it
|
710
|
+
return True
|
711
|
+
return False
|
712
|
+
|
713
|
+
async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
|
714
|
+
"""
|
715
|
+
Update the resource title with the first file that has a title extracted.
|
716
|
+
"""
|
717
|
+
if not await self._should_update_resource_title_from_file_metadata():
|
718
|
+
return
|
719
|
+
for fed in message.file_extracted_data:
|
720
|
+
if fed.title == "":
|
721
|
+
# Skip if the extracted title is empty
|
722
|
+
continue
|
723
|
+
fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
|
724
|
+
logger.info(
|
725
|
+
"Updating resource title from file extracted data",
|
726
|
+
extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
|
727
|
+
)
|
728
|
+
await self.update_resource_title(fed.title)
|
729
|
+
# Break after the first file with a title is found
|
730
|
+
break
|
731
|
+
|
732
|
+
async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
|
899
733
|
assert self.basic is not None
|
900
734
|
maybe_update_basic_summary(self.basic, field_metadata.metadata.metadata.summary)
|
901
735
|
|
@@ -904,17 +738,11 @@ class Resource:
|
|
904
738
|
field_metadata.field.field_type,
|
905
739
|
load=False,
|
906
740
|
)
|
907
|
-
(
|
908
|
-
metadata,
|
909
|
-
replace_field,
|
910
|
-
replace_splits,
|
911
|
-
) = await field_obj.set_field_metadata(field_metadata)
|
741
|
+
metadata = await field_obj.set_field_metadata(field_metadata)
|
912
742
|
field_key = self.generate_field_id(field_metadata.field)
|
913
743
|
|
914
744
|
page_positions: Optional[FilePagePositions] = None
|
915
|
-
if field_metadata.field.field_type == FieldType.FILE and isinstance(
|
916
|
-
field_obj, File
|
917
|
-
):
|
745
|
+
if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
|
918
746
|
page_positions = await get_file_page_positions(field_obj)
|
919
747
|
|
920
748
|
user_field_metadata = next(
|
@@ -932,29 +760,24 @@ class Resource:
|
|
932
760
|
self.indexer.apply_field_metadata,
|
933
761
|
field_key,
|
934
762
|
metadata,
|
935
|
-
replace_field=replace_field,
|
936
|
-
replace_splits=replace_splits,
|
937
763
|
page_positions=page_positions,
|
938
764
|
extracted_text=extracted_text,
|
939
765
|
basic_user_field_metadata=user_field_metadata,
|
766
|
+
replace_field=True,
|
940
767
|
)
|
941
768
|
loop = asyncio.get_running_loop()
|
942
769
|
await loop.run_in_executor(_executor, apply_field_metadata)
|
943
770
|
|
944
|
-
maybe_update_basic_thumbnail(
|
945
|
-
self.basic, field_metadata.metadata.metadata.thumbnail
|
946
|
-
)
|
771
|
+
maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
|
947
772
|
|
948
773
|
add_field_classifications(self.basic, field_metadata)
|
949
774
|
|
950
775
|
async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
|
951
|
-
|
952
|
-
|
953
|
-
):
|
776
|
+
# Store vectors in the resource
|
777
|
+
|
778
|
+
if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
|
954
779
|
# skipping because field does not exist
|
955
|
-
logger.warning(
|
956
|
-
f'Field "{field_vectors.field.field}" does not exist, skipping vectors'
|
957
|
-
)
|
780
|
+
logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
|
958
781
|
return
|
959
782
|
|
960
783
|
field_obj = await self.get_field(
|
@@ -962,22 +785,36 @@ class Resource:
|
|
962
785
|
field_vectors.field.field_type,
|
963
786
|
load=False,
|
964
787
|
)
|
965
|
-
(
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
) = await field_obj.set_vectors(field_vectors)
|
788
|
+
vo = await field_obj.set_vectors(field_vectors)
|
789
|
+
|
790
|
+
# Prepare vectors to be indexed
|
791
|
+
|
970
792
|
field_key = self.generate_field_id(field_vectors.field)
|
971
793
|
if vo is not None:
|
972
|
-
|
973
|
-
|
974
|
-
|
794
|
+
vectorset_id = field_vectors.vectorset_id or None
|
795
|
+
if vectorset_id is None:
|
796
|
+
dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
|
797
|
+
self.txn, kbid=self.kb.kbid
|
798
|
+
)
|
799
|
+
else:
|
800
|
+
config = await datamanagers.vectorsets.get(
|
801
|
+
self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
|
802
|
+
)
|
803
|
+
if config is None:
|
804
|
+
logger.warning(
|
805
|
+
f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
|
806
|
+
)
|
807
|
+
return
|
808
|
+
dimension = config.vectorset_index_config.vector_dimension
|
809
|
+
if not dimension:
|
810
|
+
raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
|
811
|
+
|
975
812
|
apply_field_vectors_partial = partial(
|
976
813
|
self.indexer.apply_field_vectors,
|
977
814
|
field_key,
|
978
815
|
vo,
|
979
|
-
|
980
|
-
|
816
|
+
vectorset=vectorset_id,
|
817
|
+
replace_field=True,
|
981
818
|
matryoshka_vector_dimension=dimension,
|
982
819
|
)
|
983
820
|
loop = asyncio.get_running_loop()
|
@@ -985,9 +822,7 @@ class Resource:
|
|
985
822
|
else:
|
986
823
|
raise AttributeError("VO not found on set")
|
987
824
|
|
988
|
-
async def _apply_field_large_metadata(
|
989
|
-
self, field_large_metadata: LargeComputedMetadataWrapper
|
990
|
-
):
|
825
|
+
async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
|
991
826
|
field_obj = await self.get_field(
|
992
827
|
field_large_metadata.field.field,
|
993
828
|
field_large_metadata.field.field_type,
|
@@ -996,7 +831,7 @@ class Resource:
|
|
996
831
|
await field_obj.set_large_field_metadata(field_large_metadata)
|
997
832
|
|
998
833
|
def generate_field_id(self, field: FieldID) -> str:
|
999
|
-
return f"{
|
834
|
+
return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
|
1000
835
|
|
1001
836
|
async def compute_security(self, brain: ResourceBrain):
|
1002
837
|
security = await self.get_security()
|
@@ -1015,7 +850,7 @@ class Resource:
         brain.set_resource_metadata(basic=basic, origin=origin)
         for type, field in await self.get_fields_ids(force=True):
             fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)
+            fieldid = FieldID(field_type=type, field=field)
             fieldkey = self.generate_field_id(fieldid)
             extracted_metadata = await fieldobj.get_field_metadata()
             valid_user_field_metadata = None
@@ -1026,16 +861,16 @@ class Resource:
             ):
                 valid_user_field_metadata = user_field_metadata
                 break
+
+            generated_by = await fieldobj.generated_by()
             brain.apply_field_labels(
                 fieldkey,
                 extracted_metadata,
                 self.uuid,
+                generated_by,
                 basic.usermetadata,
                 valid_user_field_metadata,
             )
-            if type == FieldType.KEYWORDSET:
-                field_data = await fieldobj.db_get_value()
-                brain.process_keywordset_fields(fieldkey, field_data)

     @processor_observer.wrap({"type": "compute_global_text"})
     async def compute_global_text(self):
@@ -1072,12 +907,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1092,9 +925,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1105,7 +936,7 @@ class Resource:

             entities: dict[str, str] = {}
             if enabled_metadata.entities:
-                entities.update(field_metadata.ner)
+                _update_entities_dict(entities, field_metadata)

             precomputed_vectors = {}
             if vo is not None:
@@ -1116,9 +947,7 @@ class Resource:
                 vectors = vo.vectors
                 base_vector_key = f"{self.uuid}/{field_key}"
                 for index, vector in enumerate(vectors.vectors):
-                    vector_key = (
-                        f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                    )
+                    vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                     precomputed_vectors[vector_key] = vector.vector

             if extracted_text is not None:
@@ -1129,11 +958,11 @@ class Resource:

             for paragraph in field_metadata.paragraphs:
                 if subfield is not None:
-                    paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                else:
                     paragraph_key = (
-                        f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                        f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                     )
+                else:
+                    paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                 if enabled_metadata.labels:
                     metadata.labels.ClearField("field")
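These hunks only re-wrap the key construction, but they make the index key scheme easy to miss: paragraph keys are `{rid}/{field_key}[/{subfield}]/{start}-{end}` and sentence keys add the sentence index. A toy illustration, with all values made up:

```python
# Toy illustration of the key scheme re-wrapped above (all values hypothetical).
rid = "c4e7bdd0-aaaa-bbbb-cccc-111122223333"  # resource uuid
field_key = "t/body"  # f"{field_type}/{field_name}", cf. generate_field_id()

paragraph_key = f"{rid}/{field_key}/10-240"             # whole-field paragraph
split_paragraph_key = f"{rid}/{field_key}/sub1/10-240"  # subfield ("split") paragraph
sentence_key = f"{rid}/{field_key}/0/10-95"             # sentence 0 of the field

print(paragraph_key, split_paragraph_key, sentence_key, sep="\n")
```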
@@ -1147,7 +976,9 @@ class Resource:
                 if subfield is not None:
                     sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                 else:
-                    sentence_key = f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                    sentence_key = (
+                        f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                    )

             if vo is not None:
                 metadata.ClearField("vector")
@@ -1186,12 +1017,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1202,9 +1031,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1215,7 +1042,7 @@ class Resource:

             entities: dict[str, str] = {}
             if enabled_metadata.entities:
-                entities.update(field_metadata.ner)
+                _update_entities_dict(entities, field_metadata)

             if extracted_text is not None:
                 if subfield is not None:
@@ -1225,11 +1052,11 @@ class Resource:

             for paragraph in field_metadata.paragraphs:
                 if subfield is not None:
-                    paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                else:
                     paragraph_key = (
-                        f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                        f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                     )
+                else:
+                    paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                 if enabled_metadata.labels:
                     metadata.labels.ClearField("paragraph")
@@ -1257,9 +1084,7 @@ class Resource:

         yield pb_paragraph

-    async def iterate_fields(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainField]:
+    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1269,7 +1094,7 @@ class Resource:
             metadata.labels.resource.extend(self.basic.usermetadata.classifications)

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             fm = await field.get_field_metadata()
             extracted_text = None

@@ -1279,9 +1104,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1298,7 +1121,7 @@ class Resource:

                 if enabled_metadata.entities:
                     metadata.ClearField("entities")
-                    metadata.entities.update(splitted_metadata.ner)
+                    _update_entities_dict(metadata.entities, splitted_metadata)

                 pb_field = TrainField()
                 pb_field.uuid = self.uuid
@@ -1306,9 +1129,7 @@ class Resource:
             pb_field.metadata.CopyFrom(metadata)
             yield pb_field

-    async def generate_train_resource(
-        self, enabled_metadata: EnabledMetadata
-    ) -> TrainResource:
+    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1335,9 +1156,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1346,7 +1165,7 @@ class Resource:
                 metadata.labels.field.extend(splitted_metadata.classifications)

             if enabled_metadata.entities:
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)

         pb_resource = TrainResource()
         pb_resource.uuid = self.uuid
@@ -1375,33 +1194,35 @@ def remove_field_classifications(basic: PBBasic, deleted_fields: list[FieldID]):
     Clean classifications of fields that have been deleted
     """
     field_classifications = [
-        fc
-        for fc in basic.computedmetadata.field_classifications
-        if fc.field not in deleted_fields
+        fc for fc in basic.computedmetadata.field_classifications if fc.field not in deleted_fields
     ]
     basic.computedmetadata.ClearField("field_classifications")
     basic.computedmetadata.field_classifications.extend(field_classifications)


-def add_field_classifications(
-    basic: PBBasic, fcmw: FieldComputedMetadataWrapper
-) -> bool:
+def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper) -> bool:
     """
     Returns whether some new field classifications were added
     """
-    if len(fcmw.metadata.metadata.classifications) == 0:
+    if len(fcmw.metadata.metadata.classifications) == 0 and all(
+        len(split.classifications) == 0 for split in fcmw.metadata.split_metadata.values()
+    ):
         return False
+
     remove_field_classifications(basic, [fcmw.field])
     fcfs = FieldClassifications()
     fcfs.field.CopyFrom(fcmw.field)
     fcfs.classifications.extend(fcmw.metadata.metadata.classifications)
+
+    for split_id, split in fcmw.metadata.split_metadata.items():
+        if split_id not in fcmw.metadata.deleted_splits:
+            fcfs.classifications.extend(split.classifications)
+
     basic.computedmetadata.field_classifications.append(fcfs)
     return True


-def add_entities_to_metadata(
-    entities: dict[str, str], local_text: str, metadata: TrainMetadata
-) -> None:
+def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
     for entity_key, entity_value in entities.items():
         if entity_key not in local_text:
             # Add the entity only if found in text
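`add_field_classifications` now bails out only when neither the main metadata nor any split carries classifications, and it aggregates classifications from non-deleted splits. A condensed sketch of that logic, with plain dataclasses standing in for the protobuf messages:

```python
# Condensed sketch of the split-aware classification logic above.
# SplitMeta/FieldMeta are simplified stand-ins for the protobuf types.
from dataclasses import dataclass, field


@dataclass
class SplitMeta:
    classifications: list[str] = field(default_factory=list)


@dataclass
class FieldMeta:
    classifications: list[str] = field(default_factory=list)
    split_metadata: dict[str, SplitMeta] = field(default_factory=dict)
    deleted_splits: list[str] = field(default_factory=list)


def collect_classifications(meta: FieldMeta) -> list[str]:
    # Early exit mirrors the new condition: nothing anywhere, nothing to add.
    if len(meta.classifications) == 0 and all(
        len(split.classifications) == 0 for split in meta.split_metadata.values()
    ):
        return []  # the real function returns False here
    result = list(meta.classifications)
    for split_id, split in meta.split_metadata.items():
        if split_id not in meta.deleted_splits:
            result.extend(split.classifications)
    return result
```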
@@ -1415,9 +1236,7 @@ def add_entities_to_metadata(
     for _ in range(local_text.count(entity_key)):
         start = local_text.index(entity_key, last_occurrence_end)
         end = start + len(entity_key)
-        metadata.entity_positions[poskey].positions.append(
-            TrainPosition(start=start, end=end)
-        )
+        metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
         last_occurrence_end = end

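The loop above records one `TrainPosition` per non-overlapping occurrence of the entity text. The same scan, as a standalone function for clarity:

```python
# Standalone version of the occurrence scan above: record (start, end)
# offsets for every non-overlapping occurrence of an entity in a text.
def entity_positions(text: str, entity: str) -> list[tuple[int, int]]:
    positions = []
    last_end = 0
    for _ in range(text.count(entity)):
        start = text.index(entity, last_end)
        end = start + len(entity)
        positions.append((start, end))
        last_end = end
    return positions


assert entity_positions("aba aba", "aba") == [(0, 3), (4, 7)]
```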
@@ -1432,15 +1251,22 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True


-def maybe_update_basic_thumbnail(
-    basic: PBBasic, thumbnail: Optional[CloudFile]
-) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile]) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
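`content_types.valid` is an internal nucliadb helper (added alongside migration 0026 for high-cardinality content types), so the sketch below is only a rough stand-in built on the standard library to show the intent of the new guard: reject mimetypes that are not well-formed, registered types before using them as an icon.

```python
# Rough stand-in for the content_types.valid() guard above, using only the
# standard library; the real helper is internal to nucliadb.
import mimetypes


def is_plausible_mimetype(mimetype: str) -> bool:
    if "/" not in mimetype:
        return False
    # guess_extension() returns None for types it does not recognize.
    return mimetypes.guess_extension(mimetype, strict=False) is not None


def maybe_update_icon(current_icon: str, mimetype: str) -> str:
    if current_icon not in ("", "application/octet-stream"):
        return current_icon  # already set or detected
    if not is_plausible_mimetype(mimetype):
        return current_icon  # skip invalid mimetype, as the new code does
    return mimetype
```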
@@ -1482,3 +1308,23 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
+
+
+def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
+    """
+    Update the entities dict with the entities from the field metadata.
+    Method created to ease the transition from legacy ner field to new entities field.
+    """
+    # Data Augmentation + Processor entities
+    # This will overwrite entities detected from more than one data augmentation task
+    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
+    entity_map = {
+        entity.text: entity.label
+        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
+        for entity in entities_wrapper.entities
+    }
+    target_entites_dict.update(entity_map)
+
+    # Legacy processor entities
+    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    target_entites_dict.update(field_metadata.ner)