nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/resource.py
CHANGED
@@ -23,19 +23,33 @@ import asyncio
|
|
23
23
|
import logging
|
24
24
|
from concurrent.futures import ThreadPoolExecutor
|
25
25
|
from functools import partial
|
26
|
-
from typing import TYPE_CHECKING, Any, AsyncIterator, Optional, Type
|
26
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
|
27
27
|
|
28
|
+
from nucliadb.common import datamanagers
|
29
|
+
from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
|
30
|
+
from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
|
31
|
+
from nucliadb.common.maindb.driver import Transaction
|
32
|
+
from nucliadb.ingest.fields.base import Field
|
33
|
+
from nucliadb.ingest.fields.conversation import Conversation
|
34
|
+
from nucliadb.ingest.fields.file import File
|
35
|
+
from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
|
36
|
+
from nucliadb.ingest.fields.link import Link
|
37
|
+
from nucliadb.ingest.fields.text import Text
|
38
|
+
from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
|
39
|
+
from nucliadb.ingest.orm.metrics import processor_observer
|
40
|
+
from nucliadb_models import content_types
|
41
|
+
from nucliadb_models.common import CloudLink
|
42
|
+
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
43
|
+
from nucliadb_protos import utils_pb2, writer_pb2
|
28
44
|
from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
|
29
|
-
from nucliadb_protos.resources_pb2 import Basic
|
30
|
-
from nucliadb_protos.resources_pb2 import Basic as PBBasic
|
31
|
-
from nucliadb_protos.resources_pb2 import CloudFile
|
32
|
-
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
33
|
-
from nucliadb_protos.resources_pb2 import Extra as PBExtra
|
34
45
|
from nucliadb_protos.resources_pb2 import (
|
46
|
+
Basic,
|
47
|
+
CloudFile,
|
35
48
|
ExtractedTextWrapper,
|
36
49
|
ExtractedVectorsWrapper,
|
37
50
|
FieldClassifications,
|
38
51
|
FieldComputedMetadataWrapper,
|
52
|
+
FieldFile,
|
39
53
|
FieldID,
|
40
54
|
FieldMetadata,
|
41
55
|
FieldQuestionAnswerWrapper,
|
@@ -44,40 +58,27 @@ from nucliadb_protos.resources_pb2 import (
|
|
44
58
|
FileExtractedData,
|
45
59
|
LargeComputedMetadataWrapper,
|
46
60
|
LinkExtractedData,
|
61
|
+
Metadata,
|
62
|
+
Paragraph,
|
63
|
+
ParagraphAnnotation,
|
47
64
|
)
|
48
|
-
from nucliadb_protos.resources_pb2 import
|
65
|
+
from nucliadb_protos.resources_pb2 import Basic as PBBasic
|
66
|
+
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
67
|
+
from nucliadb_protos.resources_pb2 import Extra as PBExtra
|
49
68
|
from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
|
50
69
|
from nucliadb_protos.resources_pb2 import Origin as PBOrigin
|
51
|
-
from nucliadb_protos.resources_pb2 import Paragraph, ParagraphAnnotation
|
52
70
|
from nucliadb_protos.resources_pb2 import Relations as PBRelations
|
53
|
-
from nucliadb_protos.train_pb2 import EnabledMetadata
|
54
|
-
from nucliadb_protos.train_pb2 import Position as TrainPosition
|
55
71
|
from nucliadb_protos.train_pb2 import (
|
72
|
+
EnabledMetadata,
|
56
73
|
TrainField,
|
57
74
|
TrainMetadata,
|
58
75
|
TrainParagraph,
|
59
76
|
TrainResource,
|
60
77
|
TrainSentence,
|
61
78
|
)
|
79
|
+
from nucliadb_protos.train_pb2 import Position as TrainPosition
|
62
80
|
from nucliadb_protos.utils_pb2 import Relation as PBRelation
|
63
81
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
64
|
-
|
65
|
-
from nucliadb.common import datamanagers
|
66
|
-
from nucliadb.common.maindb.driver import Transaction
|
67
|
-
from nucliadb.ingest.fields.base import Field
|
68
|
-
from nucliadb.ingest.fields.conversation import Conversation
|
69
|
-
from nucliadb.ingest.fields.date import Datetime
|
70
|
-
from nucliadb.ingest.fields.file import File
|
71
|
-
from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
|
72
|
-
from nucliadb.ingest.fields.keywordset import Keywordset
|
73
|
-
from nucliadb.ingest.fields.layout import Layout
|
74
|
-
from nucliadb.ingest.fields.link import Link
|
75
|
-
from nucliadb.ingest.fields.text import Text
|
76
|
-
from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
|
77
|
-
from nucliadb.ingest.orm.metrics import processor_observer
|
78
|
-
from nucliadb_models.common import CloudLink
|
79
|
-
from nucliadb_models.writer import GENERIC_MIME_TYPE
|
80
|
-
from nucliadb_protos import utils_pb2, writer_pb2
|
81
82
|
from nucliadb_utils.storages.storage import Storage
|
82
83
|
|
83
84
|
if TYPE_CHECKING: # pragma: no cover
|
@@ -85,33 +86,14 @@ if TYPE_CHECKING: # pragma: no cover
|
|
85
86
|
|
86
87
|
logger = logging.getLogger(__name__)
|
87
88
|
|
88
|
-
KB_RESOURCE_FIELDS = "/kbs/{kbid}/r/{uuid}/f/"
|
89
|
-
KB_RESOURCE_SLUG_BASE = "/kbs/{kbid}/s/"
|
90
|
-
KB_RESOURCE_SLUG = f"{KB_RESOURCE_SLUG_BASE}{{slug}}"
|
91
89
|
KB_FIELDS: dict[int, Type] = {
|
92
|
-
FieldType.LAYOUT: Layout,
|
93
90
|
FieldType.TEXT: Text,
|
94
91
|
FieldType.FILE: File,
|
95
92
|
FieldType.LINK: Link,
|
96
|
-
FieldType.DATETIME: Datetime,
|
97
|
-
FieldType.KEYWORDSET: Keywordset,
|
98
93
|
FieldType.GENERIC: Generic,
|
99
94
|
FieldType.CONVERSATION: Conversation,
|
100
95
|
}
|
101
96
|
|
102
|
-
KB_REVERSE: dict[str, FieldType.ValueType] = {
|
103
|
-
"l": FieldType.LAYOUT,
|
104
|
-
"t": FieldType.TEXT,
|
105
|
-
"f": FieldType.FILE,
|
106
|
-
"u": FieldType.LINK,
|
107
|
-
"d": FieldType.DATETIME,
|
108
|
-
"k": FieldType.KEYWORDSET,
|
109
|
-
"a": FieldType.GENERIC,
|
110
|
-
"c": FieldType.CONVERSATION,
|
111
|
-
}
|
112
|
-
|
113
|
-
FIELD_TYPE_TO_ID = {v: k for k, v in KB_REVERSE.items()}
|
114
|
-
|
115
97
|
_executor = ThreadPoolExecutor(10)
|
116
98
|
|
117
99
|
|
@@ -122,6 +104,8 @@ PB_TEXT_FORMAT_TO_MIMETYPE = {
|
|
122
104
|
FieldText.Format.MARKDOWN: "text/markdown",
|
123
105
|
FieldText.Format.JSON: "application/json",
|
124
106
|
FieldText.Format.KEEP_MARKDOWN: "text/markdown",
|
107
|
+
FieldText.Format.JSONL: "application/x-ndjson",
|
108
|
+
FieldText.Format.PLAIN_BLANKLINE_SPLIT: "text/plain+blankline",
|
125
109
|
}
|
126
110
|
|
127
111
|
BASIC_IMMUTABLE_FIELDS = ("icon",)
|
@@ -173,9 +157,7 @@ class Resource:
|
|
173
157
|
# Basic
|
174
158
|
async def get_basic(self) -> Optional[PBBasic]:
|
175
159
|
if self.basic is None:
|
176
|
-
basic = await datamanagers.resources.get_basic(
|
177
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
178
|
-
)
|
160
|
+
basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
179
161
|
self.basic = basic if basic is not None else PBBasic()
|
180
162
|
return self.basic
|
181
163
|
|
@@ -221,9 +203,7 @@ class Resource:
|
|
221
203
|
fields.append(field_id)
|
222
204
|
positions[field_id] = i
|
223
205
|
|
224
|
-
updated = [
|
225
|
-
self.basic.fieldmetadata[positions[field]] for field in fields
|
226
|
-
]
|
206
|
+
updated = [self.basic.fieldmetadata[positions[field]] for field in fields]
|
227
207
|
|
228
208
|
del self.basic.fieldmetadata[:]
|
229
209
|
self.basic.fieldmetadata.extend(updated)
|
@@ -244,11 +224,10 @@ class Resource:
|
|
244
224
|
self.indexer.apply_field_metadata(
|
245
225
|
field_id,
|
246
226
|
field_metadata,
|
247
|
-
replace_field=[],
|
248
|
-
replace_splits={},
|
249
227
|
page_positions=page_positions,
|
250
228
|
extracted_text=await field_obj.get_extracted_text(),
|
251
229
|
basic_user_field_metadata=user_field_metadata,
|
230
|
+
replace_field=True,
|
252
231
|
)
|
253
232
|
|
254
233
|
# Some basic fields are computed off field metadata.
|
@@ -264,9 +243,7 @@ class Resource:
|
|
264
243
|
# Origin
|
265
244
|
async def get_origin(self) -> Optional[PBOrigin]:
|
266
245
|
if self.origin is None:
|
267
|
-
origin = await datamanagers.resources.get_origin(
|
268
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
269
|
-
)
|
246
|
+
origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
270
247
|
self.origin = origin
|
271
248
|
return self.origin
|
272
249
|
|
@@ -280,16 +257,12 @@ class Resource:
|
|
280
257
|
# Extra
|
281
258
|
async def get_extra(self) -> Optional[PBExtra]:
|
282
259
|
if self.extra is None:
|
283
|
-
extra = await datamanagers.resources.get_extra(
|
284
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
285
|
-
)
|
260
|
+
extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
|
286
261
|
self.extra = extra
|
287
262
|
return self.extra
|
288
263
|
|
289
264
|
async def set_extra(self, payload: PBExtra):
|
290
|
-
await datamanagers.resources.set_extra(
|
291
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload
|
292
|
-
)
|
265
|
+
await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
|
293
266
|
self.modified = True
|
294
267
|
self.extra = payload
|
295
268
|
|
@@ -329,7 +302,7 @@ class Resource:
|
|
329
302
|
self.relations = relations
|
330
303
|
|
331
304
|
@processor_observer.wrap({"type": "generate_index_message"})
|
332
|
-
async def generate_index_message(self) -> ResourceBrain:
|
305
|
+
async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
|
333
306
|
brain = ResourceBrain(rid=self.uuid)
|
334
307
|
origin = await self.get_origin()
|
335
308
|
basic = await self.get_basic()
|
@@ -339,7 +312,7 @@ class Resource:
|
|
339
312
|
await self.compute_global_tags(brain)
|
340
313
|
fields = await self.get_fields(force=True)
|
341
314
|
for (type_id, field_id), field in fields.items():
|
342
|
-
fieldid = FieldID(field_type=type_id, field=field_id)
|
315
|
+
fieldid = FieldID(field_type=type_id, field=field_id)
|
343
316
|
await self.compute_global_text_field(fieldid, brain)
|
344
317
|
|
345
318
|
field_metadata = await field.get_field_metadata()
|
@@ -355,234 +328,66 @@ class Resource:
|
|
355
328
|
(
|
356
329
|
fm
|
357
330
|
for fm in basic.fieldmetadata
|
358
|
-
if fm.field.field == field_id
|
359
|
-
and fm.field.field_type == type_id
|
331
|
+
if fm.field.field == field_id and fm.field.field_type == type_id
|
360
332
|
),
|
361
333
|
None,
|
362
334
|
)
|
363
335
|
brain.apply_field_metadata(
|
364
336
|
field_key,
|
365
337
|
field_metadata,
|
366
|
-
replace_field=[],
|
367
|
-
replace_splits={},
|
368
338
|
page_positions=page_positions,
|
369
339
|
extracted_text=await field.get_extracted_text(),
|
370
340
|
basic_user_field_metadata=user_field_metadata,
|
341
|
+
replace_field=reindex,
|
371
342
|
)
|
372
343
|
|
373
344
|
if self.disable_vectors is False:
|
345
|
+
# XXX: while we don't remove the "default" vectorset concept, we
|
346
|
+
# need to do use None as the default one
|
374
347
|
vo = await field.get_vectors()
|
375
348
|
if vo is not None:
|
376
|
-
|
377
|
-
|
378
|
-
|
349
|
+
async with datamanagers.with_ro_transaction() as ro_txn:
|
350
|
+
dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
|
351
|
+
ro_txn, kbid=self.kb.kbid
|
352
|
+
)
|
379
353
|
brain.apply_field_vectors(
|
380
354
|
field_key,
|
381
355
|
vo,
|
382
356
|
matryoshka_vector_dimension=dimension,
|
357
|
+
replace_field=reindex,
|
383
358
|
)
|
384
|
-
return brain
|
385
|
-
|
386
|
-
async def generate_field_vectors(
|
387
|
-
self,
|
388
|
-
bm: BrokerMessage,
|
389
|
-
type_id: FieldType.ValueType,
|
390
|
-
field_id: str,
|
391
|
-
field: Field,
|
392
|
-
):
|
393
|
-
vo = await field.get_vectors()
|
394
|
-
if vo is None:
|
395
|
-
return
|
396
|
-
evw = ExtractedVectorsWrapper()
|
397
|
-
evw.field.field = field_id
|
398
|
-
evw.field.field_type = type_id # type: ignore
|
399
|
-
evw.vectors.CopyFrom(vo)
|
400
|
-
bm.field_vectors.append(evw)
|
401
|
-
|
402
|
-
async def generate_field_large_computed_metadata(
|
403
|
-
self,
|
404
|
-
bm: BrokerMessage,
|
405
|
-
type_id: FieldType.ValueType,
|
406
|
-
field_id: str,
|
407
|
-
field: Field,
|
408
|
-
):
|
409
|
-
lcm = await field.get_large_field_metadata()
|
410
|
-
if lcm is None:
|
411
|
-
return
|
412
|
-
lcmw = LargeComputedMetadataWrapper()
|
413
|
-
lcmw.field.field = field_id
|
414
|
-
lcmw.field.field_type = type_id # type: ignore
|
415
|
-
lcmw.real.CopyFrom(lcm)
|
416
|
-
bm.field_large_metadata.append(lcmw)
|
417
|
-
|
418
|
-
async def generate_field_computed_metadata(
|
419
|
-
self,
|
420
|
-
bm: BrokerMessage,
|
421
|
-
type_id: FieldType.ValueType,
|
422
|
-
field_id: str,
|
423
|
-
field: Field,
|
424
|
-
):
|
425
|
-
fcmw = FieldComputedMetadataWrapper()
|
426
|
-
fcmw.field.field = field_id
|
427
|
-
fcmw.field.field_type = type_id # type: ignore
|
428
|
-
|
429
|
-
field_metadata = await field.get_field_metadata()
|
430
|
-
if field_metadata is not None:
|
431
|
-
fcmw.metadata.CopyFrom(field_metadata)
|
432
|
-
fcmw.field.field = field_id
|
433
|
-
fcmw.field.field_type = type_id # type: ignore
|
434
|
-
bm.field_metadata.append(fcmw)
|
435
|
-
# Make sure cloud files are removed for exporting
|
436
|
-
|
437
|
-
async def generate_extracted_text(
|
438
|
-
self,
|
439
|
-
bm: BrokerMessage,
|
440
|
-
type_id: FieldType.ValueType,
|
441
|
-
field_id: str,
|
442
|
-
field: Field,
|
443
|
-
):
|
444
|
-
etw = ExtractedTextWrapper()
|
445
|
-
etw.field.field = field_id
|
446
|
-
etw.field.field_type = type_id # type: ignore
|
447
|
-
extracted_text = await field.get_extracted_text()
|
448
|
-
if extracted_text is not None:
|
449
|
-
etw.body.CopyFrom(extracted_text)
|
450
|
-
bm.extracted_text.append(etw)
|
451
|
-
|
452
|
-
async def generate_field(
|
453
|
-
self,
|
454
|
-
bm: BrokerMessage,
|
455
|
-
type_id: FieldType.ValueType,
|
456
|
-
field_id: str,
|
457
|
-
field: Field,
|
458
|
-
):
|
459
|
-
# Used for exporting a field
|
460
|
-
if type_id == FieldType.TEXT:
|
461
|
-
value = await field.get_value()
|
462
|
-
bm.texts[field_id].CopyFrom(value)
|
463
|
-
elif type_id == FieldType.LINK:
|
464
|
-
value = await field.get_value()
|
465
|
-
bm.links[field_id].CopyFrom(value)
|
466
|
-
elif type_id == FieldType.FILE:
|
467
|
-
value = await field.get_value()
|
468
|
-
bm.files[field_id].CopyFrom(value)
|
469
|
-
elif type_id == FieldType.CONVERSATION:
|
470
|
-
value = await self.get_full_conversation(field) # type: ignore
|
471
|
-
bm.conversations[field_id].CopyFrom(value)
|
472
|
-
elif type_id == FieldType.KEYWORDSET:
|
473
|
-
value = await field.get_value()
|
474
|
-
bm.keywordsets[field_id].CopyFrom(value)
|
475
|
-
elif type_id == FieldType.DATETIME:
|
476
|
-
value = await field.get_value()
|
477
|
-
bm.datetimes[field_id].CopyFrom(value)
|
478
|
-
elif type_id == FieldType.LAYOUT:
|
479
|
-
value = await field.get_value()
|
480
|
-
bm.layouts[field_id].CopyFrom(value)
|
481
|
-
|
482
|
-
async def get_full_conversation(
|
483
|
-
self,
|
484
|
-
conversation_field: Conversation,
|
485
|
-
) -> Optional[PBConversation]:
|
486
|
-
"""
|
487
|
-
Messages of a conversations may be stored across several pages.
|
488
|
-
This method fetches them all and returns a single complete conversation.
|
489
|
-
"""
|
490
|
-
full_conv = PBConversation()
|
491
|
-
n_page = 1
|
492
|
-
while True:
|
493
|
-
page = await conversation_field.get_value(page=n_page)
|
494
|
-
if page is None:
|
495
|
-
break
|
496
|
-
full_conv.messages.extend(page.messages)
|
497
|
-
n_page += 1
|
498
|
-
return full_conv
|
499
|
-
|
500
|
-
async def generate_broker_message(self) -> BrokerMessage:
|
501
|
-
# full means downloading all the pointers
|
502
|
-
# minuts the ones to external files that are not PB
|
503
|
-
# Go for all fields and recreate brain
|
504
|
-
bm = BrokerMessage()
|
505
|
-
bm.kbid = self.kb.kbid
|
506
|
-
bm.uuid = self.uuid
|
507
|
-
basic = await self.get_basic()
|
508
|
-
if basic is not None:
|
509
|
-
bm.basic.CopyFrom(basic)
|
510
|
-
bm.slug = bm.basic.slug
|
511
|
-
origin = await self.get_origin()
|
512
|
-
if origin is not None:
|
513
|
-
bm.origin.CopyFrom(origin)
|
514
|
-
relations = await self.get_relations()
|
515
|
-
if relations is not None:
|
516
|
-
for relation in relations.relations:
|
517
|
-
bm.relations.append(relation)
|
518
|
-
|
519
|
-
fields = await self.get_fields(force=True)
|
520
|
-
for (type_id, field_id), field in fields.items():
|
521
|
-
# Value
|
522
|
-
await self.generate_field(bm, type_id, field_id, field)
|
523
|
-
|
524
|
-
# Extracted text
|
525
|
-
await self.generate_extracted_text(bm, type_id, field_id, field)
|
526
|
-
|
527
|
-
# Field Computed Metadata
|
528
|
-
await self.generate_field_computed_metadata(bm, type_id, field_id, field)
|
529
|
-
|
530
|
-
if type_id == FieldType.FILE and isinstance(field, File):
|
531
|
-
field_extracted_data = await field.get_file_extracted_data()
|
532
|
-
if field_extracted_data is not None:
|
533
|
-
bm.file_extracted_data.append(field_extracted_data)
|
534
359
|
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
360
|
+
vectorset_configs = []
|
361
|
+
async with datamanagers.with_ro_transaction() as ro_txn:
|
362
|
+
async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
|
363
|
+
ro_txn, kbid=self.kb.kbid
|
364
|
+
):
|
365
|
+
vectorset_configs.append(vectorset_config)
|
366
|
+
for vectorset_config in vectorset_configs:
|
367
|
+
vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
|
368
|
+
if vo is not None:
|
369
|
+
dimension = vectorset_config.vectorset_index_config.vector_dimension
|
370
|
+
brain.apply_field_vectors(
|
371
|
+
field_key,
|
372
|
+
vo,
|
373
|
+
vectorset=vectorset_config.vectorset_id,
|
374
|
+
matryoshka_vector_dimension=dimension,
|
375
|
+
replace_field=reindex,
|
376
|
+
)
|
377
|
+
return brain
|
549
378
|
|
550
379
|
# Fields
|
551
|
-
async def get_fields(
|
552
|
-
self, force: bool = False
|
553
|
-
) -> dict[tuple[FieldType.ValueType, str], Field]:
|
380
|
+
async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
|
554
381
|
# Get all fields
|
555
382
|
for type, field in await self.get_fields_ids(force=force):
|
556
383
|
if (type, field) not in self.fields:
|
557
384
|
self.fields[(type, field)] = await self.get_field(field, type)
|
558
385
|
return self.fields
|
559
386
|
|
560
|
-
async def _deprecated_scan_fields_ids(
|
561
|
-
self,
|
562
|
-
) -> AsyncIterator[tuple[FieldType.ValueType, str]]:
|
563
|
-
logger.warning("Scanning fields ids. This is not optimal.")
|
564
|
-
prefix = KB_RESOURCE_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
|
565
|
-
allfields = set()
|
566
|
-
async for key in self.txn.keys(prefix, count=-1):
|
567
|
-
# The [6:8] `slicing purpose is to match exactly the two
|
568
|
-
# splitted parts corresponding to type and field, and nothing else!
|
569
|
-
type, field = key.split("/")[6:8]
|
570
|
-
type_id = KB_REVERSE.get(type)
|
571
|
-
if type_id is None:
|
572
|
-
raise AttributeError("Invalid field type")
|
573
|
-
result = (type_id, field)
|
574
|
-
if result not in allfields:
|
575
|
-
# fields can have errors that are stored in a subkey:
|
576
|
-
# - field key -> kbs/kbid/r/ruuid/f/myfield
|
577
|
-
# - field error key -> kbs/kbid/r/ruuid/f/myfield/errors
|
578
|
-
# and that would return duplicates here.
|
579
|
-
yield result
|
580
|
-
allfields.add(result)
|
581
|
-
|
582
387
|
async def _inner_get_fields_ids(self) -> list[tuple[FieldType.ValueType, str]]:
|
583
388
|
# Use a set to make sure we don't have duplicate field ids
|
584
389
|
result = set()
|
585
|
-
all_fields = await self.get_all_field_ids()
|
390
|
+
all_fields = await self.get_all_field_ids(for_update=False)
|
586
391
|
if all_fields is not None:
|
587
392
|
for f in all_fields.fields:
|
588
393
|
result.add((f.field_type, f.field))
|
@@ -599,9 +404,7 @@ class Resource:
|
|
599
404
|
result.add((FieldType.GENERIC, generic))
|
600
405
|
return list(result)
|
601
406
|
|
602
|
-
async def get_fields_ids(
|
603
|
-
self, force: bool = False
|
604
|
-
) -> list[tuple[FieldType.ValueType, str]]:
|
407
|
+
async def get_fields_ids(self, force: bool = False) -> list[tuple[FieldType.ValueType, str]]:
|
605
408
|
"""
|
606
409
|
Get all ids of the fields of the resource and cache them.
|
607
410
|
"""
|
@@ -645,23 +448,20 @@ class Resource:
|
|
645
448
|
if field in self.all_fields_keys:
|
646
449
|
self.all_fields_keys.remove(field)
|
647
450
|
|
648
|
-
field_key = self.generate_field_id(FieldID(field_type=type, field=key))
|
649
|
-
vo = await field_obj.get_vectors()
|
650
|
-
if vo is not None:
|
651
|
-
self.indexer.delete_vectors(field_key=field_key, vo=vo)
|
451
|
+
field_key = self.generate_field_id(FieldID(field_type=type, field=key))
|
652
452
|
|
653
453
|
metadata = await field_obj.get_field_metadata()
|
654
454
|
if metadata is not None:
|
655
|
-
self.indexer.
|
455
|
+
self.indexer.delete_field(field_key=field_key)
|
656
456
|
|
657
457
|
await field_obj.delete()
|
658
458
|
|
659
459
|
def has_field(self, type: FieldType.ValueType, field: str) -> bool:
|
660
460
|
return (type, field) in self.fields
|
661
461
|
|
662
|
-
async def get_all_field_ids(self) -> Optional[PBAllFieldIDs]:
|
462
|
+
async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
|
663
463
|
return await datamanagers.resources.get_all_field_ids(
|
664
|
-
self.txn, kbid=self.kb.kbid, rid=self.uuid
|
464
|
+
self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
|
665
465
|
)
|
666
466
|
|
667
467
|
async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
|
@@ -677,7 +477,7 @@ class Resource:
|
|
677
477
|
errors: Optional[list[writer_pb2.Error]] = None,
|
678
478
|
):
|
679
479
|
needs_update = False
|
680
|
-
all_fields = await self.get_all_field_ids()
|
480
|
+
all_fields = await self.get_all_field_ids(for_update=True)
|
681
481
|
if all_fields is None:
|
682
482
|
needs_update = True
|
683
483
|
all_fields = PBAllFieldIDs()
|
@@ -704,26 +504,12 @@ class Resource:
|
|
704
504
|
@processor_observer.wrap({"type": "apply_fields"})
|
705
505
|
async def apply_fields(self, message: BrokerMessage):
|
706
506
|
message_updated_fields = []
|
707
|
-
for field, layout in message.layouts.items():
|
708
|
-
fid = FieldID(field_type=FieldType.LAYOUT, field=field)
|
709
|
-
await self.set_field(fid.field_type, fid.field, layout)
|
710
|
-
message_updated_fields.append(fid)
|
711
507
|
|
712
508
|
for field, text in message.texts.items():
|
713
509
|
fid = FieldID(field_type=FieldType.TEXT, field=field)
|
714
510
|
await self.set_field(fid.field_type, fid.field, text)
|
715
511
|
message_updated_fields.append(fid)
|
716
512
|
|
717
|
-
for field, keywordset in message.keywordsets.items():
|
718
|
-
fid = FieldID(field_type=FieldType.KEYWORDSET, field=field)
|
719
|
-
await self.set_field(fid.field_type, fid.field, keywordset)
|
720
|
-
message_updated_fields.append(fid)
|
721
|
-
|
722
|
-
for field, datetimeobj in message.datetimes.items():
|
723
|
-
fid = FieldID(field_type=FieldType.DATETIME, field=field)
|
724
|
-
await self.set_field(fid.field_type, fid.field, datetimeobj)
|
725
|
-
message_updated_fields.append(fid)
|
726
|
-
|
727
513
|
for field, link in message.links.items():
|
728
514
|
fid = FieldID(field_type=FieldType.LINK, field=field)
|
729
515
|
await self.set_field(fid.field_type, fid.field, link)
|
@@ -742,13 +528,11 @@ class Resource:
|
|
742
528
|
for fieldid in message.delete_fields:
|
743
529
|
await self.delete_field(fieldid.field_type, fieldid.field)
|
744
530
|
|
745
|
-
if (
|
746
|
-
len(message_updated_fields)
|
747
|
-
or len(message.delete_fields)
|
748
|
-
or len(message.errors)
|
749
|
-
):
|
531
|
+
if len(message_updated_fields) or len(message.delete_fields) or len(message.errors):
|
750
532
|
await self.update_all_field_ids(
|
751
|
-
updated=message_updated_fields,
|
533
|
+
updated=message_updated_fields,
|
534
|
+
deleted=message.delete_fields, # type: ignore
|
535
|
+
errors=message.errors, # type: ignore
|
752
536
|
)
|
753
537
|
|
754
538
|
@processor_observer.wrap({"type": "apply_extracted"})
|
@@ -784,13 +568,15 @@ class Resource:
|
|
784
568
|
|
785
569
|
for link_extracted_data in message.link_extracted_data:
|
786
570
|
await self._apply_link_extracted_data(link_extracted_data)
|
787
|
-
await self.
|
571
|
+
await self.maybe_update_resource_title_from_link(link_extracted_data)
|
788
572
|
extracted_languages.append(link_extracted_data.language)
|
789
573
|
|
790
574
|
for file_extracted_data in message.file_extracted_data:
|
791
575
|
await self._apply_file_extracted_data(file_extracted_data)
|
792
576
|
extracted_languages.append(file_extracted_data.language)
|
793
577
|
|
578
|
+
await self.maybe_update_resource_title_from_file_extracted_data(message)
|
579
|
+
|
794
580
|
# Metadata should go first
|
795
581
|
for field_metadata in message.field_metadata:
|
796
582
|
await self._apply_field_computed_metadata(field_metadata)
|
@@ -801,6 +587,7 @@ class Resource:
|
|
801
587
|
# Upload to binary storage
|
802
588
|
# Vector indexing
|
803
589
|
if self.disable_vectors is False:
|
590
|
+
await self.get_fields(force=True)
|
804
591
|
for field_vectors in message.field_vectors:
|
805
592
|
await self._apply_extracted_vectors(field_vectors)
|
806
593
|
|
@@ -826,9 +613,7 @@ class Resource:
|
|
826
613
|
extracted_text.field,
|
827
614
|
)
|
828
615
|
|
829
|
-
async def _apply_question_answers(
|
830
|
-
self, question_answers: FieldQuestionAnswerWrapper
|
831
|
-
):
|
616
|
+
async def _apply_question_answers(self, question_answers: FieldQuestionAnswerWrapper):
|
832
617
|
field = question_answers.field
|
833
618
|
field_obj = await self.get_field(field.field, field.field_type, load=False)
|
834
619
|
await field_obj.set_question_answers(question_answers)
|
@@ -848,19 +633,27 @@ class Resource:
|
|
848
633
|
|
849
634
|
maybe_update_basic_summary(self.basic, link_extracted_data.description)
|
850
635
|
|
851
|
-
async def
|
636
|
+
async def maybe_update_resource_title_from_link(self, link_extracted_data: LinkExtractedData):
|
637
|
+
"""
|
638
|
+
When parsing link extracted data, we want to replace the resource title for the first link
|
639
|
+
that gets processed and has a title, and only if the current title is a URL, which we take
|
640
|
+
as a hint that the title was not set by the user.
|
641
|
+
"""
|
852
642
|
assert self.basic is not None
|
853
643
|
if not link_extracted_data.title:
|
854
644
|
return
|
855
645
|
if not (self.basic.title.startswith("http") or self.basic.title == ""):
|
856
646
|
return
|
857
|
-
|
858
647
|
title = link_extracted_data.title
|
859
|
-
self.
|
648
|
+
await self.update_resource_title(title)
|
649
|
+
|
650
|
+
async def update_resource_title(self, computed_title: str) -> None:
|
651
|
+
assert self.basic is not None
|
652
|
+
self.basic.title = computed_title
|
860
653
|
# Extracted text
|
861
654
|
field = await self.get_field("title", FieldType.GENERIC, load=False)
|
862
655
|
etw = ExtractedTextWrapper()
|
863
|
-
etw.body.text =
|
656
|
+
etw.body.text = computed_title
|
864
657
|
await field.set_extracted_text(etw)
|
865
658
|
|
866
659
|
# Field computed metadata
|
@@ -872,11 +665,8 @@ class Resource:
|
|
872
665
|
fcm = await field.get_field_metadata(force=True)
|
873
666
|
if fcm is not None:
|
874
667
|
fcmw.metadata.CopyFrom(fcm)
|
875
|
-
|
876
668
|
fcmw.metadata.metadata.ClearField("paragraphs")
|
877
|
-
paragraph = Paragraph(
|
878
|
-
start=0, end=len(title), kind=Paragraph.TypeParagraph.TITLE
|
879
|
-
)
|
669
|
+
paragraph = Paragraph(start=0, end=len(computed_title), kind=Paragraph.TypeParagraph.TITLE)
|
880
670
|
fcmw.metadata.metadata.paragraphs.append(paragraph)
|
881
671
|
|
882
672
|
await field.set_field_metadata(fcmw)
|
@@ -893,9 +683,54 @@ class Resource:
|
|
893
683
|
maybe_update_basic_icon(self.basic, file_extracted_data.icon)
|
894
684
|
maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
|
895
685
|
|
896
|
-
async def
|
897
|
-
|
898
|
-
|
686
|
+
async def _should_update_resource_title_from_file_metadata(self) -> bool:
|
687
|
+
"""
|
688
|
+
We only want to update resource title from file metadata if the title is empty,
|
689
|
+
equal to the resource uuid or equal to any of the file filenames in the resource.
|
690
|
+
"""
|
691
|
+
basic = await self.get_basic()
|
692
|
+
if basic is None:
|
693
|
+
return True
|
694
|
+
current_title = basic.title
|
695
|
+
if current_title == "":
|
696
|
+
# If the title is empty, we should update it
|
697
|
+
return True
|
698
|
+
if current_title == self.uuid:
|
699
|
+
# If the title is the same as the resource uuid, we should update it
|
700
|
+
return True
|
701
|
+
fields = await self.get_fields(force=True)
|
702
|
+
filenames = set()
|
703
|
+
for (field_type, _), field_obj in fields.items():
|
704
|
+
if field_type == FieldType.FILE:
|
705
|
+
field_value: Optional[FieldFile] = await field_obj.get_value()
|
706
|
+
if field_value is not None:
|
707
|
+
if field_value.file.filename not in ("", None):
|
708
|
+
filenames.add(field_value.file.filename)
|
709
|
+
if current_title in filenames:
|
710
|
+
# If the title is equal to any of the file filenames, we should update it
|
711
|
+
return True
|
712
|
+
return False
|
713
|
+
|
714
|
+
async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
|
715
|
+
"""
|
716
|
+
Update the resource title with the first file that has a title extracted.
|
717
|
+
"""
|
718
|
+
if not await self._should_update_resource_title_from_file_metadata():
|
719
|
+
return
|
720
|
+
for fed in message.file_extracted_data:
|
721
|
+
if fed.title == "":
|
722
|
+
# Skip if the extracted title is empty
|
723
|
+
continue
|
724
|
+
fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
|
725
|
+
logger.info(
|
726
|
+
"Updating resource title from file extracted data",
|
727
|
+
extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
|
728
|
+
)
|
729
|
+
await self.update_resource_title(fed.title)
|
730
|
+
# Break after the first file with a title is found
|
731
|
+
break
|
732
|
+
|
733
|
+
async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
|
899
734
|
assert self.basic is not None
|
900
735
|
maybe_update_basic_summary(self.basic, field_metadata.metadata.metadata.summary)
|
901
736
|
|
@@ -904,17 +739,11 @@ class Resource:
|
|
904
739
|
field_metadata.field.field_type,
|
905
740
|
load=False,
|
906
741
|
)
|
907
|
-
(
|
908
|
-
metadata,
|
909
|
-
replace_field,
|
910
|
-
replace_splits,
|
911
|
-
) = await field_obj.set_field_metadata(field_metadata)
|
742
|
+
metadata = await field_obj.set_field_metadata(field_metadata)
|
912
743
|
field_key = self.generate_field_id(field_metadata.field)
|
913
744
|
|
914
745
|
page_positions: Optional[FilePagePositions] = None
|
915
|
-
if field_metadata.field.field_type == FieldType.FILE and isinstance(
|
916
|
-
field_obj, File
|
917
|
-
):
|
746
|
+
if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
|
918
747
|
page_positions = await get_file_page_positions(field_obj)
|
919
748
|
|
920
749
|
user_field_metadata = next(
|
@@ -932,29 +761,24 @@ class Resource:
|
|
932
761
|
self.indexer.apply_field_metadata,
|
933
762
|
field_key,
|
934
763
|
metadata,
|
935
|
-
replace_field=replace_field,
|
936
|
-
replace_splits=replace_splits,
|
937
764
|
page_positions=page_positions,
|
938
765
|
extracted_text=extracted_text,
|
939
766
|
basic_user_field_metadata=user_field_metadata,
|
767
|
+
replace_field=True,
|
940
768
|
)
|
941
769
|
loop = asyncio.get_running_loop()
|
942
770
|
await loop.run_in_executor(_executor, apply_field_metadata)
|
943
771
|
|
944
|
-
maybe_update_basic_thumbnail(
|
945
|
-
self.basic, field_metadata.metadata.metadata.thumbnail
|
946
|
-
)
|
772
|
+
maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
|
947
773
|
|
948
774
|
add_field_classifications(self.basic, field_metadata)
|
949
775
|
|
950
776
|
async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
|
951
|
-
|
952
|
-
|
953
|
-
):
|
777
|
+
# Store vectors in the resource
|
778
|
+
|
779
|
+
if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
|
954
780
|
# skipping because field does not exist
|
955
|
-
logger.warning(
|
956
|
-
f'Field "{field_vectors.field.field}" does not exist, skipping vectors'
|
957
|
-
)
|
781
|
+
logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
|
958
782
|
return
|
959
783
|
|
960
784
|
field_obj = await self.get_field(
|
@@ -962,22 +786,36 @@ class Resource:
|
|
962
786
|
field_vectors.field.field_type,
|
963
787
|
load=False,
|
964
788
|
)
|
965
|
-
(
|
966
|
-
|
967
|
-
|
968
|
-
|
969
|
-
) = await field_obj.set_vectors(field_vectors)
|
789
|
+
vo = await field_obj.set_vectors(field_vectors)
|
790
|
+
|
791
|
+
# Prepare vectors to be indexed
|
792
|
+
|
970
793
|
field_key = self.generate_field_id(field_vectors.field)
|
971
794
|
if vo is not None:
|
972
|
-
|
973
|
-
|
974
|
-
|
795
|
+
vectorset_id = field_vectors.vectorset_id or None
|
796
|
+
if vectorset_id is None:
|
797
|
+
dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
|
798
|
+
self.txn, kbid=self.kb.kbid
|
799
|
+
)
|
800
|
+
else:
|
801
|
+
config = await datamanagers.vectorsets.get(
|
802
|
+
self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
|
803
|
+
)
|
804
|
+
if config is None:
|
805
|
+
logger.warning(
|
806
|
+
f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
|
807
|
+
)
|
808
|
+
return
|
809
|
+
dimension = config.vectorset_index_config.vector_dimension
|
810
|
+
if not dimension:
|
811
|
+
raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
|
812
|
+
|
975
813
|
apply_field_vectors_partial = partial(
|
976
814
|
self.indexer.apply_field_vectors,
|
977
815
|
field_key,
|
978
816
|
vo,
|
979
|
-
|
980
|
-
|
817
|
+
vectorset=vectorset_id,
|
818
|
+
replace_field=True,
|
981
819
|
matryoshka_vector_dimension=dimension,
|
982
820
|
)
|
983
821
|
loop = asyncio.get_running_loop()
|
@@ -985,9 +823,7 @@ class Resource:
|
|
985
823
|
else:
|
986
824
|
raise AttributeError("VO not found on set")
|
987
825
|
|
988
|
-
async def _apply_field_large_metadata(
|
989
|
-
self, field_large_metadata: LargeComputedMetadataWrapper
|
990
|
-
):
|
826
|
+
async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
|
991
827
|
field_obj = await self.get_field(
|
992
828
|
field_large_metadata.field.field,
|
993
829
|
field_large_metadata.field.field_type,
|
@@ -996,7 +832,7 @@ class Resource:
|
|
996
832
|
await field_obj.set_large_field_metadata(field_large_metadata)
|
997
833
|
|
998
834
|
def generate_field_id(self, field: FieldID) -> str:
|
999
|
-
return f"{
|
835
|
+
return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
|
1000
836
|
|
1001
837
|
async def compute_security(self, brain: ResourceBrain):
|
1002
838
|
security = await self.get_security()
|
@@ -1015,7 +851,7 @@ class Resource:
|
|
1015
851
|
brain.set_resource_metadata(basic=basic, origin=origin)
|
1016
852
|
for type, field in await self.get_fields_ids(force=True):
|
1017
853
|
fieldobj = await self.get_field(field, type, load=False)
|
1018
|
-
fieldid = FieldID(field_type=type, field=field)
|
854
|
+
fieldid = FieldID(field_type=type, field=field)
|
1019
855
|
fieldkey = self.generate_field_id(fieldid)
|
1020
856
|
extracted_metadata = await fieldobj.get_field_metadata()
|
1021
857
|
valid_user_field_metadata = None
|
@@ -1026,16 +862,16 @@ class Resource:
                 ):
                     valid_user_field_metadata = user_field_metadata
                     break
+
+            generated_by = await fieldobj.generated_by()
             brain.apply_field_labels(
                 fieldkey,
                 extracted_metadata,
                 self.uuid,
+                generated_by,
                 basic.usermetadata,
                 valid_user_field_metadata,
             )
-            if type == FieldType.KEYWORDSET:
-                field_data = await fieldobj.db_get_value()
-                brain.process_keywordset_fields(fieldkey, field_data)

     @processor_observer.wrap({"type": "compute_global_text"})
     async def compute_global_text(self):
@@ -1072,12 +908,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1092,9 +926,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

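Several hunks in this file collapse the same pattern onto one line: the field-level metadata and every split (subfield) metadata block are flattened into a single list of `(subfield, metadata)` pairs, with `None` marking the whole field. A self-contained sketch with plain strings in place of the protobuf messages:

```python
from typing import Optional

# Stand-ins for fm.metadata and fm.split_metadata (e.g. one entry per conversation message)
main_metadata = "field-metadata"
split_metadata = {"msg1": "split-metadata-1", "msg2": "split-metadata-2"}

field_metadatas: list[tuple[Optional[str], str]] = [(None, main_metadata)]
for subfield, split in split_metadata.items():
    field_metadatas.append((subfield, split))

# Downstream loops treat subfield=None as "the field itself"
for subfield, md in field_metadatas:
    print(subfield, md)
```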
@@ -1105,7 +937,7 @@ class Resource:

                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities
+                    _update_entities_dict(entities, field_metadata)

                 precomputed_vectors = {}
                 if vo is not None:
@@ -1116,9 +948,7 @@ class Resource:
                     vectors = vo.vectors
                     base_vector_key = f"{self.uuid}/{field_key}"
                     for index, vector in enumerate(vectors.vectors):
-                        vector_key = (
-                            f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                        )
+                        vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                         precomputed_vectors[vector_key] = vector.vector

                 if extracted_text is not None:
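The vector keys built here follow a fixed layout: `<resource-uuid>/<field-key>/<index>/<start>-<end>`. A small sketch of how `precomputed_vectors` ends up keyed (the ids and vectors below are made up):

```python
rid = "a1b2c3"           # hypothetical resource uuid
field_key = "t/summary"  # type prefix + field name
# (start, end, vector) triples standing in for vo.vectors.vectors
vectors = [(0, 120, [0.1, 0.2]), (120, 240, [0.3, 0.4])]

precomputed_vectors = {}
base_vector_key = f"{rid}/{field_key}"
for index, (start, end, vector) in enumerate(vectors):
    precomputed_vectors[f"{base_vector_key}/{index}/{start}-{end}"] = vector

assert list(precomputed_vectors) == ["a1b2c3/t/summary/0/0-120", "a1b2c3/t/summary/1/120-240"]
```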
@@ -1129,11 +959,11 @@ class Resource:

                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                     if enabled_metadata.labels:
                         metadata.labels.ClearField("field")
@@ -1147,7 +977,9 @@ class Resource:
                     if subfield is not None:
                         sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                     else:
-                        sentence_key =
+                        sentence_key = (
+                            f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                        )

                     if vo is not None:
                         metadata.ClearField("vector")
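Paragraph and sentence keys follow the same scheme as the vector keys, optionally inserting the subfield segment: `<uuid>/<field-key>[/<subfield>]/<start>-<end>` for paragraphs, with an extra sentence index for sentences. A sketch of both key builders:

```python
from typing import Optional


def paragraph_key(uuid: str, field_key: str, subfield: Optional[str], start: int, end: int) -> str:
    if subfield is not None:
        return f"{uuid}/{field_key}/{subfield}/{start}-{end}"
    return f"{uuid}/{field_key}/{start}-{end}"


def sentence_key(uuid: str, field_key: str, subfield: Optional[str], index: int, start: int, end: int) -> str:
    if subfield is not None:
        return f"{uuid}/{field_key}/{subfield}/{index}/{start}-{end}"
    return f"{uuid}/{field_key}/{index}/{start}-{end}"


assert paragraph_key("rid", "t/body", None, 0, 80) == "rid/t/body/0-80"
assert sentence_key("rid", "c/chat", "msg1", 2, 0, 40) == "rid/c/chat/msg1/2/0-40"
```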
@@ -1186,12 +1018,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1202,9 +1032,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1215,7 +1043,7 @@ class Resource:

                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities
+                    _update_entities_dict(entities, field_metadata)

                 if extracted_text is not None:
                     if subfield is not None:
@@ -1225,11 +1053,11 @@ class Resource:

                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"

                     if enabled_metadata.labels:
                         metadata.labels.ClearField("paragraph")
@@ -1257,9 +1085,7 @@ class Resource:

                     yield pb_paragraph

-    async def iterate_fields(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainField]:
+    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
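`iterate_fields` keeps its `AsyncIterator[TrainField]` contract; only the signature formatting changed. For reference, a minimal sketch of how such an async generator is consumed, with `TrainField` stubbed as a string:

```python
import asyncio
from typing import AsyncIterator


async def iterate_fields() -> AsyncIterator[str]:
    for field_key in ("t/title", "t/body"):
        yield field_key  # the real method yields TrainField protobufs


async def main() -> None:
    async for pb_field in iterate_fields():
        print(pb_field)


asyncio.run(main())
```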
@@ -1269,7 +1095,7 @@ class Resource:
             metadata.labels.resource.extend(self.basic.usermetadata.classifications)

         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)
+            fieldid = FieldID(field_type=type_id, field=field_id)
             fm = await field.get_field_metadata()
             extracted_text = None

@@ -1279,9 +1105,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1298,7 +1122,7 @@ class Resource:

             if enabled_metadata.entities:
                 metadata.ClearField("entities")
-                metadata.entities
+                _update_entities_dict(metadata.entities, splitted_metadata)

             pb_field = TrainField()
             pb_field.uuid = self.uuid
@@ -1306,9 +1130,7 @@ class Resource:
             pb_field.metadata.CopyFrom(metadata)
             yield pb_field

-    async def generate_train_resource(
-        self, enabled_metadata: EnabledMetadata
-    ) -> TrainResource:
+    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1335,9 +1157,7 @@ class Resource:
             if fm is None:
                 continue

-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))

@@ -1346,7 +1166,7 @@ class Resource:
                 metadata.labels.field.extend(splitted_metadata.classifications)

             if enabled_metadata.entities:
-                metadata.entities
+                _update_entities_dict(metadata.entities, splitted_metadata)

         pb_resource = TrainResource()
         pb_resource.uuid = self.uuid
@@ -1375,33 +1195,35 @@ def remove_field_classifications(basic: PBBasic, deleted_fields: list[FieldID]):
     Clean classifications of fields that have been deleted
     """
     field_classifications = [
-        fc
-        for fc in basic.computedmetadata.field_classifications
-        if fc.field not in deleted_fields
+        fc for fc in basic.computedmetadata.field_classifications if fc.field not in deleted_fields
     ]
     basic.computedmetadata.ClearField("field_classifications")
     basic.computedmetadata.field_classifications.extend(field_classifications)


-def add_field_classifications(
-    basic: PBBasic, fcmw: FieldComputedMetadataWrapper
-) -> bool:
+def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper) -> bool:
     """
     Returns whether some new field classifications were added
     """
-    if len(fcmw.metadata.metadata.classifications) == 0:
+    if len(fcmw.metadata.metadata.classifications) == 0 and all(
+        len(split.classifications) == 0 for split in fcmw.metadata.split_metadata.values()
+    ):
         return False
+
     remove_field_classifications(basic, [fcmw.field])
     fcfs = FieldClassifications()
     fcfs.field.CopyFrom(fcmw.field)
     fcfs.classifications.extend(fcmw.metadata.metadata.classifications)
+
+    for split_id, split in fcmw.metadata.split_metadata.items():
+        if split_id not in fcmw.metadata.deleted_splits:
+            fcfs.classifications.extend(split.classifications)
+
     basic.computedmetadata.field_classifications.append(fcfs)
     return True


-def add_entities_to_metadata(
-    entities: dict[str, str], local_text: str, metadata: TrainMetadata
-) -> None:
+def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
     for entity_key, entity_value in entities.items():
         if entity_key not in local_text:
             # Add the entity only if found in text
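`add_field_classifications` now also counts classifications coming from split (subfield) metadata, both in the emptiness check and when building the `FieldClassifications` entry, skipping deleted splits. A sketch of that selection logic using plain lists and dicts instead of the protobuf messages:

```python
from typing import Optional


def collect_classifications(
    field_classifications: list[str],
    split_metadata: dict[str, list[str]],
    deleted_splits: set[str],
) -> Optional[list[str]]:
    # Mirror of the new emptiness check: field-level AND all splits empty -> nothing to add
    if len(field_classifications) == 0 and all(
        len(split) == 0 for split in split_metadata.values()
    ):
        return None
    collected = list(field_classifications)
    for split_id, split in split_metadata.items():
        if split_id not in deleted_splits:
            collected.extend(split)
    return collected


assert collect_classifications([], {"m1": []}, set()) is None
assert collect_classifications([], {"m1": ["/l/a"], "m2": ["/l/b"]}, {"m2"}) == ["/l/a"]
```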
@@ -1415,9 +1237,7 @@ def add_entities_to_metadata(
         for _ in range(local_text.count(entity_key)):
             start = local_text.index(entity_key, last_occurrence_end)
             end = start + len(entity_key)
-            metadata.entity_positions[poskey].positions.append(
-                TrainPosition(start=start, end=end)
-            )
+            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
             last_occurrence_end = end


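The occurrence scan above relies on `str.count` plus `str.index` with a moving offset to record every non-overlapping position of an entity in the text. Isolated as a small function:

```python
def entity_positions(text: str, entity: str) -> list[tuple[int, int]]:
    positions = []
    last_occurrence_end = 0
    for _ in range(text.count(entity)):
        start = text.index(entity, last_occurrence_end)
        end = start + len(entity)
        positions.append((start, end))
        last_occurrence_end = end
    return positions


assert entity_positions("cat and cat", "cat") == [(0, 3), (8, 11)]
```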
@@ -1432,15 +1252,22 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True


-def maybe_update_basic_thumbnail(
-    basic: PBBasic, thumbnail: Optional[CloudFile]
-) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile]) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
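`maybe_update_basic_icon` gains a third guard: besides "icon already set" and "no mimetype", an invalid mimetype is now logged and skipped. A sketch of the decision order; `valid_mimetype` stands in for the package's `content_types.valid` check and the `GENERIC_MIME_TYPE` value is an assumption:

```python
from typing import Optional

GENERIC_MIME_TYPE = "application/generic"  # assumed value of the upstream constant


def valid_mimetype(mimetype: str) -> bool:
    # Stand-in for content_types.valid(); a real check would consult a registry
    return "/" in mimetype


def pick_icon(current_icon: Optional[str], mimetype: Optional[str]) -> Optional[str]:
    """Return the icon to set, or None when nothing should change."""
    if current_icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
        return None  # icon already set or detected
    if not mimetype:
        return None
    if not valid_mimetype(mimetype):
        return None  # the real code logs a warning with the rid/slug here
    return mimetype


assert pick_icon("", "text/plain") == "text/plain"
assert pick_icon("", "not-a-mimetype") is None
assert pick_icon("image/png", "text/plain") is None
```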
@@ -1482,3 +1309,23 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
+
+
+def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
+    """
+    Update the entities dict with the entities from the field metadata.
+    Method created to ease the transition from legacy ner field to new entities field.
+    """
+    # Data Augmentation + Processor entities
+    # This will overwrite entities detected from more than one data augmentation task
+    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
+    entity_map = {
+        entity.text: entity.label
+        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
+        for entity in entities_wrapper.entities
+    }
+    target_entites_dict.update(entity_map)
+
+    # Legacy processor entities
+    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    target_entites_dict.update(field_metadata.ner)