nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/brain.py
CHANGED
@@ -22,17 +22,25 @@ from copy import deepcopy
 from dataclasses import dataclass
 from typing import Optional
 
+from nucliadb.common import ids
+from nucliadb.ingest import logger
+from nucliadb.ingest.orm.utils import compute_paragraph_key
+from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
+from nucliadb_models.metadata import ResourceProcessingStatus
+from nucliadb_protos import utils_pb2
 from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
-from nucliadb_protos.noderesources_pb2 import ParagraphMetadata
+from nucliadb_protos.noderesources_pb2 import (
+    ParagraphMetadata,
+    Representation,
+    ResourceID,
+)
 from nucliadb_protos.noderesources_pb2 import Position as TextPosition
-from nucliadb_protos.noderesources_pb2 import Representation
 from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
-from nucliadb_protos.noderesources_pb2 import ResourceID
 from nucliadb_protos.resources_pb2 import (
     Basic,
     ExtractedText,
+    FieldAuthor,
     FieldComputedMetadata,
-    FieldKeywordset,
     FieldMetadata,
     Metadata,
     Origin,
@@ -40,25 +48,10 @@ from nucliadb_protos.resources_pb2 import (
     UserFieldMetadata,
     UserMetadata,
 )
-from nucliadb_protos.utils_pb2 import Relation, RelationNode
-
-from nucliadb.ingest import logger
-from nucliadb.ingest.orm.utils import compute_paragraph_key
-from nucliadb_models.labels import BASE_LABELS, flatten_resource_labels
-from nucliadb_models.metadata import ResourceProcessingStatus
-from nucliadb_protos import utils_pb2
+from nucliadb_protos.utils_pb2 import Relation, RelationNode
 
 FilePagePositions = dict[int, tuple[int, int]]
 
-FIELD_PARAGRAPH_ID = "{rid}/{field_id}/{paragraph_start}-{paragraph_end}"
-SPLIT_FIELD_PARAGRAPH_ID = (
-    "{rid}/{field_id}/{subfield_id}/{paragraph_start}-{paragraph_end}"
-)
-FIELD_VECTOR_ID = "{rid}/{field_id}/{index}/{vector_start}-{vector_end}"
-SPLIT_FIELD_VECTOR_ID = (
-    "{rid}/{field_id}/{subfield_id}/{index}/{vector_start}-{vector_end}"
-)
-
 METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
     Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
     Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
@@ -79,7 +72,7 @@ class ResourceBrain:
         self.rid = rid
         ridobj = ResourceID(uuid=rid)
         self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
-        self.labels: dict[str, list[str]] = deepcopy(BASE_LABELS)
+        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
 
     def apply_field_text(self, field_key: str, text: str):
         self.brain.texts[field_key].text = text
@@ -104,28 +97,30 @@ class ResourceBrain:
         self,
         field_key: str,
         metadata: FieldComputedMetadata,
-        replace_field: list[str],
-        replace_splits: dict[str, list[str]],
         page_positions: Optional[FilePagePositions],
         extracted_text: Optional[ExtractedText],
         basic_user_field_metadata: Optional[UserFieldMetadata] = None,
+        *,
+        replace_field: bool = False,
     ):
         # To check for duplicate paragraphs
         unique_paragraphs: set[str] = set()
 
         # Expose also user classifications
-        paragraph_classifications = self._get_paragraph_user_classifications(
+        user_paragraph_classifications = self._get_paragraph_user_classifications(
            basic_user_field_metadata
        )
 
         # We should set paragraphs and labels
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         for subfield, metadata_split in metadata.split_metadata.items():
+            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+
             # For each split of this field
             for index, paragraph in enumerate(metadata_split.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
 
-                denied_classifications = paragraph_classifications.denied.get(key, [])
+                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
                 position = TextPosition(
                     index=index,
                     start=paragraph.start,
@@ -157,9 +152,8 @@ class ResourceBrain:
                     index=index,
                     repeated_in_field=is_paragraph_repeated_in_field(
                         paragraph,
-                        extracted_text,
+                        extracted_text_str,
                         unique_paragraphs,
-                        split=subfield,
                     ),
                     metadata=ParagraphMetadata(
                         position=position,
@@ -167,22 +161,22 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
-                p.labels.append(
-                    f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_labels = {paragraph_kind_label}
+                paragraph_labels.update(
+                    f"/l/{classification.labelset}/{classification.label}"
+                    for classification in paragraph.classifications
                 )
-                for classification in paragraph.classifications:
-                    label = f"/l/{classification.labelset}/{classification.label}"
-                    if label not in denied_classifications:
-                        p.labels.append(label)
-
-                # Add user annotated labels to paragraphs
-                extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+                paragraph_labels.difference_update(denied_classifications)
+                p.labels.extend(list(paragraph_labels))
 
                 self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        extracted_text_str = extracted_text.text if extracted_text else None
         for index, paragraph in enumerate(metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            denied_classifications = paragraph_classifications.denied.get(key, [])
+            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
             position = TextPosition(
                 index=index,
                 start=paragraph.start,
@@ -212,7 +206,7 @@ class ResourceBrain:
                 field=field_key,
                 index=index,
                 repeated_in_field=is_paragraph_repeated_in_field(
-                    paragraph, extracted_text, unique_paragraphs
+                    paragraph, extracted_text_str, unique_paragraphs
                 ),
                 metadata=ParagraphMetadata(
                     position=position,
@@ -220,72 +214,59 @@ class ResourceBrain:
                     representation=representation,
                 ),
             )
-            p.labels.append(
-                f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_labels = {paragraph_kind_label}
+            paragraph_labels.update(
+                f"/l/{classification.labelset}/{classification.label}"
+                for classification in paragraph.classifications
            )
-            for classification in paragraph.classifications:
-                label = f"/l/{classification.labelset}/{classification.label}"
-
-                if label not in denied_classifications:
-                    p.labels.append(label)
-
-            # Add user annotated labels to paragraphs
-            extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+            paragraph_labels.difference_update(denied_classifications)
+            p.labels.extend(list(paragraph_labels))
 
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
         for relations in metadata.metadata.relations:
             for relation in relations.relations:
                 self.brain.relations.append(relation)
 
-        for subfield, splits in replace_splits.items():
-            for split in splits:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{split}"
-                )
-
-        for sentence_to_delete in replace_field:
-            self.brain.paragraphs_to_delete.append(
-                f"{self.rid}/{field_key}/{sentence_to_delete}"
-            )
-
-    def delete_metadata(self, field_key: str, metadata: FieldComputedMetadata):
-        for subfield, metadata_split in metadata.split_metadata.items():
-            for paragraph in metadata_split.paragraphs:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                )
-
-        for paragraph in metadata.metadata.paragraphs:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            )
+    def delete_field(self, field_key: str):
+        ftype, fkey = field_key.split("/")
+        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
+        self.brain.paragraphs_to_delete.append(full_field_id)
+        self.brain.sentences_to_delete.append(full_field_id)
 
     def apply_field_vectors(
         self,
         field_id: str,
         vo: utils_pb2.VectorObject,
         *,
+        vectorset: Optional[str] = None,
         replace_field: bool = False,
-        replace_splits: Optional[list[str]] = None,
         matryoshka_vector_dimension: Optional[int] = None,
     ):
-        replace_splits = replace_splits or []
-
+        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
        for subfield, vectors in vo.split_vectors.items():
+            _field_id = ids.FieldId(
+                rid=fid.rid,
+                type=fid.type,
+                key=fid.key,
+                subfield_id=subfield,
+            )
             # For each split of this field
             for index, vector in enumerate(vectors.vectors):
-                paragraph_key = SPLIT_FIELD_PARAGRAPH_ID.format(
-                    rid=self.rid,
-                    field_id=field_id,
-                    subfield_id=subfield,
+                paragraph_key = ids.ParagraphId(
+                    field_id=_field_id,
                     paragraph_start=vector.start_paragraph,
                     paragraph_end=vector.end_paragraph,
                 )
-                sentence_key = SPLIT_FIELD_VECTOR_ID.format(
-                    rid=self.rid,
-                    field_id=field_id,
-                    subfield_id=subfield,
+                sentence_key = ids.VectorId(
+                    field_id=_field_id,
                     index=index,
                     vector_start=vector.start,
                     vector_end=vector.end,
@@ -295,19 +276,23 @@ class ResourceBrain:
                     paragraph_key,
                     sentence_key,
                     vector,
+                    vectorset=vectorset,
                     matryoshka_vector_dimension=matryoshka_vector_dimension,
                 )
 
+        _field_id = ids.FieldId(
+            rid=fid.rid,
+            type=fid.type,
+            key=fid.key,
+        )
         for index, vector in enumerate(vo.vectors.vectors):
-            paragraph_key = FIELD_PARAGRAPH_ID.format(
-                rid=self.rid,
-                field_id=field_id,
+            paragraph_key = ids.ParagraphId(
+                field_id=_field_id,
                 paragraph_start=vector.start_paragraph,
                 paragraph_end=vector.end_paragraph,
             )
-            sentence_key = FIELD_VECTOR_ID.format(
-                rid=self.rid,
-                field_id=field_id,
+            sentence_key = ids.VectorId(
+                field_id=_field_id,
                 index=index,
                 vector_start=vector.start,
                 vector_end=vector.end,
@@ -317,26 +302,33 @@ class ResourceBrain:
                 paragraph_key,
                 sentence_key,
                 vector,
+                vectorset=vectorset,
                 matryoshka_vector_dimension=matryoshka_vector_dimension,
             )
 
-        for split in replace_splits:
-            self.brain.sentences_to_delete.append(f"{self.rid}/{field_id}/{split}")
-
         if replace_field:
-            self.brain.sentences_to_delete.append(f"{self.rid}/{field_id}")
+            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
+            if vectorset is None:
+                # DEPRECATED
+                self.brain.sentences_to_delete.append(full_field_id)
+            else:
+                self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
 
     def _apply_field_vector(
         self,
         field_id: str,
-        paragraph_key: str,
-        sentence_key: str,
+        paragraph_key: ids.ParagraphId,
+        sentence_key: ids.VectorId,
         vector: utils_pb2.Vector,
         *,
+        vectorset: Optional[str],
         matryoshka_vector_dimension: Optional[int] = None,
     ):
-        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key]
-        sentence_pb = paragraph_pb.sentences[sentence_key]
+        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
+        if vectorset:
+            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
+        else:
+            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
 
         sentence_pb.ClearField("vector")  # clear first to prevent duplicates
 
@@ -352,39 +344,18 @@ class ResourceBrain:
         sentence_pb.metadata.position.end = vector.end_paragraph
 
         # does it make sense to copy forward paragraph values here?
-        sentence_pb.metadata.position.page_number = (
-            paragraph_pb.metadata.position.page_number
-        )
+        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
         sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
 
         sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
 
-        sentence_pb.metadata.representation.file = (
-            paragraph_pb.metadata.representation.file
-        )
+        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
 
-        sentence_pb.metadata.representation.is_a_table = (
-            paragraph_pb.metadata.representation.is_a_table
-        )
+        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
 
         sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
 
-    def delete_vectors(self, field_key: str, vo: utils_pb2.VectorObject):
-        # TODO: no need to iterate over all vectors, just delete the whole field
-        for subfield, vectors in vo.split_vectors.items():
-            for vector in vectors.vectors:
-                self.brain.sentences_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{vector.start}-{vector.end}"
-                )
-
-        for vector in vo.vectors.vectors:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{vector.start}-{vector.end}"
-            )
-
-    def set_processing_status(
-        self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]
-    ):
+    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -441,15 +412,11 @@ class ResourceBrain:
             self.brain.metadata.modified.CopyFrom(origin.modified)
 
     def _set_resource_relations(self, basic: Basic, origin: Optional[Origin]):
-        relationnodedocument = RelationNode(
-            value=self.rid, ntype=RelationNode.NodeType.RESOURCE
-        )
+        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
         if origin is not None:
             # origin contributors
             for contrib in origin.colaborators:
-                relationnodeuser = RelationNode(
-                    value=contrib, ntype=RelationNode.NodeType.USER
-                )
+                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
                 self.brain.relations.append(
                     Relation(
                         relation=Relation.COLAB,
@@ -478,115 +445,147 @@ class ResourceBrain:
     def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
         if origin is not None:
             if origin.source_id:
-                self.labels["o"] = [origin.source_id]
+                self.labels["o"] = {origin.source_id}
             # origin tags
             for tag in origin.tags:
-                self.labels["t"].append(tag)
+                self.labels["t"].add(tag)
             # origin source
             if origin.source_id != "":
-                self.labels["u"].append(f"s/{origin.source_id}")
+                self.labels["u"].add(f"s/{origin.source_id}")
 
             if origin.path:
-                self.labels["p"].append(origin.path.lstrip("/"))
+                self.labels["p"].add(origin.path.lstrip("/"))
 
             # origin contributors
             for contrib in origin.colaborators:
-                self.labels["u"].append(f"o/{contrib}")
+                self.labels["u"].add(f"o/{contrib}")
 
             for key, value in origin.metadata.items():
-                self.labels["m"].append(f"{key[:255]}/{value[:255]}")
+                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
 
         # icon
-        self.labels["n"].append(f"i/{basic.icon}")
+        self.labels["n"].add(f"i/{basic.icon}")
 
         # processing status
         status_tag = self.get_processing_status_tag(basic.metadata)
-        self.labels["n"].append(f"s/{status_tag}")
+        self.labels["n"].add(f"s/{status_tag}")
 
         # main language
         if basic.metadata.language:
-            self.labels["s"].append(f"p/{basic.metadata.language}")
+            self.labels["s"].add(f"p/{basic.metadata.language}")
 
         # all language
         for lang in basic.metadata.languages:
-            self.labels["s"].append(f"s/{lang}")
+            self.labels["s"].add(f"s/{lang}")
 
         # labels
         for classification in basic.usermetadata.classifications:
-            self.labels["l"].append(f"{classification.labelset}/{classification.label}")
+            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
 
-        self.compute_labels()
+        # hidden
+        if basic.hidden:
+            _, p1, p2 = LABEL_HIDDEN.split("/")
+            self.labels[p1].add(p2)
+
+        self.brain.ClearField("labels")
+        self.brain.labels.extend(flatten_resource_labels(self.labels))
 
     def process_field_metadata(
         self,
         field_key: str,
         metadata: FieldMetadata,
-        labels: dict[str, list[str]],
+        labels: dict[str, set[str]],
         relation_node_document: RelationNode,
-        user_canceled_labels: list[str],
+        user_canceled_labels: set[str],
     ):
+        if metadata.mime_type != "":
+            labels["mt"].add(metadata.mime_type)
+
+        base_classification_relation = Relation(
+            relation=Relation.ABOUT,
+            source=relation_node_document,
+            to=RelationNode(
+                ntype=RelationNode.NodeType.LABEL,
+            ),
+        )
         for classification in metadata.classifications:
             label = f"{classification.labelset}/{classification.label}"
             if label not in user_canceled_labels:
-                labels["l"].append(label)
-                relation_node_label = RelationNode(
-                    value=label,
-                    ntype=RelationNode.NodeType.LABEL,
-                )
-                self.brain.relations.append(
-                    Relation(
-                        relation=Relation.ABOUT,
-                        source=relation_node_document,
-                        to=relation_node_label,
-                    )
-                )
+                labels["l"].add(label)
+                relation = Relation()
+                relation.CopyFrom(base_classification_relation)
+                relation.to.value = label
+                self.brain.relations.append(relation)
+
+        # Data Augmentation + Processor entities
+        base_entity_relation = Relation(
+            relation=Relation.ENTITY,
+            source=relation_node_document,
+            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+        )
+        use_legacy_entities = True
+        for data_augmentation_task_id, entities in metadata.entities.items():
+            # If we recieved the entities from the processor here, we don't want to use the legacy entities
+            # TODO: Remove this when processor doesn't use this anymore
+            if data_augmentation_task_id == "processor":
+                use_legacy_entities = False
+
+            for ent in entities.entities:
+                entity_text = ent.text
+                entity_label = ent.label
+                # Seems like we don't care about where the entity is in the text
+                # entity_positions = entity.positions
+                labels["e"].add(
+                    f"{entity_label}/{entity_text}"
+                )  # Add data_augmentation_task_id as a prefix?
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity_text
+                relation.to.subtype = entity_label
+                self.brain.relations.append(relation)
 
-        for klass_entity, _ in metadata.positions.items():
-            labels["e"].append(klass_entity)
-            entity_array = klass_entity.split("/")
-            if len(entity_array) == 1:
+        # Legacy processor entities
+        # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+        def _parse_entity(klass_entity: str) -> tuple[str, str]:
+            try:
+                klass, entity = klass_entity.split("/", 1)
+                return klass, entity
+            except ValueError:
                 raise AttributeError(f"Entity should be with type {klass_entity}")
-            elif len(entity_array) > 1:
-                klass = entity_array[0]
-                entity = "/".join(entity_array[1:])
-            relation_node_entity = RelationNode(
-                value=entity, ntype=RelationNode.NodeType.ENTITY, subtype=klass
-            )
-            rel = Relation(
-                relation=Relation.ENTITY,
-                source=relation_node_document,
-                to=relation_node_entity,
-            )
-            self.brain.relations.append(rel)
 
-        # Add mime type of field
-        if metadata.mime_type != "":
-            labels["mt"].append(metadata.mime_type)
+        if use_legacy_entities:
+            for klass_entity in metadata.positions.keys():
+                labels["e"].add(klass_entity)
+                klass, entity = _parse_entity(klass_entity)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity
+                relation.to.subtype = klass
+                self.brain.relations.append(relation)
 
     def apply_field_labels(
         self,
         field_key: str,
         metadata: Optional[FieldComputedMetadata],
         uuid: str,
+        generated_by: FieldAuthor,
         basic_user_metadata: Optional[UserMetadata] = None,
         basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
     ):
+        user_canceled_labels: set[str] = set()
         if basic_user_metadata is not None:
-            user_canceled_labels = [
-                f"{classification.labelset}/{classification.label}"
+            user_canceled_labels.update(
+                f"{classification.labelset}/{classification.label}"
                 for classification in basic_user_metadata.classifications
                 if classification.cancelled_by_user
-            ]
-        else:
-            user_canceled_labels = []
-
-        relation_node_resource = RelationNode(
-            value=uuid, ntype=RelationNode.NodeType.RESOURCE
-        )
-        labels: dict[str, list[str]] = {"l": [], "e": [], "mt": []}
+            )
+        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
+        labels: dict[str, set[str]] = {
+            "l": set(),  # classification labels
+            "e": set(),  # entities
+            "mt": set(),  # mime type
+            "g/da": set(),  # generated by
+        }
         if metadata is not None:
             for meta in metadata.split_metadata.values():
                 self.process_field_metadata(
@@ -607,7 +606,7 @@ class ResourceBrain:
         if basic_user_fieldmetadata is not None:
             for token in basic_user_fieldmetadata.token:
                 if token.cancelled_by_user is False:
-                    labels["e"].append(f"{token.klass}/{token.token}")
+                    labels["e"].add(f"{token.klass}/{token.token}")
                     relation_node_entity = RelationNode(
                         value=token.token,
                         ntype=RelationNode.NodeType.ENTITY,
@@ -635,36 +634,33 @@ class ResourceBrain:
                    self.brain.paragraphs[field_key].paragraphs[
                        paragraph_annotation.key
                    ].labels.append(label)
-        extend_unique(
-            self.brain.texts[field_key].labels, flatten_resource_labels(labels)  # type: ignore
-        )
-
-    def compute_labels(self):
-        extend_unique(self.brain.labels, flatten_resource_labels(self.labels))
 
+        if generated_by.WhichOneof("author") == "data_augmentation":
+            field_type, field_id = field_key.split("/")
+            da_task_id = ids.extract_data_augmentation_id(field_id)
+            if da_task_id is None:  # pragma: nocover
+                logger.warning(
+                    "Data augmentation field id has an unexpected format! Skipping label",
+                    extra={
+                        "rid": uuid,
+                        "field_id": field_id,
+                    },
+                )
+            else:
+                labels["g/da"].add(da_task_id)
 
-def get_paragraph_text(
-    extracted_text: ExtractedText, start: int, end: int, split: Optional[str] = None
-) -> str:
-    if split is not None:
-        text = extracted_text.split_text[split]
-    else:
-        text = extracted_text.text
-    return text[start:end]
+        self.brain.texts[field_key].labels.extend(flatten_resource_labels(labels))
 
 
 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text: Optional[ExtractedText],
+    extracted_text: Optional[str],
     unique_paragraphs: set[str],
-    split: Optional[str] = None,
 ) -> bool:
     if extracted_text is None:
        return False
 
-    paragraph_text = get_paragraph_text(
-        extracted_text, start=paragraph.start, end=paragraph.end, split=split
-    )
+    paragraph_text = extracted_text[paragraph.start : paragraph.end]
     if len(paragraph_text) == 0:
        return False
 
@@ -701,12 +697,3 @@ class ParagraphPages:
         if len(self._materialized) > 0:
             return self._materialized[-1]
        return 0
-
-
-def extend_unique(a: list, b: list):
-    """
-    Prevents extending with duplicate elements
-    """
-    for item in b:
-        if item not in a:
-            a.append(item)