nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/brain.py
CHANGED
```diff
@@ -20,20 +20,27 @@
 import logging
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import
+from typing import Optional
 
-from
+from nucliadb.common import ids
+from nucliadb.ingest import logger
+from nucliadb.ingest.orm.utils import compute_paragraph_key
+from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
+from nucliadb_models.metadata import ResourceProcessingStatus
+from nucliadb_protos import utils_pb2
 from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
-from nucliadb_protos.noderesources_pb2 import
+from nucliadb_protos.noderesources_pb2 import (
+    ParagraphMetadata,
+    Representation,
+    ResourceID,
+)
 from nucliadb_protos.noderesources_pb2 import Position as TextPosition
-from nucliadb_protos.noderesources_pb2 import Representation
 from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
-from nucliadb_protos.noderesources_pb2 import ResourceID
 from nucliadb_protos.resources_pb2 import (
     Basic,
     ExtractedText,
+    FieldAuthor,
     FieldComputedMetadata,
-    FieldKeywordset,
     FieldMetadata,
     Metadata,
     Origin,
@@ -41,28 +48,10 @@ from nucliadb_protos.resources_pb2 import (
     UserFieldMetadata,
     UserMetadata,
 )
-from nucliadb_protos.utils_pb2 import
-    Relation,
-    RelationNode,
-    UserVectorSet,
-    UserVectorsList,
-    VectorObject,
-)
-
-from nucliadb.ingest import logger
-from nucliadb.ingest.orm.utils import compute_paragraph_key
-from nucliadb_models.labels import BASE_LABELS, flatten_resource_labels
-from nucliadb_models.metadata import ResourceProcessingStatus
-from nucliadb_protos import utils_pb2
-
-if TYPE_CHECKING:  # pragma: no cover
-    StatusValue = Union[Metadata.Status.V, int]
-else:
-    StatusValue = int
+from nucliadb_protos.utils_pb2 import Relation, RelationNode
 
 FilePagePositions = dict[int, tuple[int, int]]
 
-
 METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
     Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
     Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
@@ -83,7 +72,7 @@ class ResourceBrain:
         self.rid = rid
         ridobj = ResourceID(uuid=rid)
         self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
-        self.labels: dict[str,
+        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
 
     def apply_field_text(self, field_key: str, text: str):
         self.brain.texts[field_key].text = text
@@ -108,28 +97,30 @@ class ResourceBrain:
         self,
         field_key: str,
         metadata: FieldComputedMetadata,
-        replace_field: list[str],
-        replace_splits: dict[str, list[str]],
         page_positions: Optional[FilePagePositions],
         extracted_text: Optional[ExtractedText],
         basic_user_field_metadata: Optional[UserFieldMetadata] = None,
+        *,
+        replace_field: bool = False,
     ):
         # To check for duplicate paragraphs
         unique_paragraphs: set[str] = set()
 
         # Expose also user classifications
-
+        user_paragraph_classifications = self._get_paragraph_user_classifications(
             basic_user_field_metadata
         )
 
         # We should set paragraphs and labels
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         for subfield, metadata_split in metadata.split_metadata.items():
+            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+
             # For each split of this field
             for index, paragraph in enumerate(metadata_split.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
 
-                denied_classifications =
+                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
                 position = TextPosition(
                     index=index,
                     start=paragraph.start,
@@ -161,9 +152,8 @@ class ResourceBrain:
                     index=index,
                     repeated_in_field=is_paragraph_repeated_in_field(
                         paragraph,
-
+                        extracted_text_str,
                         unique_paragraphs,
-                        split=subfield,
                     ),
                     metadata=ParagraphMetadata(
                         position=position,
@@ -171,22 +161,22 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
-
-
+                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_labels = {paragraph_kind_label}
+                paragraph_labels.update(
+                    f"/l/{classification.labelset}/{classification.label}"
+                    for classification in paragraph.classifications
                 )
-
-
-
-                        p.labels.append(label)
-
-                # Add user annotated labels to paragraphs
-                extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+                paragraph_labels.difference_update(denied_classifications)
+                p.labels.extend(list(paragraph_labels))
 
                 self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        extracted_text_str = extracted_text.text if extracted_text else None
         for index, paragraph in enumerate(metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            denied_classifications =
+            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
             position = TextPosition(
                 index=index,
                 start=paragraph.start,
@@ -216,7 +206,7 @@ class ResourceBrain:
                 field=field_key,
                 index=index,
                 repeated_in_field=is_paragraph_repeated_in_field(
-                    paragraph,
+                    paragraph, extracted_text_str, unique_paragraphs
                 ),
                 metadata=ParagraphMetadata(
                     position=position,
@@ -224,161 +214,148 @@ class ResourceBrain:
                     representation=representation,
                 ),
             )
-
-
+            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_labels = {paragraph_kind_label}
+            paragraph_labels.update(
+                f"/l/{classification.labelset}/{classification.label}"
+                for classification in paragraph.classifications
            )
-
-
-
-                if label not in denied_classifications:
-                    p.labels.append(label)
-
-            # Add user annotated labels to paragraphs
-            extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+            paragraph_labels.difference_update(denied_classifications)
+            p.labels.extend(list(paragraph_labels))
 
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
         for relations in metadata.metadata.relations:
             for relation in relations.relations:
                 self.brain.relations.append(relation)
 
-
-
-
-
-
-
-        for sentence_to_delete in replace_field:
-            self.brain.paragraphs_to_delete.append(
-                f"{self.rid}/{field_key}/{sentence_to_delete}"
-            )
-
-    def delete_metadata(self, field_key: str, metadata: FieldComputedMetadata):
-        for subfield, metadata_split in metadata.split_metadata.items():
-            for paragraph in metadata_split.paragraphs:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                )
-
-        for paragraph in metadata.metadata.paragraphs:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            )
-
-    def apply_user_vectors(
-        self,
-        field_key: str,
-        user_vectors: UserVectorSet,
-        vectors_to_delete: MessageMap[str, UserVectorsList],
-    ):
-        for vectorset, vectors in user_vectors.vectors.items():
-            for vector_id, user_vector in vectors.vectors.items():
-                self.brain.vectors[vectorset].vectors[
-                    f"{self.rid}/{field_key}/{vector_id}/{user_vector.start}-{user_vector.end}"
-                ].CopyFrom(user_vector)
-
-        for vectorset, vectorslist in vectors_to_delete.items():
-            for vector in vectorslist.vectors:
-                self.brain.vectors_to_delete[vectorset].vectors.append(
-                    f"{self.rid}/{field_key}/{vector}"
-                )
+    def delete_field(self, field_key: str):
+        ftype, fkey = field_key.split("/")
+        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
+        self.brain.paragraphs_to_delete.append(full_field_id)
+        self.brain.sentences_to_delete.append(full_field_id)
 
     def apply_field_vectors(
         self,
-
-        vo: VectorObject,
-
-
+        field_id: str,
+        vo: utils_pb2.VectorObject,
+        *,
+        vectorset: Optional[str] = None,
+        replace_field: bool = False,
+        matryoshka_vector_dimension: Optional[int] = None,
    ):
+        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            _field_id = ids.FieldId(
+                rid=fid.rid,
+                type=fid.type,
+                key=fid.key,
+                subfield_id=subfield,
+            )
             # For each split of this field
-
             for index, vector in enumerate(vectors.vectors):
-
-
-
-
-                    f"{self.rid}/{field_key}/{subfield}/{index}/{vector.start}-{vector.end}"
-                ]
-
-                ssentence.ClearField("vector")  # clear first to prevent duplicates
-                ssentence.vector.extend(vector.vector)
-
-                # we only care about start/stop position of the paragraph for a given sentence here
-                # the key has the sentence position
-                ssentence.metadata.position.start = vector.start_paragraph
-                ssentence.metadata.position.end = vector.end_paragraph
-
-                ssentence.metadata.position.page_number = (
-                    sparagraph.metadata.position.page_number
-                )
-                ssentence.metadata.position.in_page = (
-                    sparagraph.metadata.position.in_page
-                )
-                ssentence.metadata.page_with_visual = (
-                    sparagraph.metadata.page_with_visual
+                paragraph_key = ids.ParagraphId(
+                    field_id=_field_id,
+                    paragraph_start=vector.start_paragraph,
+                    paragraph_end=vector.end_paragraph,
                 )
-
-
-
+                sentence_key = ids.VectorId(
+                    field_id=_field_id,
+                    index=index,
+                    vector_start=vector.start,
+                    vector_end=vector.end,
                 )
-
-
+                self._apply_field_vector(
+                    field_id,
+                    paragraph_key,
+                    sentence_key,
+                    vector,
+                    vectorset=vectorset,
+                    matryoshka_vector_dimension=matryoshka_vector_dimension,
                 )
-                ssentence.metadata.position.index = sparagraph.metadata.position.index
 
+        _field_id = ids.FieldId(
+            rid=fid.rid,
+            type=fid.type,
+            key=fid.key,
+        )
         for index, vector in enumerate(vo.vectors.vectors):
-
-
-
-
-
-            sentence.ClearField("vector")  # clear first to prevent duplicates
-            sentence.vector.extend(vector.vector)
-
-            # we only care about start/stop position of the paragraph for a given sentence here
-            # the key has the sentence position
-            sentence.metadata.position.start = vector.start_paragraph
-            sentence.metadata.position.end = vector.end_paragraph
-
-            # does it make sense to copy forward paragraph values here?
-            sentence.metadata.position.page_number = (
-                paragraph.metadata.position.page_number
+            paragraph_key = ids.ParagraphId(
+                field_id=_field_id,
+                paragraph_start=vector.start_paragraph,
+                paragraph_end=vector.end_paragraph,
             )
-
-
-
-
-
-                paragraph.metadata.representation.file
+            sentence_key = ids.VectorId(
+                field_id=_field_id,
+                index=index,
+                vector_start=vector.start,
+                vector_end=vector.end,
             )
-
-
+            self._apply_field_vector(
+                field_id,
+                paragraph_key,
+                sentence_key,
+                vector,
+                vectorset=vectorset,
+                matryoshka_vector_dimension=matryoshka_vector_dimension,
            )
 
-
+        if replace_field:
+            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
+            if vectorset is None:
+                # DEPRECATED
+                self.brain.sentences_to_delete.append(full_field_id)
+            else:
+                self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
+
+    def _apply_field_vector(
+        self,
+        field_id: str,
+        paragraph_key: ids.ParagraphId,
+        sentence_key: ids.VectorId,
+        vector: utils_pb2.Vector,
+        *,
+        vectorset: Optional[str],
+        matryoshka_vector_dimension: Optional[int] = None,
+    ):
+        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
+        if vectorset:
+            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
+        else:
+            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
 
-
-        self.brain.sentences_to_delete.append(f"{self.rid}/{field_key}/{split}")
+        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
 
-        if
-
+        # cut vectors if a specific dimension is specified
+        if matryoshka_vector_dimension is not None:
+            sentence_pb.vector.extend(vector.vector[:matryoshka_vector_dimension])
+        else:
+            sentence_pb.vector.extend(vector.vector)
 
-
-
-
-
-            f"{self.rid}/{field_key}/{subfield}/{vector.start}-{vector.end}"
-        )
+        # we only care about start/stop position of the paragraph for a given sentence here
+        # the key has the sentence position
+        sentence_pb.metadata.position.start = vector.start_paragraph
+        sentence_pb.metadata.position.end = vector.end_paragraph
 
-
-
-
-        )
+        # does it make sense to copy forward paragraph values here?
+        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
+        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
 
-
-
-
+        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
+
+        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
+
+        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
+
+        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
+
+    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -435,15 +412,11 @@ class ResourceBrain:
             self.brain.metadata.modified.CopyFrom(origin.modified)
 
     def _set_resource_relations(self, basic: Basic, origin: Optional[Origin]):
-        relationnodedocument = RelationNode(
-            value=self.rid, ntype=RelationNode.NodeType.RESOURCE
-        )
+        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
         if origin is not None:
             # origin contributors
             for contrib in origin.colaborators:
-                relationnodeuser = RelationNode(
-                    value=contrib, ntype=RelationNode.NodeType.USER
-                )
+                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
                 self.brain.relations.append(
                     Relation(
                         relation=Relation.COLAB,
@@ -472,115 +445,147 @@ class ResourceBrain:
     def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
         if origin is not None:
             if origin.source_id:
-                self.labels["o"] =
+                self.labels["o"] = {origin.source_id}
             # origin tags
             for tag in origin.tags:
-                self.labels["t"].
+                self.labels["t"].add(tag)
             # origin source
             if origin.source_id != "":
-                self.labels["u"].
+                self.labels["u"].add(f"s/{origin.source_id}")
 
             if origin.path:
-                self.labels["p"].
+                self.labels["p"].add(origin.path.lstrip("/"))
 
             # origin contributors
             for contrib in origin.colaborators:
-                self.labels["u"].
+                self.labels["u"].add(f"o/{contrib}")
 
             for key, value in origin.metadata.items():
-                self.labels["m"].
+                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
 
         # icon
-        self.labels["n"].
+        self.labels["n"].add(f"i/{basic.icon}")
 
         # processing status
         status_tag = self.get_processing_status_tag(basic.metadata)
-        self.labels["n"].
+        self.labels["n"].add(f"s/{status_tag}")
 
         # main language
         if basic.metadata.language:
-            self.labels["s"].
+            self.labels["s"].add(f"p/{basic.metadata.language}")
 
         # all language
         for lang in basic.metadata.languages:
-            self.labels["s"].
+            self.labels["s"].add(f"s/{lang}")
 
         # labels
         for classification in basic.usermetadata.classifications:
-            self.labels["l"].
+            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
+
+        # hidden
+        if basic.hidden:
+            _, p1, p2 = LABEL_HIDDEN.split("/")
+            self.labels[p1].add(p2)
 
-        self.
+        self.brain.ClearField("labels")
+        self.brain.labels.extend(flatten_resource_labels(self.labels))
 
     def process_field_metadata(
         self,
         field_key: str,
         metadata: FieldMetadata,
-        labels: dict[str,
+        labels: dict[str, set[str]],
         relation_node_document: RelationNode,
-        user_canceled_labels:
+        user_canceled_labels: set[str],
     ):
+        if metadata.mime_type != "":
+            labels["mt"].add(metadata.mime_type)
+
+        base_classification_relation = Relation(
+            relation=Relation.ABOUT,
+            source=relation_node_document,
+            to=RelationNode(
+                ntype=RelationNode.NodeType.LABEL,
+            ),
+        )
         for classification in metadata.classifications:
             label = f"{classification.labelset}/{classification.label}"
             if label not in user_canceled_labels:
-                labels["l"].
-
-
-
-                )
-
-
-
-
-
-
-
+                labels["l"].add(label)
+                relation = Relation()
+                relation.CopyFrom(base_classification_relation)
+                relation.to.value = label
+                self.brain.relations.append(relation)
+
+        # Data Augmentation + Processor entities
+        base_entity_relation = Relation(
+            relation=Relation.ENTITY,
+            source=relation_node_document,
+            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+        )
+        use_legacy_entities = True
+        for data_augmentation_task_id, entities in metadata.entities.items():
+            # If we recieved the entities from the processor here, we don't want to use the legacy entities
+            # TODO: Remove this when processor doesn't use this anymore
+            if data_augmentation_task_id == "processor":
+                use_legacy_entities = False
+
+            for ent in entities.entities:
+                entity_text = ent.text
+                entity_label = ent.label
+                # Seems like we don't care about where the entity is in the text
+                # entity_positions = entity.positions
+                labels["e"].add(
+                    f"{entity_label}/{entity_text}"
+                )  # Add data_augmentation_task_id as a prefix?
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity_text
+                relation.to.subtype = entity_label
+                self.brain.relations.append(relation)
 
-
-
-
-
+        # Legacy processor entities
+        # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+        def _parse_entity(klass_entity: str) -> tuple[str, str]:
+            try:
+                klass, entity = klass_entity.split("/", 1)
+                return klass, entity
+            except ValueError:
                 raise AttributeError(f"Entity should be with type {klass_entity}")
-            elif len(entity_array) > 1:
-                klass = entity_array[0]
-                entity = "/".join(entity_array[1:])
-                relation_node_entity = RelationNode(
-                    value=entity, ntype=RelationNode.NodeType.ENTITY, subtype=klass
-                )
-                rel = Relation(
-                    relation=Relation.ENTITY,
-                    source=relation_node_document,
-                    to=relation_node_entity,
-                )
-                self.brain.relations.append(rel)
 
-
-
-
-
-
-
+        if use_legacy_entities:
+            for klass_entity in metadata.positions.keys():
+                labels["e"].add(klass_entity)
+                klass, entity = _parse_entity(klass_entity)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity
+                relation.to.subtype = klass
+                self.brain.relations.append(relation)
 
     def apply_field_labels(
         self,
         field_key: str,
         metadata: Optional[FieldComputedMetadata],
         uuid: str,
+        generated_by: FieldAuthor,
         basic_user_metadata: Optional[UserMetadata] = None,
         basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
    ):
+        user_canceled_labels: set[str] = set()
         if basic_user_metadata is not None:
-            user_canceled_labels
-                f"
+            user_canceled_labels.update(
+                f"{classification.labelset}/{classification.label}"
                 for classification in basic_user_metadata.classifications
                 if classification.cancelled_by_user
-
-
-
-
-
-
-
-
+            )
+        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
+        labels: dict[str, set[str]] = {
+            "l": set(),  # classification labels
+            "e": set(),  # entities
+            "mt": set(),  # mime type
+            "g/da": set(),  # generated by
+        }
         if metadata is not None:
             for meta in metadata.split_metadata.values():
                 self.process_field_metadata(
@@ -601,7 +606,7 @@ class ResourceBrain:
         if basic_user_fieldmetadata is not None:
             for token in basic_user_fieldmetadata.token:
                 if token.cancelled_by_user is False:
-                    labels["e"].
+                    labels["e"].add(f"{token.klass}/{token.token}")
                     relation_node_entity = RelationNode(
                         value=token.token,
                         ntype=RelationNode.NodeType.ENTITY,
@@ -629,36 +634,33 @@ class ResourceBrain:
                     self.brain.paragraphs[field_key].paragraphs[
                         paragraph_annotation.key
                     ].labels.append(label)
-        extend_unique(
-            self.brain.texts[field_key].labels, flatten_resource_labels(labels)  # type: ignore
-        )
 
-
-
+        if generated_by.WhichOneof("author") == "data_augmentation":
+            field_type, field_id = field_key.split("/")
+            da_task_id = ids.extract_data_augmentation_id(field_id)
+            if da_task_id is None:  # pragma: nocover
+                logger.warning(
+                    "Data augmentation field id has an unexpected format! Skipping label",
+                    extra={
+                        "rid": uuid,
+                        "field_id": field_id,
+                    },
+                )
+            else:
+                labels["g/da"].add(da_task_id)
 
-
-def get_paragraph_text(
-    extracted_text: ExtractedText, start: int, end: int, split: Optional[str] = None
-) -> str:
-    if split is not None:
-        text = extracted_text.split_text[split]
-    else:
-        text = extracted_text.text
-    return text[start:end]
+        self.brain.texts[field_key].labels.extend(flatten_resource_labels(labels))
 
 
 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text: Optional[
+    extracted_text: Optional[str],
     unique_paragraphs: set[str],
-    split: Optional[str] = None,
 ) -> bool:
     if extracted_text is None:
         return False
 
-    paragraph_text =
-        extracted_text, start=paragraph.start, end=paragraph.end, split=split
-    )
+    paragraph_text = extracted_text[paragraph.start : paragraph.end]
     if len(paragraph_text) == 0:
         return False
 
@@ -695,12 +697,3 @@ class ParagraphPages:
         if len(self._materialized) > 0:
             return self._materialized[-1]
         return 0
-
-
-def extend_unique(a: list, b: list):
-    """
-    Prevents extending with duplicate elements
-    """
-    for item in b:
-        if item not in a:
-            a.append(item)
```