nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,136 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
|
20
|
-
from unittest.mock import AsyncMock, MagicMock, patch
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
|
24
|
-
from nucliadb import purge
|
25
|
-
from nucliadb.common.cluster.exceptions import NodeError, ShardNotFound
|
26
|
-
|
27
|
-
pytestmark = pytest.mark.asyncio
|
28
|
-
|
29
|
-
|
30
|
-
class DataIterator:
|
31
|
-
def __init__(self, data):
|
32
|
-
self.data = data
|
33
|
-
|
34
|
-
def __call__(self, *args, **kwargs):
|
35
|
-
return self
|
36
|
-
|
37
|
-
async def __aiter__(self):
|
38
|
-
for item in self.data:
|
39
|
-
yield item
|
40
|
-
|
41
|
-
|
42
|
-
@pytest.fixture
|
43
|
-
def keys():
|
44
|
-
yield []
|
45
|
-
|
46
|
-
|
47
|
-
@pytest.fixture
|
48
|
-
def txn(keys):
|
49
|
-
mock = AsyncMock()
|
50
|
-
mock.keys = DataIterator(keys)
|
51
|
-
yield mock
|
52
|
-
|
53
|
-
|
54
|
-
@pytest.fixture
|
55
|
-
def driver(txn):
|
56
|
-
mock = AsyncMock()
|
57
|
-
cm = AsyncMock()
|
58
|
-
cm.__aenter__.return_value = txn
|
59
|
-
mock.transaction = MagicMock(return_value=cm)
|
60
|
-
yield mock
|
61
|
-
|
62
|
-
|
63
|
-
@pytest.fixture
|
64
|
-
def storage():
|
65
|
-
mock = AsyncMock()
|
66
|
-
mock.delete_kb.return_value = True, False
|
67
|
-
yield mock
|
68
|
-
|
69
|
-
|
70
|
-
@pytest.fixture(autouse=True)
|
71
|
-
def kb():
|
72
|
-
mock = AsyncMock()
|
73
|
-
with patch("nucliadb.purge.KnowledgeBox", mock):
|
74
|
-
yield mock
|
75
|
-
|
76
|
-
|
77
|
-
async def test_purge(kb, keys, driver):
|
78
|
-
keys.append("/pathto/kbid")
|
79
|
-
|
80
|
-
await purge.purge_kb(driver)
|
81
|
-
|
82
|
-
kb.purge.assert_called_once_with(driver, "kbid")
|
83
|
-
driver.begin.return_value.commit.assert_called_once()
|
84
|
-
|
85
|
-
|
86
|
-
async def test_purge_handle_errors(kb, keys, driver):
|
87
|
-
keys.append("/failed")
|
88
|
-
keys.append("/pathto/failed")
|
89
|
-
keys.append("/pathto/failed")
|
90
|
-
keys.append("/pathto/failed")
|
91
|
-
keys.append("/pathto/failed")
|
92
|
-
|
93
|
-
kb.purge.side_effect = [ShardNotFound(), NodeError(), Exception(), None]
|
94
|
-
driver.begin.return_value.delete.side_effect = Exception()
|
95
|
-
|
96
|
-
await purge.purge_kb(driver)
|
97
|
-
|
98
|
-
driver.begin.return_value.commit.assert_not_called()
|
99
|
-
driver.begin.return_value.abort.assert_called_once()
|
100
|
-
|
101
|
-
|
102
|
-
async def test_purge_kb_storage(
|
103
|
-
keys,
|
104
|
-
driver,
|
105
|
-
storage,
|
106
|
-
):
|
107
|
-
keys.append("/pathto/kbid")
|
108
|
-
|
109
|
-
await purge.purge_kb_storage(driver, storage)
|
110
|
-
|
111
|
-
driver.begin.return_value.commit.assert_called_once()
|
112
|
-
|
113
|
-
|
114
|
-
async def test_purge_kb_storage_handle_errors(keys, driver, storage):
|
115
|
-
keys.append("/failed")
|
116
|
-
keys.append("/pathto/failed")
|
117
|
-
|
118
|
-
driver.begin.return_value.delete.side_effect = Exception()
|
119
|
-
|
120
|
-
await purge.purge_kb_storage(driver, storage)
|
121
|
-
|
122
|
-
driver.begin.return_value.commit.assert_not_called()
|
123
|
-
|
124
|
-
|
125
|
-
async def test_main(driver, storage):
|
126
|
-
with (
|
127
|
-
patch("nucliadb.purge.purge_kb", AsyncMock()) as purge_kb,
|
128
|
-
patch("nucliadb.purge.purge_kb_storage", AsyncMock()) as purge_kb_storage,
|
129
|
-
patch("nucliadb.purge.get_storage", return_value=storage),
|
130
|
-
patch("nucliadb.purge.setup_driver", return_value=driver),
|
131
|
-
patch("nucliadb.purge.setup_cluster", return_value=driver),
|
132
|
-
):
|
133
|
-
await purge.main()
|
134
|
-
|
135
|
-
purge_kb.assert_called_once_with(driver)
|
136
|
-
purge_kb_storage.assert_called_once_with(driver, storage)
|
nucliadb/tests/utils/__init__.py
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import uuid
|
21
|
-
from datetime import datetime
|
22
|
-
|
23
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage, OpStatusWriter
|
24
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
25
|
-
|
26
|
-
from nucliadb_protos import resources_pb2 as rpb
|
27
|
-
|
28
|
-
|
29
|
-
def broker_resource(
|
30
|
-
kbid: str, rid=None, slug=None, title=None, summary=None
|
31
|
-
) -> BrokerMessage:
|
32
|
-
"""
|
33
|
-
Returns a broker resource with barebones metadata.
|
34
|
-
"""
|
35
|
-
rid = rid or str(uuid.uuid4())
|
36
|
-
slug = slug or f"{rid}slug1"
|
37
|
-
bm: BrokerMessage = BrokerMessage(
|
38
|
-
kbid=kbid,
|
39
|
-
uuid=rid,
|
40
|
-
slug=slug,
|
41
|
-
type=BrokerMessage.AUTOCOMMIT,
|
42
|
-
)
|
43
|
-
title = title or "Title Resource"
|
44
|
-
summary = summary or "Summary of document"
|
45
|
-
bm.basic.icon = "text/plain"
|
46
|
-
bm.basic.title = title
|
47
|
-
bm.basic.summary = summary
|
48
|
-
bm.basic.thumbnail = "doc"
|
49
|
-
bm.basic.layout = "default"
|
50
|
-
bm.basic.metadata.useful = True
|
51
|
-
bm.basic.metadata.language = "es"
|
52
|
-
bm.basic.created.FromDatetime(datetime.now())
|
53
|
-
bm.basic.modified.FromDatetime(datetime.now())
|
54
|
-
bm.origin.source = rpb.Origin.Source.WEB
|
55
|
-
|
56
|
-
etw = rpb.ExtractedTextWrapper()
|
57
|
-
etw.body.text = title
|
58
|
-
etw.field.field = "title"
|
59
|
-
etw.field.field_type = rpb.FieldType.GENERIC
|
60
|
-
bm.extracted_text.append(etw)
|
61
|
-
|
62
|
-
etw = rpb.ExtractedTextWrapper()
|
63
|
-
etw.body.text = summary
|
64
|
-
etw.field.field = "summary"
|
65
|
-
etw.field.field_type = rpb.FieldType.GENERIC
|
66
|
-
bm.extracted_text.append(etw)
|
67
|
-
|
68
|
-
bm.source = BrokerMessage.MessageSource.WRITER
|
69
|
-
return bm
|
70
|
-
|
71
|
-
|
72
|
-
async def inject_message(writer: WriterStub, message: BrokerMessage):
|
73
|
-
resp = await writer.ProcessMessage([message]) # type: ignore
|
74
|
-
assert resp.status == OpStatusWriter.Status.OK
|
@@ -1,44 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
from unittest.mock import AsyncMock, Mock
|
21
|
-
|
22
|
-
|
23
|
-
def get_mocked_session(
|
24
|
-
http_method: str, status: int, text=None, json=None, read=None, context_manager=True
|
25
|
-
):
|
26
|
-
response = Mock(status=status)
|
27
|
-
if text is not None:
|
28
|
-
response.text = AsyncMock(return_value=text)
|
29
|
-
if json is not None:
|
30
|
-
response.json = AsyncMock(return_value=json)
|
31
|
-
if read is not None:
|
32
|
-
if isinstance(read, str):
|
33
|
-
read = read.encode()
|
34
|
-
response.read = AsyncMock(return_value=read)
|
35
|
-
if context_manager:
|
36
|
-
# For when async with self.session.post() as response: is called
|
37
|
-
session = Mock()
|
38
|
-
http_method_mock = AsyncMock(__aenter__=AsyncMock(return_value=response))
|
39
|
-
getattr(session, http_method.lower()).return_value = http_method_mock
|
40
|
-
else:
|
41
|
-
# For when await self.session.post() is called
|
42
|
-
session = AsyncMock()
|
43
|
-
getattr(session, http_method.lower()).return_value = response
|
44
|
-
return session
|
@@ -1,171 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
from datetime import datetime
|
22
|
-
from typing import Optional
|
23
|
-
from uuid import uuid4
|
24
|
-
|
25
|
-
from nucliadb_protos import resources_pb2 as rpb
|
26
|
-
from nucliadb_protos import writer_pb2 as wpb
|
27
|
-
|
28
|
-
from .fields import FieldBuilder
|
29
|
-
from .helpers import labels_to_classifications
|
30
|
-
|
31
|
-
|
32
|
-
class BrokerMessageBuilder:
|
33
|
-
"""Helper to deal with broker message creation. It allows customized
|
34
|
-
creation of broker messages with sensible defaults and default title and
|
35
|
-
summary.
|
36
|
-
|
37
|
-
"""
|
38
|
-
|
39
|
-
def __init__(
|
40
|
-
self,
|
41
|
-
*,
|
42
|
-
kbid: str,
|
43
|
-
rid: Optional[str] = None,
|
44
|
-
slug: Optional[str] = None,
|
45
|
-
):
|
46
|
-
self.bm = wpb.BrokerMessage()
|
47
|
-
self.fields: dict[tuple[str, rpb.FieldType.ValueType], FieldBuilder] = {}
|
48
|
-
|
49
|
-
self.bm.kbid = kbid
|
50
|
-
self.bm.type = wpb.BrokerMessage.AUTOCOMMIT
|
51
|
-
|
52
|
-
# if first BM comes from PROCESSOR, it'll be ignored as it's out of order
|
53
|
-
self.bm.source = wpb.BrokerMessage.MessageSource.WRITER
|
54
|
-
|
55
|
-
if rid is None:
|
56
|
-
rid = str(uuid4())
|
57
|
-
self.bm.uuid = rid
|
58
|
-
|
59
|
-
if slug is None:
|
60
|
-
slug = f"{rid}-slug"
|
61
|
-
self.bm.slug = slug
|
62
|
-
|
63
|
-
self._default_basic()
|
64
|
-
self._default_origin()
|
65
|
-
|
66
|
-
def build(self) -> wpb.BrokerMessage:
|
67
|
-
self._apply_fields()
|
68
|
-
return self.bm
|
69
|
-
|
70
|
-
def add_field_builder(self, field: FieldBuilder):
|
71
|
-
self.fields[(field.id.field, field.id.field_type)] = field
|
72
|
-
|
73
|
-
def field_builder(
|
74
|
-
self, field_id: str, field_type: rpb.FieldType.ValueType
|
75
|
-
) -> FieldBuilder:
|
76
|
-
return self.fields[(field_id, field_type)]
|
77
|
-
|
78
|
-
def with_title(self, title: str):
|
79
|
-
title_builder = FieldBuilder("title", rpb.FieldType.GENERIC)
|
80
|
-
title_builder.with_extracted_text(title)
|
81
|
-
# we do this to writer BMs in write resource API endpoint
|
82
|
-
title_builder.with_extracted_paragraph_metadata(
|
83
|
-
rpb.Paragraph(
|
84
|
-
start=0,
|
85
|
-
end=len(title),
|
86
|
-
kind=rpb.Paragraph.TypeParagraph.TITLE,
|
87
|
-
)
|
88
|
-
)
|
89
|
-
self.bm.basic.title = title
|
90
|
-
self.add_field_builder(title_builder)
|
91
|
-
|
92
|
-
def with_summary(self, summary: str):
|
93
|
-
summary_builder = FieldBuilder("summary", rpb.FieldType.GENERIC)
|
94
|
-
summary_builder.with_extracted_text(summary)
|
95
|
-
# we do this to writer BMs in write resource API endpoint
|
96
|
-
summary_builder.with_extracted_paragraph_metadata(
|
97
|
-
rpb.Paragraph(
|
98
|
-
start=0,
|
99
|
-
end=len(summary),
|
100
|
-
kind=rpb.Paragraph.TypeParagraph.DESCRIPTION,
|
101
|
-
)
|
102
|
-
)
|
103
|
-
self.bm.basic.summary = summary
|
104
|
-
self.add_field_builder(summary_builder)
|
105
|
-
|
106
|
-
def with_resource_labels(self, labelset: str, labels: list[str]):
|
107
|
-
classifications = labels_to_classifications(labelset, labels)
|
108
|
-
self.bm.basic.usermetadata.classifications.extend(classifications)
|
109
|
-
|
110
|
-
def _default_basic(self):
|
111
|
-
self.bm.basic.icon = "text/plain"
|
112
|
-
self.bm.basic.thumbnail = "doc"
|
113
|
-
self.bm.basic.layout = "default"
|
114
|
-
self.bm.basic.metadata.useful = True
|
115
|
-
self.bm.basic.metadata.language = "en"
|
116
|
-
self.bm.basic.metadata.status = rpb.Metadata.Status.PROCESSED
|
117
|
-
self.bm.basic.metadata.metadata["key"] = "value"
|
118
|
-
self.bm.basic.created.FromDatetime(datetime.now())
|
119
|
-
self.bm.basic.modified.FromDatetime(datetime.now())
|
120
|
-
|
121
|
-
self.with_title("Default test resource title")
|
122
|
-
self.with_summary("Default test resource summary")
|
123
|
-
|
124
|
-
def _default_origin(self):
|
125
|
-
self.bm.origin.source = rpb.Origin.Source.API
|
126
|
-
self.bm.origin.source_id = "My Source"
|
127
|
-
self.bm.origin.created.FromDatetime(datetime.now())
|
128
|
-
self.bm.origin.modified.FromDatetime(datetime.now())
|
129
|
-
|
130
|
-
def _apply_fields(self):
|
131
|
-
def replace_if_exists(mut_iterable, field_id: rpb.FieldID, item):
|
132
|
-
for obj in mut_iterable:
|
133
|
-
if obj.field == field_id:
|
134
|
-
obj.Clear()
|
135
|
-
obj.CopyFrom(item)
|
136
|
-
break
|
137
|
-
else:
|
138
|
-
mut_iterable.append(item)
|
139
|
-
|
140
|
-
for field_builder in self.fields.values():
|
141
|
-
field = field_builder.build()
|
142
|
-
|
143
|
-
if field.id.field_type == rpb.FieldType.GENERIC:
|
144
|
-
pass
|
145
|
-
elif field.id.field_type == rpb.FieldType.FILE:
|
146
|
-
file_field = self.bm.files[field.id.field]
|
147
|
-
file_field.added.FromDatetime(datetime.now())
|
148
|
-
file_field.file.source = rpb.CloudFile.Source.EXTERNAL
|
149
|
-
else:
|
150
|
-
raise Exception("Unsupported field type")
|
151
|
-
|
152
|
-
if field.user.metadata is not None:
|
153
|
-
replace_if_exists(
|
154
|
-
self.bm.basic.fieldmetadata, field.id, field.user.metadata
|
155
|
-
)
|
156
|
-
if field.extracted.metadata is not None:
|
157
|
-
replace_if_exists(
|
158
|
-
self.bm.field_metadata, field.id, field.extracted.metadata
|
159
|
-
)
|
160
|
-
if field.extracted.text is not None:
|
161
|
-
replace_if_exists(
|
162
|
-
self.bm.extracted_text, field.id, field.extracted.text
|
163
|
-
)
|
164
|
-
if field.extracted.vectors is not None:
|
165
|
-
replace_if_exists(
|
166
|
-
self.bm.field_vectors, field.id, field.extracted.vectors
|
167
|
-
)
|
168
|
-
if field.extracted.question_answers is not None:
|
169
|
-
replace_if_exists(
|
170
|
-
self.bm.question_answers, field.id, field.extracted.question_answers
|
171
|
-
)
|
@@ -1,197 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
import dataclasses
|
22
|
-
from datetime import datetime
|
23
|
-
from typing import Optional
|
24
|
-
|
25
|
-
from nucliadb_protos import resources_pb2 as rpb
|
26
|
-
from nucliadb_protos import utils_pb2
|
27
|
-
|
28
|
-
from .helpers import labels_to_classifications
|
29
|
-
|
30
|
-
|
31
|
-
@dataclasses.dataclass
|
32
|
-
class FieldUser:
|
33
|
-
metadata: Optional[rpb.UserFieldMetadata] = None
|
34
|
-
|
35
|
-
|
36
|
-
@dataclasses.dataclass
|
37
|
-
class FieldExtracted:
|
38
|
-
metadata: Optional[rpb.FieldComputedMetadataWrapper] = None
|
39
|
-
text: Optional[rpb.ExtractedTextWrapper] = None
|
40
|
-
vectors: Optional[rpb.ExtractedVectorsWrapper] = None
|
41
|
-
question_answers: Optional[rpb.FieldQuestionAnswerWrapper] = None
|
42
|
-
|
43
|
-
|
44
|
-
@dataclasses.dataclass
|
45
|
-
class Field:
|
46
|
-
id: rpb.FieldID
|
47
|
-
user: FieldUser = dataclasses.field(default_factory=FieldUser)
|
48
|
-
extracted: FieldExtracted = dataclasses.field(default_factory=FieldExtracted)
|
49
|
-
|
50
|
-
|
51
|
-
class FieldBuilder:
|
52
|
-
def __init__(self, field: str, field_type: rpb.FieldType.ValueType):
|
53
|
-
self._field_id = rpb.FieldID(field=field, field_type=field_type)
|
54
|
-
self.__extracted_metadata: Optional[rpb.FieldComputedMetadataWrapper] = None
|
55
|
-
self.__extracted_text: Optional[rpb.ExtractedTextWrapper] = None
|
56
|
-
self.__extracted_vectors: Optional[rpb.ExtractedVectorsWrapper] = None
|
57
|
-
self.__user_metadata: Optional[rpb.UserFieldMetadata] = None
|
58
|
-
self.__question_answers: Optional[rpb.FieldQuestionAnswerWrapper] = None
|
59
|
-
|
60
|
-
@property
|
61
|
-
def id(self) -> rpb.FieldID:
|
62
|
-
return self._field_id
|
63
|
-
|
64
|
-
# properties to generate a default value per pb
|
65
|
-
|
66
|
-
@property
|
67
|
-
def _extracted_metadata(self) -> rpb.FieldComputedMetadataWrapper:
|
68
|
-
if self.__extracted_metadata is None:
|
69
|
-
now = datetime.now()
|
70
|
-
self.__extracted_metadata = rpb.FieldComputedMetadataWrapper(
|
71
|
-
field=self._field_id,
|
72
|
-
)
|
73
|
-
self.__extracted_metadata.metadata.metadata.last_index.FromDatetime(now)
|
74
|
-
self.__extracted_metadata.metadata.metadata.last_understanding.FromDatetime(
|
75
|
-
now
|
76
|
-
)
|
77
|
-
self.__extracted_metadata.metadata.metadata.last_extract.FromDatetime(now)
|
78
|
-
return self.__extracted_metadata
|
79
|
-
|
80
|
-
@property
|
81
|
-
def _extracted_text(self) -> rpb.ExtractedTextWrapper:
|
82
|
-
if self.__extracted_text is None:
|
83
|
-
self.__extracted_text = rpb.ExtractedTextWrapper(field=self._field_id)
|
84
|
-
return self.__extracted_text
|
85
|
-
|
86
|
-
@property
|
87
|
-
def _extracted_vectors(self) -> rpb.ExtractedVectorsWrapper:
|
88
|
-
if self.__extracted_vectors is None:
|
89
|
-
self.__extracted_vectors = rpb.ExtractedVectorsWrapper(field=self._field_id)
|
90
|
-
return self.__extracted_vectors
|
91
|
-
|
92
|
-
@property
|
93
|
-
def _question_answers(self) -> rpb.FieldQuestionAnswerWrapper:
|
94
|
-
if self.__question_answers is None:
|
95
|
-
self.__question_answers = rpb.FieldQuestionAnswerWrapper(
|
96
|
-
field=self._field_id
|
97
|
-
)
|
98
|
-
return self.__question_answers
|
99
|
-
|
100
|
-
@property
|
101
|
-
def _user_metadata(self) -> rpb.UserFieldMetadata:
|
102
|
-
if self.__user_metadata is None:
|
103
|
-
self.__user_metadata = rpb.UserFieldMetadata(field=self._field_id)
|
104
|
-
return self.__user_metadata
|
105
|
-
|
106
|
-
def build(self) -> Field:
|
107
|
-
field = Field(id=self._field_id)
|
108
|
-
|
109
|
-
if self.__extracted_metadata is not None:
|
110
|
-
field.extracted.metadata = rpb.FieldComputedMetadataWrapper()
|
111
|
-
field.extracted.metadata.CopyFrom(self.__extracted_metadata)
|
112
|
-
|
113
|
-
if self.__extracted_text is not None:
|
114
|
-
field.extracted.text = rpb.ExtractedTextWrapper()
|
115
|
-
field.extracted.text.CopyFrom(self.__extracted_text)
|
116
|
-
|
117
|
-
if self.__extracted_vectors is not None:
|
118
|
-
field.extracted.vectors = rpb.ExtractedVectorsWrapper()
|
119
|
-
field.extracted.vectors.CopyFrom(self.__extracted_vectors)
|
120
|
-
|
121
|
-
if self.__question_answers is not None:
|
122
|
-
field.extracted.question_answers = rpb.FieldQuestionAnswerWrapper()
|
123
|
-
field.extracted.question_answers.CopyFrom(self.__question_answers)
|
124
|
-
|
125
|
-
if self.__user_metadata is not None:
|
126
|
-
field.user.metadata = rpb.UserFieldMetadata()
|
127
|
-
field.user.metadata.CopyFrom(self.__user_metadata)
|
128
|
-
|
129
|
-
return field
|
130
|
-
|
131
|
-
def with_extracted_labels(self, labelset: str, labels: list[str]):
|
132
|
-
classifications = labels_to_classifications(labelset, labels)
|
133
|
-
self._extracted_metadata.metadata.metadata.classifications.extend(
|
134
|
-
classifications
|
135
|
-
)
|
136
|
-
|
137
|
-
def with_extracted_text(self, text: str):
|
138
|
-
self._extracted_text.body.text = text
|
139
|
-
|
140
|
-
def with_extracted_vectors(self, vectors: list[utils_pb2.Vector]):
|
141
|
-
self._extracted_vectors.vectors.vectors.vectors.extend(vectors)
|
142
|
-
|
143
|
-
def with_extracted_paragraph_metadata(self, paragraph: rpb.Paragraph):
|
144
|
-
self._extracted_metadata.metadata.metadata.paragraphs.append(paragraph)
|
145
|
-
|
146
|
-
def with_user_entity(self, klass: str, name: str, *, start: int, end: int):
|
147
|
-
entity = rpb.TokenSplit(
|
148
|
-
klass=klass,
|
149
|
-
token=name,
|
150
|
-
start=start,
|
151
|
-
end=end,
|
152
|
-
)
|
153
|
-
self._user_metadata.token.append(entity)
|
154
|
-
|
155
|
-
def with_extracted_entity(
|
156
|
-
self, klass: str, name: str, *, positions: list[rpb.Position]
|
157
|
-
):
|
158
|
-
entity = self._extracted_metadata.metadata.metadata.positions[f"{klass}/{name}"]
|
159
|
-
entity.entity = name
|
160
|
-
entity.position.extend(positions)
|
161
|
-
|
162
|
-
def with_user_paragraph_labels(self, key: str, labelset: str, labels: list[str]):
|
163
|
-
classifications = labels_to_classifications(labelset, labels)
|
164
|
-
pa = rpb.ParagraphAnnotation()
|
165
|
-
pa.key = key
|
166
|
-
pa.classifications.extend(classifications)
|
167
|
-
self._user_metadata.paragraphs.append(pa)
|
168
|
-
|
169
|
-
def add_question_answer(
|
170
|
-
self,
|
171
|
-
question: str,
|
172
|
-
answer: str,
|
173
|
-
question_lang: str = "en",
|
174
|
-
question_paragraph_ids: list[str] = [],
|
175
|
-
answer_lang: str = "en",
|
176
|
-
answer_paragraph_ids: list[str] = [],
|
177
|
-
):
|
178
|
-
question_pb = rpb.Question(
|
179
|
-
text=question,
|
180
|
-
language=question_lang,
|
181
|
-
ids_paragraphs=question_paragraph_ids,
|
182
|
-
)
|
183
|
-
answer_pb = rpb.Answers(
|
184
|
-
text=answer,
|
185
|
-
language=answer_lang,
|
186
|
-
ids_paragraphs=answer_paragraph_ids,
|
187
|
-
)
|
188
|
-
|
189
|
-
# check if is another answer for an already added question
|
190
|
-
for question_answer in self._question_answers.question_answers.question_answer:
|
191
|
-
if question_answer.question == question_pb:
|
192
|
-
question_answer.answers.append(answer_pb)
|
193
|
-
return
|
194
|
-
|
195
|
-
question_answer = rpb.QuestionAnswer(question=question_pb)
|
196
|
-
question_answer.answers.append(answer_pb)
|
197
|
-
self._question_answers.question_answers.question_answer.append(question_answer)
|
@@ -1,33 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
from nucliadb_protos.resources_pb2 import Classification
|
22
|
-
|
23
|
-
|
24
|
-
def labels_to_classifications(labelset: str, labels: list[str]) -> list[Classification]:
|
25
|
-
classifications = [
|
26
|
-
Classification(
|
27
|
-
labelset=labelset,
|
28
|
-
label=label,
|
29
|
-
cancelled_by_user=False,
|
30
|
-
)
|
31
|
-
for label in labels
|
32
|
-
]
|
33
|
-
return classifications
|