nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/tests/utils/entities.py
DELETED
@@ -1,78 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import asyncio
|
21
|
-
import time
|
22
|
-
|
23
|
-
from httpx import AsyncClient
|
24
|
-
from nucliadb_protos.knowledgebox_pb2 import KnowledgeBoxID
|
25
|
-
from nucliadb_protos.writer_pb2 import GetEntitiesGroupRequest, GetEntitiesGroupResponse
|
26
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
27
|
-
|
28
|
-
from nucliadb.writer.api.v1.router import KB_PREFIX
|
29
|
-
from nucliadb_models.entities import (
|
30
|
-
CreateEntitiesGroupPayload,
|
31
|
-
UpdateEntitiesGroupPayload,
|
32
|
-
)
|
33
|
-
|
34
|
-
|
35
|
-
async def create_entities_group(
|
36
|
-
writer: AsyncClient, kbid: str, payload: CreateEntitiesGroupPayload
|
37
|
-
):
|
38
|
-
resp = await writer.post(
|
39
|
-
f"/{KB_PREFIX}/{kbid}/entitiesgroups",
|
40
|
-
content=payload.json(),
|
41
|
-
)
|
42
|
-
return resp
|
43
|
-
|
44
|
-
|
45
|
-
async def update_entities_group(
|
46
|
-
writer: AsyncClient,
|
47
|
-
kbid: str,
|
48
|
-
group: str,
|
49
|
-
payload: UpdateEntitiesGroupPayload,
|
50
|
-
):
|
51
|
-
resp = await writer.patch(
|
52
|
-
f"/{KB_PREFIX}/{kbid}/entitiesgroup/{group}",
|
53
|
-
content=payload.json(),
|
54
|
-
)
|
55
|
-
return resp
|
56
|
-
|
57
|
-
|
58
|
-
async def delete_entities_group(writer: AsyncClient, kbid: str, group: str):
|
59
|
-
resp = await writer.delete(f"/{KB_PREFIX}/{kbid}/entitiesgroup/{group}")
|
60
|
-
return resp
|
61
|
-
|
62
|
-
|
63
|
-
async def wait_until_entity(
|
64
|
-
ingest: WriterStub, kbid: str, group: str, entity: str, timeout: float = 1.0
|
65
|
-
):
|
66
|
-
start = time.time()
|
67
|
-
found = False
|
68
|
-
while not found:
|
69
|
-
response: GetEntitiesGroupResponse = await ingest.GetEntitiesGroup( # type: ignore
|
70
|
-
GetEntitiesGroupRequest(kb=KnowledgeBoxID(uuid=kbid), group=group)
|
71
|
-
)
|
72
|
-
found = entity in response.group.entities
|
73
|
-
assert (
|
74
|
-
time.time() - start < timeout
|
75
|
-
), "Timeout while waiting for entity {group}/{entity}"
|
76
|
-
|
77
|
-
if not found:
|
78
|
-
await asyncio.sleep(0.1)
|
nucliadb/train/api/v1/check.py
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
from fastapi import Request
|
22
|
-
from fastapi_versioning import version # type: ignore
|
23
|
-
|
24
|
-
from nucliadb.train.api.utils import get_kb_partitions
|
25
|
-
from nucliadb.train.api.v1.router import KB_PREFIX, api
|
26
|
-
from nucliadb_models.resource import NucliaDBRoles
|
27
|
-
from nucliadb_models.trainset import TrainSetPartitions
|
28
|
-
from nucliadb_utils.authentication import requires_one
|
29
|
-
|
30
|
-
|
31
|
-
@api.get(
|
32
|
-
f"/{KB_PREFIX}/{{kbid}}/check/labeler/{{labelset}}",
|
33
|
-
tags=["Train"],
|
34
|
-
status_code=200,
|
35
|
-
summary="Return check status of labels",
|
36
|
-
response_model=TrainSetPartitions,
|
37
|
-
)
|
38
|
-
@version(1)
|
39
|
-
@requires_one([NucliaDBRoles.READER])
|
40
|
-
async def check_labeler(
|
41
|
-
request: Request, kbid: str, labelset: str
|
42
|
-
) -> TrainSetPartitions:
|
43
|
-
all_keys = await get_kb_partitions(kbid)
|
44
|
-
return TrainSetPartitions(partitions=all_keys)
|
45
|
-
|
46
|
-
|
47
|
-
@api.get(
|
48
|
-
f"/{KB_PREFIX}/{{kbid}}/check/ner/{{entitygroup}}",
|
49
|
-
tags=["Train"],
|
50
|
-
status_code=200,
|
51
|
-
summary="Return check status of entities",
|
52
|
-
response_model=TrainSetPartitions,
|
53
|
-
)
|
54
|
-
@version(1)
|
55
|
-
@requires_one([NucliaDBRoles.READER])
|
56
|
-
async def check_ner(
|
57
|
-
request: Request, kbid: str, entitygroup: str
|
58
|
-
) -> TrainSetPartitions:
|
59
|
-
all_keys = await get_kb_partitions(kbid)
|
60
|
-
return TrainSetPartitions(partitions=all_keys)
|
nucliadb/train/tests/__init__.py
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
nucliadb/train/tests/conftest.py
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
pytest_plugins = [
|
21
|
-
"pytest_docker_fixtures",
|
22
|
-
"nucliadb.ingest.tests.fixtures",
|
23
|
-
"nucliadb.tests.fixtures",
|
24
|
-
"nucliadb.train.tests.fixtures",
|
25
|
-
"nucliadb_utils.tests.nats",
|
26
|
-
"nucliadb_utils.tests.conftest",
|
27
|
-
"nucliadb_utils.tests.gcs",
|
28
|
-
"nucliadb_utils.tests.s3",
|
29
|
-
]
|
nucliadb/train/tests/fixtures.py
DELETED
@@ -1,342 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import asyncio
|
21
|
-
import uuid
|
22
|
-
from datetime import datetime
|
23
|
-
|
24
|
-
import aiohttp
|
25
|
-
import pytest
|
26
|
-
from grpc import aio
|
27
|
-
from nucliadb_protos.knowledgebox_pb2 import EntitiesGroup, Label, LabelSet
|
28
|
-
from nucliadb_protos.resources_pb2 import (
|
29
|
-
ExtractedTextWrapper,
|
30
|
-
FieldComputedMetadataWrapper,
|
31
|
-
FieldID,
|
32
|
-
FieldType,
|
33
|
-
Paragraph,
|
34
|
-
Position,
|
35
|
-
Sentence,
|
36
|
-
)
|
37
|
-
from nucliadb_protos.writer_pb2 import (
|
38
|
-
BrokerMessage,
|
39
|
-
SetEntitiesRequest,
|
40
|
-
SetLabelsRequest,
|
41
|
-
)
|
42
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
43
|
-
|
44
|
-
from nucliadb.ingest.orm.entities import EntitiesManager
|
45
|
-
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
46
|
-
from nucliadb.ingest.orm.processor import Processor
|
47
|
-
from nucliadb.ingest.orm.resource import KB_RESOURCE_SLUG_BASE
|
48
|
-
from nucliadb.standalone.settings import Settings
|
49
|
-
from nucliadb.train.utils import start_shard_manager, stop_shard_manager
|
50
|
-
from nucliadb_utils.tests import free_port
|
51
|
-
from nucliadb_utils.utilities import clear_global_cache, get_storage
|
52
|
-
|
53
|
-
|
54
|
-
@pytest.fixture(scope="function")
|
55
|
-
async def train_rest_api(nucliadb: Settings): # type: ignore
|
56
|
-
async with aiohttp.ClientSession(
|
57
|
-
headers={"X-NUCLIADB-ROLES": "READER"},
|
58
|
-
base_url=f"http://localhost:{nucliadb.http_port}",
|
59
|
-
) as client:
|
60
|
-
yield client
|
61
|
-
|
62
|
-
|
63
|
-
@pytest.fixture(scope="function")
async def writer_rest_api(nucliadb: Settings):  # type: ignore
    """Yield an aiohttp client session that sends the WRITER role header.

    The session's base URL is ``localhost`` on the HTTP port read from the
    ``nucliadb`` settings fixture. The session is closed when the fixture
    is torn down.
    """
    writer_headers = {"X-NUCLIADB-ROLES": "WRITER"}
    endpoint = f"http://localhost:{nucliadb.http_port}"
    session = aiohttp.ClientSession(headers=writer_headers, base_url=endpoint)
    async with session as client:
        yield client
|
70
|
-
|
71
|
-
|
72
|
-
@pytest.fixture(scope="function")
async def knowledgebox_with_labels(nucliadb_grpc: WriterStub, knowledgebox: str):
    """Register two labelsets on the knowledgebox and yield its id.

    One labelset is of kind PARAGRAPHS and the other of kind RESOURCES;
    each one carries the labels ``label_machine`` and ``label_user``.
    """
    labelset_kinds = (
        ("labelset_paragraphs", LabelSet.LabelSetKind.PARAGRAPHS),
        ("labelset_resources", LabelSet.LabelSetKind.RESOURCES),
    )
    for labelset_id, kind in labelset_kinds:
        request = SetLabelsRequest()
        request.kb.uuid = knowledgebox
        request.id = labelset_id
        request.labelset.kind.append(kind)
        for title in ("label_machine", "label_user"):
            request.labelset.labels.append(Label(title=title))
        await nucliadb_grpc.SetLabels(request)  # type: ignore

    yield knowledgebox
|
95
|
-
|
96
|
-
|
97
|
-
@pytest.fixture(scope="function")
async def knowledgebox_with_entities(nucliadb_grpc: WriterStub, knowledgebox: str):
    """Register the PERSON and ORG entity groups and yield the knowledgebox id.

    Every entity is stored under its own name, and each group's title is
    the same string as the group id.
    """
    # Insertion order matters only in that PERSON is sent before ORG.
    entity_groups = {
        "PERSON": ("Ramon", "Eudald Camprubi", "Carmen Iniesta", "el Super Fran"),
        "ORG": ("Nuclia", "Debian", "Generalitat de Catalunya"),
    }
    for group, names in entity_groups.items():
        request = SetEntitiesRequest()
        request.kb.uuid = knowledgebox
        request.group = group
        request.entities.title = group
        for name in names:
            request.entities.entities[name].value = name
        await nucliadb_grpc.SetEntities(request)  # type: ignore

    yield knowledgebox
|
119
|
-
|
120
|
-
|
121
|
-
def broker_simple_resource(knowledgebox: str, number: int) -> BrokerMessage:
    """Build the writer-sourced broker message for a new text resource.

    The resource gets a freshly generated UUID, ``str(number)`` as its slug,
    a title derived from ``number`` and one text field named ``field1``.

    :param knowledgebox: id of the knowledgebox the resource belongs to.
    :param number: used for the slug and embedded in the title.
    :return: an AUTOCOMMIT ``BrokerMessage`` whose source is WRITER.
    """
    slug = str(number)
    bm: BrokerMessage = BrokerMessage(
        kbid=knowledgebox,
        uuid=str(uuid.uuid4()),
        slug=slug,
        type=BrokerMessage.AUTOCOMMIT,
    )
    bm.source = BrokerMessage.MessageSource.WRITER

    # Basic metadata of the resource.
    basic = bm.basic
    basic.slug = slug
    basic.icon = "text/plain"
    basic.title = f"MY TITLE {number}"
    basic.summary = "Summary of document"
    basic.thumbnail = "doc"
    basic.layout = "default"
    basic.metadata.useful = True
    basic.metadata.language = "es"
    basic.created.FromDatetime(datetime.utcnow())
    basic.modified.FromDatetime(datetime.utcnow())

    bm.texts["field1"].body = (
        "My lovely field with some information from Barcelona. This will be the good field. \n\n And then we will go Manresa."  # noqa
    )
    return bm
|
145
|
-
|
146
|
-
|
147
|
-
def broker_processed_resource(knowledgebox, number, rid) -> BrokerMessage:
    """Build the processor-sourced broker message for an existing resource.

    The message carries extracted text and computed metadata (paragraphs and
    sentences) for the ``field1`` text field and for the ``title`` generic
    field. ``field1`` additionally gets two CITY entities and one position
    for "Barcelona".

    :param knowledgebox: id of the knowledgebox the resource belongs to.
    :param number: used for the slug and embedded in the title text.
    :param rid: UUID of the resource this message refers to.
    :return: an AUTOCOMMIT ``BrokerMessage`` whose source is PROCESSOR.
    """
    bm: BrokerMessage = BrokerMessage(
        kbid=knowledgebox,
        uuid=rid,
        slug=str(number),
        type=BrokerMessage.AUTOCOMMIT,
    )
    bm.source = BrokerMessage.MessageSource.PROCESSOR
    bm.basic.metadata.useful = True
    bm.basic.metadata.language = "es"

    text_field = FieldID(field="field1", field_type=FieldType.TEXT)
    title_field = FieldID(field="title", field_type=FieldType.GENERIC)

    # --- field1: extracted text -------------------------------------------
    body_text = ExtractedTextWrapper()
    body_text.field.CopyFrom(text_field)
    body_text.body.text = "My lovely field with some information from Barcelona. This will be the good field. \n\n And then we will go Manresa. I miss Manresa!"  # noqa
    bm.extracted_text.append(body_text)

    # --- field1: paragraphs, sentences and entities -------------------------
    body_meta = FieldComputedMetadataWrapper()
    body_meta.field.CopyFrom(text_field)
    computed = body_meta.metadata.metadata
    # Offsets are character positions in the extracted text above.
    computed.paragraphs.append(
        Paragraph(
            start=0,
            end=82,
            sentences=[Sentence(start=0, end=52), Sentence(start=53, end=82)],
        )
    )
    computed.paragraphs.append(
        Paragraph(start=84, end=130, sentences=[Sentence(start=84, end=130)])
    )

    # Add a ner with positions
    computed.ner["Barcelona"] = "CITY"
    computed.ner["Manresa"] = "CITY"
    barcelona = computed.positions["CITY/Barcelona"]
    barcelona.entity = "Barcelona"
    barcelona.position.append(Position(start=43, end=52))
    bm.field_metadata.append(body_meta)

    # --- title: extracted text --------------------------------------------
    title = f"MY TITLE {number}"
    title_text = ExtractedTextWrapper()
    title_text.field.CopyFrom(title_field)
    title_text.body.text = title
    bm.extracted_text.append(title_text)

    # --- title: a single paragraph holding a single sentence ----------------
    title_meta = FieldComputedMetadataWrapper()
    title_meta.field.CopyFrom(title_field)
    title_meta.metadata.metadata.paragraphs.append(
        Paragraph(
            start=0,
            end=len(title),
            sentences=[Sentence(start=0, end=len(title))],
        )
    )
    bm.field_metadata.append(title_meta)

    return bm
|
229
|
-
|
230
|
-
|
231
|
-
# These fixtures should be deleted once the gRPC train interface is removed
|
232
|
-
|
233
|
-
|
234
|
-
@pytest.fixture(scope="function")
async def test_pagination_resources(
    processor: Processor, knowledgebox_ingest, test_settings_train
):
    """
    Create a set of resources with only basic information to test pagination.

    Ten resources are processed (a writer message followed by a processor
    message for each one). The fixture then polls the main database for up
    to 30 seconds until ten resource slug keys are found, adds one entities
    group and one labelset to the knowledgebox, and yields the knowledgebox
    id. If the expected number of keys never shows up, the fixture proceeds
    anyway.
    """
    # Monotonic clock: the timeout must not be affected by wall-clock changes.
    from time import monotonic

    from nucliadb.common.maindb.utils import get_driver

    amount = 10

    # Create resources
    for i in range(1, amount + 1):
        message = broker_simple_resource(knowledgebox_ingest, i)
        await processor.process(message=message, seqid=-1, transaction_check=False)

        message = broker_processed_resource(knowledgebox_ingest, i, message.uuid)
        await processor.process(message=message, seqid=-1, transaction_check=False)

    # Give processed data some time to reach the node
    driver = get_driver()
    slug_match = KB_RESOURCE_SLUG_BASE.format(kbid=knowledgebox_ingest)

    deadline = monotonic() + 30  # wait max 30 seconds for it
    while monotonic() < deadline:
        txn = await driver.begin()
        try:
            count = 0
            async for _key in txn.keys(match=slug_match, count=-1):
                count += 1
        finally:
            # Always release the read transaction, even if iteration fails.
            await txn.abort()
        if count == amount:
            break
        print(f"got {count}, retrying")
        await asyncio.sleep(2)

    storage = await get_storage()
    txn = await driver.begin()
    try:
        kb = KnowledgeBox(txn, storage, kbid=knowledgebox_ingest)

        # Add entities
        entities_manager = EntitiesManager(kb, txn)
        entities = EntitiesGroup()
        entities.entities["entity1"].value = "PERSON"
        await entities_manager.set_entities_group_force("group1", entities)

        # Add ontology
        labelset = LabelSet()
        labelset.title = "ls1"
        label = Label()
        label_title = "label1"
        label.title = label_title
        labelset.labels.append(label)
        # NOTE(review): the labelset is stored under the label title
        # ("label1"), not under "ls1" -- kept as in the original; confirm
        # this is intended.
        await kb.set_labelset(label_title, labelset)
    except Exception:
        # Do not leave the write transaction open when a write fails.
        await txn.abort()
        raise
    await txn.commit()

    yield knowledgebox_ingest
|
293
|
-
|
294
|
-
|
295
|
-
@pytest.fixture(scope="function")
def test_settings_train(cache, gcs, fake_node, maindb_driver):  # type: ignore
    """Point the storage and train settings at the test services.

    While the fixture is active, debug mode is turned off, the file backend
    is GCS (using the ``gcs`` fixture value as endpoint URL and
    ``test_{kbid}`` as the bucket template) and the train gRPC port is a
    free port. Every setting changed here is put back to its previous value
    on teardown.
    """
    from nucliadb.train.settings import settings
    from nucliadb_utils.settings import (
        FileBackendConfig,
        running_settings,
        storage_settings,
    )

    print(f"Redis ready at {maindb_driver.url}")

    # Remember the current values so they can be restored afterwards.
    old_debug = running_settings.debug
    old_file_backend = storage_settings.file_backend
    old_gcs_endpoint_url = storage_settings.gcs_endpoint_url
    old_gcs_bucket = storage_settings.gcs_bucket
    old_grpc_port = settings.grpc_port

    try:
        running_settings.debug = False
        storage_settings.gcs_endpoint_url = gcs
        storage_settings.file_backend = FileBackendConfig.GCS
        storage_settings.gcs_bucket = "test_{kbid}"
        settings.grpc_port = free_port()
        yield
    finally:
        # Runs on normal teardown and also if the setup above fails midway.
        running_settings.debug = old_debug
        storage_settings.file_backend = old_file_backend
        storage_settings.gcs_endpoint_url = old_gcs_endpoint_url
        storage_settings.gcs_bucket = old_gcs_bucket
        settings.grpc_port = old_grpc_port
|
321
|
-
|
322
|
-
|
323
|
-
@pytest.fixture(scope="function")
async def train_api(test_settings_train: None, local_files):  # type: ignore
    """Run the shard manager and the train gRPC server during a test.

    Both are stopped on teardown, in reverse start order. The shard manager
    is stopped even if starting or stopping the gRPC server raises.
    """
    from nucliadb.train.utils import start_train_grpc, stop_train_grpc

    await start_shard_manager()
    try:
        await start_train_grpc("testing_train")
        try:
            yield
        finally:
            await stop_train_grpc()
    finally:
        await stop_shard_manager()
|
332
|
-
|
333
|
-
|
334
|
-
@pytest.fixture(scope="function")
async def train_client(train_api):  # type: ignore
    """Yield a ``TrainStub`` connected to the local train gRPC server.

    The stub uses an insecure channel to ``localhost`` on the configured
    train gRPC port. On teardown the channel is closed and the global
    utility cache is cleared.
    """
    from nucliadb_protos.train_pb2_grpc import TrainStub

    from nucliadb.train.settings import settings

    channel = aio.insecure_channel(f"localhost:{settings.grpc_port}")
    try:
        yield TrainStub(channel)
    finally:
        try:
            # Close the channel so its connection is not leaked between tests.
            await channel.close()
        finally:
            clear_global_cache()
|
@@ -1,122 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
|
20
|
-
import asyncio
|
21
|
-
|
22
|
-
import aiohttp
|
23
|
-
import pytest
|
24
|
-
from nucliadb_protos.dataset_pb2 import FieldClassificationBatch, TaskType, TrainSet
|
25
|
-
from nucliadb_protos.knowledgebox_pb2 import Label, LabelSet
|
26
|
-
from nucliadb_protos.writer_pb2 import SetLabelsRequest
|
27
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
28
|
-
|
29
|
-
from nucliadb.tests.utils import inject_message
|
30
|
-
from nucliadb.tests.utils.broker_messages import BrokerMessageBuilder
|
31
|
-
from nucliadb.train import API_PREFIX
|
32
|
-
from nucliadb.train.api.v1.router import KB_PREFIX
|
33
|
-
from nucliadb.train.tests.utils import get_batches_from_train_response_stream
|
34
|
-
|
35
|
-
|
36
|
-
@pytest.mark.asyncio
@pytest.mark.parametrize("knowledgebox", ["STABLE", "EXPERIMENTAL"], indirect=True)
async def test_generator_field_classification(
    train_rest_api: aiohttp.ClientSession,
    knowledgebox_with_labels: str,
):
    """Check the batches streamed for a field classification trainset.

    The test fetches the single trainset partition of the knowledgebox, then
    requests the trainset with several label filters and checks, for each
    one, the number of batches received and the total number of items.
    """
    kbid = knowledgebox_with_labels
    trainset_url = f"/{API_PREFIX}/v1/{KB_PREFIX}/{kbid}/trainset"

    async with train_rest_api.get(trainset_url) as partitions:
        assert partitions.status == 200
        payload = await partitions.json()
        assert len(payload["partitions"]) == 1
        partition_id = payload["partitions"][0]

    trainset = TrainSet()
    trainset.type = TaskType.FIELD_CLASSIFICATION
    trainset.batch_size = 2

    # Each case: (label filter, expected batch count, expected item count)
    cases = [
        (["labelset_resources"], 2, 4),
        # 2 fields
        (["labelset_resources/label_user"], 1, 2),
        # unused label
        (["labelset_resources/label_alien"], 0, 0),
        # non existent
        (["nonexistent_labelset"], 0, 0),
    ]

    for label_filter, want_batches, want_total in cases:
        # Reset the filter so only the labels of this case apply.
        trainset.filter.ClearField("labels")
        trainset.filter.labels.extend(label_filter)  # type: ignore

        async with train_rest_api.post(
            f"{trainset_url}/{partition_id}",
            data=trainset.SerializeToString(),
        ) as response:
            assert response.status == 200
            received = [
                batch
                async for batch in get_batches_from_train_response_stream(
                    response, FieldClassificationBatch
                )
            ]
            assert len(received) == want_batches
            assert sum(len(batch.data) for batch in received) == want_total
|
84
|
-
|
85
|
-
|
86
|
-
@pytest.fixture(scope="function")
async def knowledgebox_with_labels(nucliadb_grpc: WriterStub, knowledgebox: str):
    """Set up labelsets and two labelled resources, then yield the kb id.

    Two labelsets are registered (kinds PARAGRAPHS and RESOURCES), each with
    the labels ``label_machine``, ``label_user`` and ``label_alien``. Two
    resources are then injected, one labelled ``label_user`` and the other
    ``label_machine`` in ``labelset_resources``; ``label_alien`` is left
    unused.

    No ``pytest.mark.asyncio`` is applied here: marks on fixtures have no
    effect and pytest deprecates applying them.
    """
    label_titles = ("label_machine", "label_user", "label_alien")
    labelset_kinds = (
        ("labelset_paragraphs", LabelSet.LabelSetKind.PARAGRAPHS),
        ("labelset_resources", LabelSet.LabelSetKind.RESOURCES),
    )
    for labelset_id, kind in labelset_kinds:
        slr = SetLabelsRequest()
        slr.kb.uuid = knowledgebox
        slr.id = labelset_id
        slr.labelset.kind.append(kind)
        for title in label_titles:
            slr.labelset.labels.append(Label(title=title))
        await nucliadb_grpc.SetLabels(slr)  # type: ignore

    # (title, summary, resource label in "labelset_resources")
    resources = (
        ("First resource", "First summary", "label_user"),
        ("Second resource", "Second summary", "label_machine"),
    )
    for title, summary, label in resources:
        bmb = BrokerMessageBuilder(kbid=knowledgebox)
        bmb.with_title(title)
        bmb.with_summary(summary)
        bmb.with_resource_labels("labelset_resources", [label])
        await inject_message(nucliadb_grpc, bmb.build())

    # NOTE(review): presumably gives the injected messages time to be
    # indexed before the test runs -- confirm.
    await asyncio.sleep(0.1)
    yield knowledgebox
|