nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0002_rollover_shards.py +1 -2
- migrations/0003_allfields_key.py +2 -37
- migrations/0004_rollover_shards.py +1 -2
- migrations/0005_rollover_shards.py +1 -2
- migrations/0006_rollover_shards.py +2 -4
- migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
- migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
- migrations/0010_fix_corrupt_indexes.py +11 -12
- migrations/0011_materialize_labelset_ids.py +2 -18
- migrations/0012_rollover_shards.py +6 -12
- migrations/0013_rollover_shards.py +2 -4
- migrations/0014_rollover_shards.py +5 -7
- migrations/0015_targeted_rollover.py +6 -12
- migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
- migrations/0017_multiple_writable_shards.py +3 -6
- migrations/0018_purge_orphan_kbslugs.py +59 -0
- migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
- migrations/0020_drain_nodes_from_cluster.py +83 -0
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +41 -24
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/single.py +1 -2
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +11 -16
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +223 -102
- nucliadb/common/cluster/rebalance.py +42 -37
- nucliadb/common/cluster/rollover.py +377 -204
- nucliadb/common/cluster/settings.py +16 -9
- nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +9 -6
- nucliadb/common/cluster/utils.py +43 -29
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +6 -4
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +24 -5
- nucliadb/common/datamanagers/atomic.py +102 -0
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +101 -24
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +214 -117
- nucliadb/common/datamanagers/rollover.py +77 -16
- nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
- nucliadb/common/datamanagers/utils.py +19 -11
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +43 -13
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +6 -6
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +13 -44
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exceptions.py +8 -0
- nucliadb/export_import/exporter.py +20 -7
- nucliadb/export_import/importer.py +6 -11
- nucliadb/export_import/models.py +5 -5
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +94 -54
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +30 -147
- nucliadb/ingest/consumer/consumer.py +96 -52
- nucliadb/ingest/consumer/materializer.py +10 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +20 -19
- nucliadb/ingest/consumer/shard_creator.py +7 -14
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +139 -188
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -25
- nucliadb/ingest/fields/link.py +11 -16
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +255 -262
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +334 -278
- nucliadb/ingest/orm/processor/__init__.py +2 -697
- nucliadb/ingest/orm/processor/auditing.py +117 -0
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +280 -520
- nucliadb/ingest/orm/utils.py +25 -31
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +76 -81
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -173
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +186 -577
- nucliadb/ingest/settings.py +13 -22
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +264 -51
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +57 -37
- nucliadb/migrator/settings.py +2 -1
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +3 -14
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +27 -94
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +13 -13
- nucliadb/reader/api/v1/learning_config.py +8 -12
- nucliadb/reader/api/v1/resource.py +67 -93
- nucliadb/reader/api/v1/services.py +70 -125
- nucliadb/reader/app.py +16 -46
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -31
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +2 -2
- nucliadb/search/api/v1/ask.py +112 -0
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +17 -25
- nucliadb/search/api/v1/find.py +41 -41
- nucliadb/search/api/v1/knowledgebox.py +90 -62
- nucliadb/search/api/v1/predict_proxy.py +2 -2
- nucliadb/search/api/v1/resource/ask.py +66 -117
- nucliadb/search/api/v1/resource/search.py +51 -72
- nucliadb/search/api/v1/router.py +1 -0
- nucliadb/search/api/v1/search.py +50 -197
- nucliadb/search/api/v1/suggest.py +40 -54
- nucliadb/search/api/v1/summarize.py +9 -5
- nucliadb/search/api/v1/utils.py +2 -1
- nucliadb/search/app.py +16 -48
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +176 -188
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +41 -63
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +918 -0
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +851 -282
- nucliadb/search/search/chat/query.py +274 -267
- nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -54
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +198 -234
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +386 -257
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +4 -38
- nucliadb/search/search/summarize.py +14 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +17 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +9 -12
- nucliadb/standalone/introspect.py +5 -5
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +58 -0
- nucliadb/standalone/purge.py +9 -8
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +25 -18
- nucliadb/standalone/settings.py +10 -14
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +2 -2
- nucliadb/train/api/v1/trainset.py +4 -6
- nucliadb/train/app.py +14 -47
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +45 -36
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +20 -25
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/writer/api/constants.py +0 -5
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +102 -49
- nucliadb/writer/api/v1/field.py +196 -620
- nucliadb/writer/api/v1/knowledgebox.py +221 -71
- nucliadb/writer/api/v1/learning_config.py +2 -2
- nucliadb/writer/api/v1/resource.py +114 -216
- nucliadb/writer/api/v1/services.py +64 -132
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +184 -215
- nucliadb/writer/app.py +11 -61
- nucliadb/writer/back_pressure.py +62 -43
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -62
- nucliadb/writer/resource/field.py +45 -135
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +14 -5
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +56 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -412
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -771
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -379
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -258
- nucliadb/search/api/v1/resource/chat.py +0 -94
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -465
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -201
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -584
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -736
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
- nucliadb/tests/migrations/__init__.py +0 -19
- nucliadb/tests/migrations/test_migration_0017.py +0 -80
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -294
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -93
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -60
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -84
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -138
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -167
- nucliadb/tests/utils/broker_messages/fields.py +0 -181
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -222
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -108
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/resource/vectors.py +0 -120
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -192
- nucliadb/writer/tests/test_fields.py +0 -486
- nucliadb/writer/tests/test_files.py +0 -743
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
- nucliadb/writer/tests/test_resources.py +0 -546
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
- nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,80 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import uuid
|
21
|
-
from unittest.mock import AsyncMock, Mock
|
22
|
-
|
23
|
-
import pytest
|
24
|
-
from nucliadb_protos.train_pb2 import GetEntitiesRequest
|
25
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
26
|
-
from nucliadb_protos.writer_pb2 import GetEntitiesResponse
|
27
|
-
|
28
|
-
|
29
|
-
@pytest.fixture(scope="function")
|
30
|
-
async def entities_manager_mock():
|
31
|
-
from nucliadb.train import nodes
|
32
|
-
|
33
|
-
original = nodes.EntitiesManager
|
34
|
-
|
35
|
-
mock = Mock()
|
36
|
-
nodes.EntitiesManager = Mock(return_value=mock)
|
37
|
-
|
38
|
-
yield mock
|
39
|
-
|
40
|
-
nodes.EntitiesManager = original
|
41
|
-
|
42
|
-
|
43
|
-
@pytest.mark.asyncio
|
44
|
-
async def test_get_entities(
|
45
|
-
train_client: TrainStub,
|
46
|
-
knowledgebox_ingest: str,
|
47
|
-
entities_manager_mock: Mock,
|
48
|
-
) -> None:
|
49
|
-
def get_entities_mock(response):
|
50
|
-
response.groups["group1"].entities["entity1"].value = "PERSON"
|
51
|
-
|
52
|
-
entities_manager_mock.get_entities = AsyncMock(side_effect=get_entities_mock)
|
53
|
-
|
54
|
-
req = GetEntitiesRequest()
|
55
|
-
req.kb.uuid = knowledgebox_ingest
|
56
|
-
entities: GetEntitiesResponse = await train_client.GetEntities(req) # type: ignore
|
57
|
-
|
58
|
-
assert entities.groups["group1"].entities["entity1"].value == "PERSON"
|
59
|
-
|
60
|
-
|
61
|
-
@pytest.mark.asyncio
|
62
|
-
async def test_get_entities_kb_not_found(train_client: TrainStub) -> None:
|
63
|
-
req = GetEntitiesRequest()
|
64
|
-
req.kb.uuid = str(uuid.uuid4())
|
65
|
-
entities: GetEntitiesResponse = await train_client.GetEntities(req) # type: ignore
|
66
|
-
assert entities.status == GetEntitiesResponse.Status.NOTFOUND
|
67
|
-
|
68
|
-
|
69
|
-
@pytest.mark.asyncio
|
70
|
-
async def test_get_entities_error(
|
71
|
-
train_client: TrainStub, knowledgebox_ingest: str, entities_manager_mock
|
72
|
-
) -> None:
|
73
|
-
entities_manager_mock.get_entities = AsyncMock(
|
74
|
-
side_effect=Exception("Testing exception on ingest")
|
75
|
-
)
|
76
|
-
|
77
|
-
req = GetEntitiesRequest()
|
78
|
-
req.kb.uuid = knowledgebox_ingest
|
79
|
-
entities: GetEntitiesResponse = await train_client.GetEntities(req) # type: ignore
|
80
|
-
assert entities.status == GetEntitiesResponse.Status.ERROR
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
|
20
|
-
import sys
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
from aioresponses import aioresponses
|
24
|
-
from nucliadb_protos.train_pb2 import GetInfoRequest, TrainInfo
|
25
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
26
|
-
|
27
|
-
VERSION = sys.version_info
|
28
|
-
PY_GEQ_3_11 = VERSION.major > 3 or VERSION.major == 3 and VERSION.minor >= 11
|
29
|
-
|
30
|
-
|
31
|
-
@pytest.mark.asyncio
|
32
|
-
@pytest.mark.skipif(
|
33
|
-
PY_GEQ_3_11, reason="aioresponses not compatible with python 3.11 yet"
|
34
|
-
)
|
35
|
-
async def test_get_info(
|
36
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
37
|
-
) -> None:
|
38
|
-
req = GetInfoRequest()
|
39
|
-
req.kb.uuid = knowledgebox_ingest
|
40
|
-
|
41
|
-
with aioresponses() as m:
|
42
|
-
m.get(
|
43
|
-
f"http://search.nuclia.svc.cluster.local:8030/api/v1/kb/{knowledgebox_ingest}/counters",
|
44
|
-
payload={"resources": 4, "paragraphs": 89, "fields": 4, "sentences": 90},
|
45
|
-
)
|
46
|
-
|
47
|
-
labels: TrainInfo = await train_client.GetInfo(req) # type: ignore
|
48
|
-
assert labels.fields == 4
|
49
|
-
assert labels.resources == 4
|
50
|
-
assert labels.paragraphs == 89
|
51
|
-
assert labels.sentences == 90
|
@@ -1,34 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import pytest
|
21
|
-
from nucliadb_protos.train_pb2 import GetLabelsRequest
|
22
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
23
|
-
from nucliadb_protos.writer_pb2 import GetLabelsResponse
|
24
|
-
|
25
|
-
|
26
|
-
@pytest.mark.asyncio
|
27
|
-
async def test_get_ontology(
|
28
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
29
|
-
) -> None:
|
30
|
-
req = GetLabelsRequest()
|
31
|
-
req.kb.uuid = knowledgebox_ingest
|
32
|
-
|
33
|
-
labels: GetLabelsResponse = await train_client.GetOntology(req) # type: ignore
|
34
|
-
assert labels.labels.labelset["label1"].labels[0].title == "label1"
|
@@ -1,63 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import sys
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
from aioresponses import aioresponses
|
24
|
-
from nucliadb_protos.train_pb2 import GetLabelsetsCountRequest, LabelsetsCount
|
25
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
26
|
-
|
27
|
-
VERSION = sys.version_info
|
28
|
-
PY_GEQ_3_11 = VERSION.major > 3 or VERSION.major == 3 and VERSION.minor >= 11
|
29
|
-
|
30
|
-
|
31
|
-
@pytest.mark.asyncio
|
32
|
-
@pytest.mark.skipif(
|
33
|
-
PY_GEQ_3_11, reason="aioresponses not compatible with python 3.11 yet"
|
34
|
-
)
|
35
|
-
async def test_get_ontology_count(
|
36
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
37
|
-
) -> None:
|
38
|
-
req = GetLabelsetsCountRequest()
|
39
|
-
req.kb.uuid = knowledgebox_ingest
|
40
|
-
|
41
|
-
with aioresponses() as m:
|
42
|
-
m.get(
|
43
|
-
f"http://search.nuclia.svc.cluster.local:8030/api/v1/kb/{knowledgebox_ingest}/search?faceted=/l/my-labelset", # noqa
|
44
|
-
payload={
|
45
|
-
"resources": {},
|
46
|
-
"sentences": {"results": [], "facets": {}},
|
47
|
-
"paragraphs": {
|
48
|
-
"results": [],
|
49
|
-
"facets": {
|
50
|
-
"/l/my-labelset": {
|
51
|
-
"facetresults": [
|
52
|
-
{"tag": "/l/my-labelset/Label 1", "total": 1}
|
53
|
-
]
|
54
|
-
}
|
55
|
-
},
|
56
|
-
},
|
57
|
-
"fulltext": {"results": [], "facets": {}},
|
58
|
-
},
|
59
|
-
)
|
60
|
-
|
61
|
-
req.resource_labelsets.append("my-labelset")
|
62
|
-
labels: LabelsetsCount = await train_client.GetOntologyCount(req) # type: ignore
|
63
|
-
assert labels.labelsets["/l/my-labelset"].paragraphs["Label 1"] == 1
|
@@ -1,222 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
|
20
|
-
import asyncio
|
21
|
-
import base64
|
22
|
-
import json
|
23
|
-
import os
|
24
|
-
from typing import Any
|
25
|
-
from unittest.mock import AsyncMock, patch
|
26
|
-
|
27
|
-
import aiohttp
|
28
|
-
import pytest
|
29
|
-
from nucliadb_protos.dataset_pb2 import ImageClassificationBatch, TaskType, TrainSet
|
30
|
-
from nucliadb_protos.resources_pb2 import (
|
31
|
-
CloudFile,
|
32
|
-
FileExtractedData,
|
33
|
-
FilePages,
|
34
|
-
PageStructure,
|
35
|
-
PageStructurePage,
|
36
|
-
PageStructureToken,
|
37
|
-
)
|
38
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage, OpStatusWriter
|
39
|
-
from nucliadb_protos.writer_pb2_grpc import WriterStub
|
40
|
-
|
41
|
-
from nucliadb.train import API_PREFIX
|
42
|
-
from nucliadb.train.api.v1.router import KB_PREFIX
|
43
|
-
from nucliadb.train.tests.utils import get_batches_from_train_response_stream
|
44
|
-
from nucliadb_utils.utilities import Utility, get_utility, set_utility
|
45
|
-
|
46
|
-
_dir = os.path.dirname(__file__)
|
47
|
-
_testdata_dir = os.path.join(_dir, "..", "..", "tests", "testdata")
|
48
|
-
|
49
|
-
INVOICE_FILENAME = os.path.join(_testdata_dir, "invoice.pdf")
|
50
|
-
INVOICE_SELECTIONS_FILENAME = os.path.join(_testdata_dir, "invoice_selections.json")
|
51
|
-
|
52
|
-
|
53
|
-
@pytest.mark.asyncio
|
54
|
-
@pytest.mark.parametrize("knowledgebox", ["STABLE", "EXPERIMENTAL"], indirect=True)
|
55
|
-
async def test_generation_image_classification(
|
56
|
-
train_rest_api: aiohttp.ClientSession,
|
57
|
-
knowledgebox: str,
|
58
|
-
image_classification_resource,
|
59
|
-
):
|
60
|
-
async with train_rest_api.get(
|
61
|
-
f"/{API_PREFIX}/v1/{KB_PREFIX}/{knowledgebox}/trainset"
|
62
|
-
) as partitions:
|
63
|
-
assert partitions.status == 200
|
64
|
-
data = await partitions.json()
|
65
|
-
assert len(data["partitions"]) == 1
|
66
|
-
partition_id = data["partitions"][0]
|
67
|
-
|
68
|
-
trainset = TrainSet()
|
69
|
-
trainset.type = TaskType.IMAGE_CLASSIFICATION
|
70
|
-
trainset.batch_size = 10
|
71
|
-
|
72
|
-
await asyncio.sleep(0.1)
|
73
|
-
async with train_rest_api.post(
|
74
|
-
f"/{API_PREFIX}/v1/{KB_PREFIX}/{knowledgebox}/trainset/{partition_id}",
|
75
|
-
data=trainset.SerializeToString(),
|
76
|
-
) as response:
|
77
|
-
assert response.status == 200
|
78
|
-
batches = []
|
79
|
-
async for batch in get_batches_from_train_response_stream(
|
80
|
-
response, ImageClassificationBatch
|
81
|
-
):
|
82
|
-
batches.append(batch)
|
83
|
-
assert len(batch.data) == 1
|
84
|
-
selections = json.loads(batch.data[0].selections)
|
85
|
-
assert selections["width"] == 10
|
86
|
-
assert selections["height"] == 10
|
87
|
-
assert len(selections["tokens"]) == 87
|
88
|
-
assert len(selections["annotations"]) == 18
|
89
|
-
assert batch.data[0].page_uri == "DUMMY-URI"
|
90
|
-
assert len(batches) == 1
|
91
|
-
|
92
|
-
|
93
|
-
@pytest.fixture
|
94
|
-
@pytest.mark.asyncio
|
95
|
-
async def image_classification_resource(
|
96
|
-
writer_rest_api: aiohttp.ClientSession, nucliadb_grpc: WriterStub, knowledgebox: str
|
97
|
-
):
|
98
|
-
kbid = knowledgebox
|
99
|
-
field_id = "invoice"
|
100
|
-
|
101
|
-
with open(INVOICE_SELECTIONS_FILENAME) as f:
|
102
|
-
selections = json.load(f)
|
103
|
-
assert len(selections["tokens"]) == 87
|
104
|
-
assert len(selections["annotations"]) == 18
|
105
|
-
|
106
|
-
fieldmetadata = generate_image_classification_fieldmetadata(selections, field_id)
|
107
|
-
|
108
|
-
with open(INVOICE_FILENAME, "rb") as f:
|
109
|
-
invoice_content = f.read()
|
110
|
-
|
111
|
-
resp = await writer_rest_api.post(
|
112
|
-
f"/{API_PREFIX}/v1/{KB_PREFIX}/{knowledgebox}/resources",
|
113
|
-
headers={"x-synchronous": "true"},
|
114
|
-
json={
|
115
|
-
"title": "My invoice",
|
116
|
-
"files": {
|
117
|
-
field_id: {
|
118
|
-
"file": {
|
119
|
-
"filename": "invoice.pdf",
|
120
|
-
"content_type": "application/pdf",
|
121
|
-
"payload": base64.b64encode(invoice_content).decode(),
|
122
|
-
}
|
123
|
-
}
|
124
|
-
},
|
125
|
-
"fieldmetadata": fieldmetadata,
|
126
|
-
},
|
127
|
-
)
|
128
|
-
assert resp.status == 201
|
129
|
-
body = await resp.json()
|
130
|
-
rid = body["uuid"]
|
131
|
-
|
132
|
-
broker_message = generate_image_classification_broker_message(
|
133
|
-
selections, kbid, rid, field_id
|
134
|
-
)
|
135
|
-
|
136
|
-
original_storage = get_utility(Utility.STORAGE)
|
137
|
-
set_utility(Utility.STORAGE, AsyncMock())
|
138
|
-
mock_set = AsyncMock(return_value=None)
|
139
|
-
mock_get = AsyncMock(return_value=broker_message.file_extracted_data[0])
|
140
|
-
with (
|
141
|
-
patch(
|
142
|
-
"nucliadb.ingest.fields.file.File.set_file_extracted_data", new=mock_set
|
143
|
-
) as _,
|
144
|
-
patch(
|
145
|
-
"nucliadb.ingest.fields.file.File.get_file_extracted_data", new=mock_get
|
146
|
-
) as _,
|
147
|
-
):
|
148
|
-
resp = await nucliadb_grpc.ProcessMessage( # type: ignore
|
149
|
-
iter([broker_message]), timeout=10, wait_for_ready=True
|
150
|
-
)
|
151
|
-
assert resp.status == OpStatusWriter.Status.OK
|
152
|
-
yield
|
153
|
-
|
154
|
-
set_utility(Utility.STORAGE, original_storage)
|
155
|
-
|
156
|
-
|
157
|
-
def generate_image_classification_fieldmetadata(
|
158
|
-
selections: dict, field_id: str
|
159
|
-
) -> list[dict[str, Any]]:
|
160
|
-
selections_by_page = {} # type: ignore
|
161
|
-
for annotation in selections["annotations"]:
|
162
|
-
page_selections = selections_by_page.setdefault(annotation["page"], [])
|
163
|
-
page_selections.append(
|
164
|
-
{
|
165
|
-
"label": annotation["label"]["text"],
|
166
|
-
"top": annotation["bounds"]["top"],
|
167
|
-
"left": annotation["bounds"]["left"],
|
168
|
-
"right": annotation["bounds"]["right"],
|
169
|
-
"bottom": annotation["bounds"]["bottom"],
|
170
|
-
"token_ids": [token["tokenIndex"] for token in annotation["tokens"]],
|
171
|
-
}
|
172
|
-
)
|
173
|
-
|
174
|
-
fieldmetadata = {
|
175
|
-
"field": {"field": field_id, "field_type": "file"},
|
176
|
-
"selections": [
|
177
|
-
{
|
178
|
-
"page": page,
|
179
|
-
"visual": selections,
|
180
|
-
}
|
181
|
-
for page, selections in selections_by_page.items()
|
182
|
-
],
|
183
|
-
}
|
184
|
-
return [fieldmetadata]
|
185
|
-
|
186
|
-
|
187
|
-
def generate_image_classification_broker_message(
|
188
|
-
selections: dict, kbid: str, rid: str, field_id: str
|
189
|
-
) -> BrokerMessage:
|
190
|
-
bm = BrokerMessage(
|
191
|
-
kbid=kbid,
|
192
|
-
uuid=rid,
|
193
|
-
source=BrokerMessage.MessageSource.PROCESSOR,
|
194
|
-
file_extracted_data=[
|
195
|
-
FileExtractedData(
|
196
|
-
field=field_id,
|
197
|
-
file_pages_previews=FilePages(
|
198
|
-
pages=[
|
199
|
-
CloudFile(uri="DUMMY-URI"),
|
200
|
-
],
|
201
|
-
structures=[
|
202
|
-
PageStructure(
|
203
|
-
page=PageStructurePage(width=10, height=10),
|
204
|
-
tokens=[
|
205
|
-
PageStructureToken(
|
206
|
-
x=token["x"],
|
207
|
-
y=token["y"],
|
208
|
-
width=token["width"],
|
209
|
-
height=token["height"],
|
210
|
-
text=token["text"],
|
211
|
-
line=0,
|
212
|
-
)
|
213
|
-
for token in selections["tokens"]
|
214
|
-
],
|
215
|
-
)
|
216
|
-
],
|
217
|
-
),
|
218
|
-
)
|
219
|
-
],
|
220
|
-
)
|
221
|
-
|
222
|
-
return bm
|
@@ -1,39 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import pytest
|
21
|
-
from nucliadb_protos.train_pb2 import GetFieldsRequest
|
22
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
23
|
-
|
24
|
-
|
25
|
-
@pytest.mark.asyncio
|
26
|
-
async def test_list_fields(
|
27
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
28
|
-
) -> None:
|
29
|
-
req = GetFieldsRequest()
|
30
|
-
req.kb.uuid = knowledgebox_ingest
|
31
|
-
req.metadata.entities = True
|
32
|
-
req.metadata.labels = True
|
33
|
-
req.metadata.text = True
|
34
|
-
req.metadata.vector = True
|
35
|
-
count = 0
|
36
|
-
async for _ in train_client.GetParagraphs(req): # type: ignore
|
37
|
-
count += 1
|
38
|
-
|
39
|
-
assert count == 30
|
@@ -1,73 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import pytest
|
21
|
-
from nucliadb_protos.train_pb2 import GetParagraphsRequest
|
22
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
23
|
-
|
24
|
-
|
25
|
-
@pytest.mark.asyncio
|
26
|
-
async def test_list_paragraphs(
|
27
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
28
|
-
) -> None:
|
29
|
-
req = GetParagraphsRequest()
|
30
|
-
req.kb.uuid = knowledgebox_ingest
|
31
|
-
req.metadata.entities = True
|
32
|
-
req.metadata.labels = True
|
33
|
-
req.metadata.text = True
|
34
|
-
req.metadata.vector = True
|
35
|
-
count = 0
|
36
|
-
async for _ in train_client.GetParagraphs(req): # type: ignore
|
37
|
-
count += 1
|
38
|
-
|
39
|
-
assert count == 30
|
40
|
-
|
41
|
-
|
42
|
-
@pytest.mark.asyncio
|
43
|
-
async def test_list_paragraphs_shows_ners_with_positions(
|
44
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
45
|
-
) -> None:
|
46
|
-
req = GetParagraphsRequest()
|
47
|
-
req.kb.uuid = knowledgebox_ingest
|
48
|
-
req.metadata.entities = True
|
49
|
-
req.metadata.labels = True
|
50
|
-
req.metadata.text = True
|
51
|
-
req.metadata.vector = True
|
52
|
-
|
53
|
-
found_barcelona = found_manresa = False
|
54
|
-
async for paragraph in train_client.GetParagraphs(req): # type: ignore
|
55
|
-
if "Barcelona" in paragraph.metadata.text:
|
56
|
-
found_barcelona = True
|
57
|
-
assert paragraph.metadata.entities == {"Barcelona": "CITY"}
|
58
|
-
positions = paragraph.metadata.entity_positions["CITY/Barcelona"]
|
59
|
-
assert positions.entity == "Barcelona"
|
60
|
-
assert len(positions.positions) == 1
|
61
|
-
assert positions.positions[0].start == 43
|
62
|
-
assert positions.positions[0].end == 52
|
63
|
-
elif "Manresa" in paragraph.metadata.text:
|
64
|
-
found_manresa = True
|
65
|
-
assert paragraph.metadata.entities == {"Manresa": "CITY"}
|
66
|
-
positions = paragraph.metadata.entity_positions["CITY/Manresa"]
|
67
|
-
assert positions.entity == "Manresa"
|
68
|
-
assert len(positions.positions) == 2
|
69
|
-
assert positions.positions[0].start == 22
|
70
|
-
assert positions.positions[0].end == 29
|
71
|
-
assert positions.positions[1].start == 38
|
72
|
-
assert positions.positions[1].end == 45
|
73
|
-
assert found_manresa and found_barcelona
|
@@ -1,39 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import pytest
|
21
|
-
from nucliadb_protos.train_pb2 import GetResourcesRequest
|
22
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
23
|
-
|
24
|
-
|
25
|
-
@pytest.mark.asyncio
|
26
|
-
async def test_list_resource(
|
27
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
28
|
-
) -> None:
|
29
|
-
req = GetResourcesRequest()
|
30
|
-
req.kb.uuid = knowledgebox_ingest
|
31
|
-
req.metadata.entities = True
|
32
|
-
req.metadata.labels = True
|
33
|
-
req.metadata.text = True
|
34
|
-
req.metadata.vector = True
|
35
|
-
count = 0
|
36
|
-
async for _ in train_client.GetResources(req): # type: ignore
|
37
|
-
count += 1
|
38
|
-
|
39
|
-
assert count == 10
|
@@ -1,71 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import pytest
|
21
|
-
from nucliadb_protos.train_pb2 import GetSentencesRequest
|
22
|
-
from nucliadb_protos.train_pb2_grpc import TrainStub
|
23
|
-
|
24
|
-
|
25
|
-
@pytest.mark.asyncio
|
26
|
-
async def test_list_sentences(
|
27
|
-
train_client: TrainStub, knowledgebox_ingest: str, test_pagination_resources
|
28
|
-
) -> None:
|
29
|
-
req = GetSentencesRequest()
|
30
|
-
req.kb.uuid = knowledgebox_ingest
|
31
|
-
req.metadata.entities = True
|
32
|
-
req.metadata.labels = True
|
33
|
-
req.metadata.text = True
|
34
|
-
req.metadata.vector = True
|
35
|
-
count = 0
|
36
|
-
|
37
|
-
async for _ in train_client.GetSentences(req): # type: ignore
|
38
|
-
count += 1
|
39
|
-
|
40
|
-
assert count == 40
|
41
|
-
|
42
|
-
|
43
|
-
@pytest.mark.asyncio
|
44
|
-
@pytest.mark.parametrize("knowledgebox", ["STABLE", "EXPERIMENTAL"], indirect=True)
|
45
|
-
async def test_list_sentences_shows_ners_with_positions(
|
46
|
-
train_client: TrainStub, knowledgebox: str, test_pagination_resources
|
47
|
-
) -> None:
|
48
|
-
req = GetSentencesRequest()
|
49
|
-
req.kb.uuid = knowledgebox
|
50
|
-
req.metadata.entities = True
|
51
|
-
async for sentence in train_client.GetSentences(req): # type: ignore
|
52
|
-
if "Barcelona" in sentence.metadata.text:
|
53
|
-
assert sentence.metadata.entities == {"Barcelona": "CITY"}
|
54
|
-
positions = sentence.metadata.entity_positions["CITY/Barcelona"]
|
55
|
-
assert positions.entity == "Barcelona"
|
56
|
-
assert len(positions.positions) == 1
|
57
|
-
assert positions.positions[0].start == 43
|
58
|
-
assert positions.positions[0].end == 52
|
59
|
-
elif "Manresa" in sentence.metadata.text:
|
60
|
-
assert sentence.metadata.entities == {"Manresa": "CITY"}
|
61
|
-
positions = sentence.metadata.entity_positions["CITY/Manresa"]
|
62
|
-
assert positions.entity == "Manresa"
|
63
|
-
assert len(positions.positions) == 2
|
64
|
-
assert positions.positions[0].start == 22
|
65
|
-
assert positions.positions[0].end == 29
|
66
|
-
assert positions.positions[1].start == 38
|
67
|
-
assert positions.positions[1].end == 45
|
68
|
-
else:
|
69
|
-
# Other sentences should not have entities nor positions
|
70
|
-
assert sentence.metadata.entities == {}
|
71
|
-
assert sentence.metadata.entity_positions == {}
|