nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,764 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import logging
|
21
|
-
import uuid
|
22
|
-
from dataclasses import dataclass
|
23
|
-
from datetime import datetime
|
24
|
-
from os.path import dirname, getsize
|
25
|
-
from typing import Optional
|
26
|
-
from unittest.mock import AsyncMock, patch
|
27
|
-
|
28
|
-
import nats
|
29
|
-
import pytest
|
30
|
-
from grpc import aio
|
31
|
-
from nucliadb_protos.knowledgebox_pb2 import SemanticModelMetadata
|
32
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
33
|
-
|
34
|
-
from nucliadb.common.cluster import manager
|
35
|
-
from nucliadb.common.cluster.settings import settings as cluster_settings
|
36
|
-
from nucliadb.common.maindb.driver import Driver
|
37
|
-
from nucliadb.ingest.consumer import service as consumer_service
|
38
|
-
from nucliadb.ingest.fields.base import Field
|
39
|
-
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
40
|
-
from nucliadb.ingest.orm.processor import Processor
|
41
|
-
from nucliadb.ingest.orm.resource import KB_REVERSE, Resource
|
42
|
-
from nucliadb.ingest.service.writer import WriterServicer
|
43
|
-
from nucliadb.ingest.settings import settings
|
44
|
-
from nucliadb.ingest.tests.vectors import V1, V2, V3
|
45
|
-
from nucliadb.learning_proxy import LearningConfiguration
|
46
|
-
from nucliadb_protos import resources_pb2 as rpb
|
47
|
-
from nucliadb_protos import utils_pb2 as upb
|
48
|
-
from nucliadb_protos import writer_pb2_grpc
|
49
|
-
from nucliadb_utils import const
|
50
|
-
from nucliadb_utils.audit.basic import BasicAuditStorage
|
51
|
-
from nucliadb_utils.audit.stream import StreamAuditStorage
|
52
|
-
from nucliadb_utils.cache.nats import NatsPubsub
|
53
|
-
from nucliadb_utils.indexing import IndexingUtility
|
54
|
-
from nucliadb_utils.settings import indexing_settings, transaction_settings
|
55
|
-
from nucliadb_utils.storages.settings import settings as storage_settings
|
56
|
-
from nucliadb_utils.storages.storage import Storage
|
57
|
-
from nucliadb_utils.utilities import (
|
58
|
-
Utility,
|
59
|
-
clean_utility,
|
60
|
-
clear_global_cache,
|
61
|
-
get_utility,
|
62
|
-
set_utility,
|
63
|
-
start_nats_manager,
|
64
|
-
start_transaction_utility,
|
65
|
-
stop_nats_manager,
|
66
|
-
stop_transaction_utility,
|
67
|
-
)
|
68
|
-
|
69
|
-
logger = logging.getLogger(__name__)
|
70
|
-
|
71
|
-
|
72
|
-
@pytest.fixture(scope="function")
|
73
|
-
async def processor(maindb_driver, storage, pubsub):
|
74
|
-
proc = Processor(maindb_driver, storage, pubsub, partition="1")
|
75
|
-
yield proc
|
76
|
-
|
77
|
-
|
78
|
-
@pytest.fixture(scope="function")
|
79
|
-
async def stream_processor(maindb_driver, storage, pubsub):
|
80
|
-
proc = Processor(maindb_driver, storage, pubsub, partition="1")
|
81
|
-
yield proc
|
82
|
-
|
83
|
-
|
84
|
-
@pytest.fixture(scope="function")
|
85
|
-
async def local_files():
|
86
|
-
storage_settings.local_testing_files = f"{dirname(__file__)}"
|
87
|
-
|
88
|
-
|
89
|
-
@dataclass
|
90
|
-
class IngestFixture:
|
91
|
-
servicer: WriterServicer
|
92
|
-
channel: aio.Channel
|
93
|
-
host: str
|
94
|
-
serv: aio.Server
|
95
|
-
|
96
|
-
|
97
|
-
@pytest.fixture(scope="function")
|
98
|
-
async def ingest_consumers(
|
99
|
-
redis_config, transaction_utility, storage, fake_node, nats_manager
|
100
|
-
):
|
101
|
-
ingest_consumers_finalizer = await consumer_service.start_ingest_consumers()
|
102
|
-
|
103
|
-
yield
|
104
|
-
|
105
|
-
await ingest_consumers_finalizer()
|
106
|
-
clear_global_cache()
|
107
|
-
|
108
|
-
|
109
|
-
@pytest.fixture(scope="function")
|
110
|
-
async def ingest_processed_consumer(
|
111
|
-
redis_config, transaction_utility, storage, fake_node, nats_manager
|
112
|
-
):
|
113
|
-
ingest_consumer_finalizer = await consumer_service.start_ingest_processed_consumer()
|
114
|
-
|
115
|
-
yield
|
116
|
-
|
117
|
-
await ingest_consumer_finalizer()
|
118
|
-
clear_global_cache()
|
119
|
-
|
120
|
-
|
121
|
-
@pytest.fixture(scope="function")
|
122
|
-
async def grpc_servicer(
|
123
|
-
maindb_driver, ingest_consumers, ingest_processed_consumer, learning_config
|
124
|
-
):
|
125
|
-
servicer = WriterServicer()
|
126
|
-
await servicer.initialize()
|
127
|
-
|
128
|
-
server = aio.server()
|
129
|
-
port = server.add_insecure_port("[::]:0")
|
130
|
-
writer_pb2_grpc.add_WriterServicer_to_server(servicer, server)
|
131
|
-
await server.start()
|
132
|
-
_channel = aio.insecure_channel(f"127.0.0.1:{port}")
|
133
|
-
yield IngestFixture(
|
134
|
-
channel=_channel,
|
135
|
-
serv=server,
|
136
|
-
servicer=servicer,
|
137
|
-
host=f"127.0.0.1:{port}",
|
138
|
-
)
|
139
|
-
await servicer.finalize()
|
140
|
-
await _channel.close()
|
141
|
-
await server.stop(None)
|
142
|
-
|
143
|
-
|
144
|
-
@pytest.fixture(scope="function")
|
145
|
-
async def pubsub(natsd):
|
146
|
-
pubsub = get_utility(Utility.PUBSUB)
|
147
|
-
if pubsub is None:
|
148
|
-
pubsub = NatsPubsub(hosts=[natsd])
|
149
|
-
await pubsub.initialize()
|
150
|
-
set_utility(Utility.PUBSUB, pubsub)
|
151
|
-
|
152
|
-
yield pubsub
|
153
|
-
clean_utility(Utility.PUBSUB)
|
154
|
-
await pubsub.finalize()
|
155
|
-
|
156
|
-
|
157
|
-
@pytest.fixture(scope="function")
|
158
|
-
async def fake_node(indexing_utility, shard_manager):
|
159
|
-
manager.INDEX_NODES.clear()
|
160
|
-
manager.add_index_node(
|
161
|
-
id=str(uuid.uuid4()),
|
162
|
-
address="nohost",
|
163
|
-
shard_count=0,
|
164
|
-
available_disk=100,
|
165
|
-
dummy=True,
|
166
|
-
)
|
167
|
-
manager.add_index_node(
|
168
|
-
id=str(uuid.uuid4()),
|
169
|
-
address="nohost",
|
170
|
-
shard_count=0,
|
171
|
-
available_disk=100,
|
172
|
-
dummy=True,
|
173
|
-
)
|
174
|
-
|
175
|
-
with patch.object(cluster_settings, "standalone_mode", False):
|
176
|
-
yield
|
177
|
-
|
178
|
-
manager.INDEX_NODES.clear()
|
179
|
-
|
180
|
-
|
181
|
-
@pytest.fixture()
|
182
|
-
def learning_config():
|
183
|
-
lconfig = LearningConfiguration(
|
184
|
-
semantic_model="multilingual",
|
185
|
-
semantic_threshold=None,
|
186
|
-
semantic_vector_size=None,
|
187
|
-
semantic_vector_similarity="cosine",
|
188
|
-
)
|
189
|
-
with patch("nucliadb.ingest.service.writer.learning_proxy") as mocked:
|
190
|
-
mocked.set_configuration = AsyncMock(return_value=None)
|
191
|
-
mocked.get_configuration = AsyncMock(return_value=lconfig)
|
192
|
-
mocked.delete_configuration = AsyncMock(return_value=None)
|
193
|
-
yield mocked
|
194
|
-
|
195
|
-
|
196
|
-
@pytest.fixture(scope="function")
|
197
|
-
async def knowledgebox_ingest(
|
198
|
-
storage, maindb_driver: Driver, shard_manager, learning_config
|
199
|
-
):
|
200
|
-
kbid = str(uuid.uuid4())
|
201
|
-
kbslug = str(uuid.uuid4())
|
202
|
-
async with maindb_driver.transaction() as txn:
|
203
|
-
model = SemanticModelMetadata(similarity_function=upb.VectorSimilarity.COSINE)
|
204
|
-
await KnowledgeBox.create(txn, kbslug, model, uuid=kbid)
|
205
|
-
await txn.commit()
|
206
|
-
|
207
|
-
yield kbid
|
208
|
-
|
209
|
-
async with maindb_driver.transaction() as txn:
|
210
|
-
await KnowledgeBox.delete_kb(txn, kbid)
|
211
|
-
await txn.commit()
|
212
|
-
|
213
|
-
|
214
|
-
@pytest.fixture(scope="function")
|
215
|
-
async def audit():
|
216
|
-
return BasicAuditStorage()
|
217
|
-
|
218
|
-
|
219
|
-
@pytest.fixture(scope="function")
|
220
|
-
async def stream_audit(natsd: str):
|
221
|
-
from nucliadb_utils.settings import audit_settings
|
222
|
-
|
223
|
-
audit = StreamAuditStorage(
|
224
|
-
[natsd],
|
225
|
-
audit_settings.audit_jetstream_target, # type: ignore
|
226
|
-
audit_settings.audit_partitions,
|
227
|
-
audit_settings.audit_hash_seed,
|
228
|
-
)
|
229
|
-
await audit.initialize()
|
230
|
-
yield audit
|
231
|
-
await audit.finalize()
|
232
|
-
|
233
|
-
|
234
|
-
@pytest.fixture(scope="function")
|
235
|
-
async def indexing_utility(natsd, _clean_natsd):
|
236
|
-
indexing_utility = IndexingUtility(
|
237
|
-
nats_creds=indexing_settings.index_jetstream_auth,
|
238
|
-
nats_servers=indexing_settings.index_jetstream_servers,
|
239
|
-
dummy=True,
|
240
|
-
)
|
241
|
-
await indexing_utility.initialize()
|
242
|
-
set_utility(Utility.INDEXING, indexing_utility)
|
243
|
-
|
244
|
-
yield
|
245
|
-
|
246
|
-
clean_utility(Utility.INDEXING)
|
247
|
-
await indexing_utility.finalize()
|
248
|
-
|
249
|
-
|
250
|
-
@pytest.fixture(scope="function")
|
251
|
-
async def _clean_natsd(natsd):
|
252
|
-
nc = await nats.connect(servers=[natsd])
|
253
|
-
js = nc.jetstream()
|
254
|
-
|
255
|
-
consumers = [
|
256
|
-
(const.Streams.INGEST.name, const.Streams.INGEST.group.format(partition="1")),
|
257
|
-
(const.Streams.INGEST_PROCESSED.name, const.Streams.INGEST_PROCESSED.group),
|
258
|
-
(const.Streams.INDEX.name, const.Streams.INDEX.group.format(node="1")),
|
259
|
-
]
|
260
|
-
for stream, consumer in consumers:
|
261
|
-
try:
|
262
|
-
await js.delete_consumer(stream, consumer)
|
263
|
-
except nats.js.errors.NotFoundError:
|
264
|
-
pass
|
265
|
-
|
266
|
-
streams = [
|
267
|
-
(const.Streams.INGEST.name, const.Streams.INGEST.subject.format(partition=">")),
|
268
|
-
(const.Streams.INDEX.name, const.Streams.INDEX.subject.format(node="*")),
|
269
|
-
]
|
270
|
-
for stream, subject in streams:
|
271
|
-
try:
|
272
|
-
await js.delete_stream(stream)
|
273
|
-
except nats.js.errors.NotFoundError:
|
274
|
-
pass
|
275
|
-
|
276
|
-
await js.add_stream(name=stream, subjects=[subject])
|
277
|
-
|
278
|
-
await nc.drain()
|
279
|
-
await nc.close()
|
280
|
-
|
281
|
-
indexing_settings.index_jetstream_servers = [natsd]
|
282
|
-
|
283
|
-
yield
|
284
|
-
|
285
|
-
|
286
|
-
@pytest.fixture(scope="function")
|
287
|
-
async def nats_manager(natsd):
|
288
|
-
ncm = await start_nats_manager("service_name", [natsd], None)
|
289
|
-
yield ncm
|
290
|
-
await stop_nats_manager()
|
291
|
-
|
292
|
-
|
293
|
-
@pytest.fixture(scope="function")
|
294
|
-
async def transaction_utility(natsd, pubsub):
|
295
|
-
transaction_settings.transaction_jetstream_servers = [natsd]
|
296
|
-
util = await start_transaction_utility()
|
297
|
-
yield util
|
298
|
-
await stop_transaction_utility()
|
299
|
-
|
300
|
-
|
301
|
-
THUMBNAIL = rpb.CloudFile(
|
302
|
-
uri="thumbnail.png",
|
303
|
-
source=rpb.CloudFile.Source.LOCAL,
|
304
|
-
bucket_name="/integration/orm/assets",
|
305
|
-
size=getsize(f"{dirname(__file__)}/integration/orm/assets/thumbnail.png"),
|
306
|
-
content_type="image/png",
|
307
|
-
filename="thumbnail.png",
|
308
|
-
)
|
309
|
-
|
310
|
-
TEST_CLOUDFILE_FILENAME = "text.pb"
|
311
|
-
TEST_CLOUDFILE = rpb.CloudFile(
|
312
|
-
uri=TEST_CLOUDFILE_FILENAME,
|
313
|
-
source=rpb.CloudFile.Source.LOCAL,
|
314
|
-
bucket_name="/integration/orm/assets",
|
315
|
-
size=getsize(
|
316
|
-
f"{dirname(__file__)}/integration/orm/assets/{TEST_CLOUDFILE_FILENAME}"
|
317
|
-
),
|
318
|
-
content_type="application/octet-stream",
|
319
|
-
filename=TEST_CLOUDFILE_FILENAME,
|
320
|
-
md5="01cca3f53edb934a445a3112c6caa652",
|
321
|
-
)
|
322
|
-
|
323
|
-
|
324
|
-
# HELPERS
|
325
|
-
|
326
|
-
|
327
|
-
async def make_field(field, extracted_text):
|
328
|
-
await field.set_extracted_text(make_extracted_text(field.id, body=extracted_text))
|
329
|
-
await field.set_field_metadata(make_field_metadata(field.id))
|
330
|
-
await field.set_large_field_metadata(make_field_large_metadata(field.id))
|
331
|
-
await field.set_vectors(make_extracted_vectors(field.id))
|
332
|
-
|
333
|
-
|
334
|
-
def make_extracted_text(field_id, body: str):
|
335
|
-
ex1 = rpb.ExtractedTextWrapper()
|
336
|
-
ex1.field.CopyFrom(rpb.FieldID(field_type=rpb.FieldType.TEXT, field=field_id))
|
337
|
-
ex1.body.text = body
|
338
|
-
return ex1
|
339
|
-
|
340
|
-
|
341
|
-
def make_field_metadata(field_id):
|
342
|
-
ex1 = rpb.FieldComputedMetadataWrapper()
|
343
|
-
ex1.field.CopyFrom(rpb.FieldID(field_type=rpb.FieldType.TEXT, field=field_id))
|
344
|
-
ex1.metadata.metadata.links.append("https://nuclia.com")
|
345
|
-
|
346
|
-
p1 = rpb.Paragraph(start=0, end=20)
|
347
|
-
p1.sentences.append(rpb.Sentence(start=0, end=20, key=""))
|
348
|
-
cl1 = rpb.Classification(labelset="labelset1", label="label1")
|
349
|
-
cl2 = rpb.Classification(labelset="paragraph-labelset", label="label1")
|
350
|
-
p1.classifications.append(cl2)
|
351
|
-
ex1.metadata.metadata.paragraphs.append(p1)
|
352
|
-
ex1.metadata.metadata.classifications.append(cl1)
|
353
|
-
# ex1.metadata.metadata.ner["Ramon"] = "PEOPLE"
|
354
|
-
ex1.metadata.metadata.last_index.FromDatetime(datetime.now())
|
355
|
-
ex1.metadata.metadata.last_understanding.FromDatetime(datetime.now())
|
356
|
-
ex1.metadata.metadata.last_extract.FromDatetime(datetime.now())
|
357
|
-
ex1.metadata.metadata.last_summary.FromDatetime(datetime.now())
|
358
|
-
ex1.metadata.metadata.thumbnail.CopyFrom(THUMBNAIL)
|
359
|
-
ex1.metadata.metadata.positions["ENTITY/document"].entity = "document"
|
360
|
-
ex1.metadata.metadata.positions["ENTITY/document"].position.extend(
|
361
|
-
[rpb.Position(start=0, end=5), rpb.Position(start=13, end=18)]
|
362
|
-
)
|
363
|
-
return ex1
|
364
|
-
|
365
|
-
|
366
|
-
def make_field_large_metadata(field_id):
|
367
|
-
ex1 = rpb.LargeComputedMetadataWrapper()
|
368
|
-
ex1.field.CopyFrom(rpb.FieldID(field_type=rpb.FieldType.TEXT, field=field_id))
|
369
|
-
en1 = rpb.Entity(token="tok1", root="tok", type="NAME")
|
370
|
-
en2 = rpb.Entity(token="tok2", root="tok2", type="NAME")
|
371
|
-
ex1.real.metadata.entities.append(en1)
|
372
|
-
ex1.real.metadata.entities.append(en2)
|
373
|
-
ex1.real.metadata.tokens["tok"] = 3
|
374
|
-
return ex1
|
375
|
-
|
376
|
-
|
377
|
-
def make_extracted_vectors(field_id):
|
378
|
-
ex1 = rpb.ExtractedVectorsWrapper()
|
379
|
-
ex1.field.CopyFrom(rpb.FieldID(field_type=rpb.FieldType.TEXT, field=field_id))
|
380
|
-
v1 = rpb.Vector(start=0, end=20, vector=b"ansjkdn")
|
381
|
-
ex1.vectors.vectors.vectors.append(v1)
|
382
|
-
return ex1
|
383
|
-
|
384
|
-
|
385
|
-
@pytest.fixture(scope="function")
|
386
|
-
async def test_resource(storage, maindb_driver, knowledgebox_ingest, fake_node):
|
387
|
-
"""
|
388
|
-
Create a resource that has every possible bit of information
|
389
|
-
"""
|
390
|
-
resource = await create_resource(
|
391
|
-
storage=storage,
|
392
|
-
driver=maindb_driver,
|
393
|
-
knowledgebox_ingest=knowledgebox_ingest,
|
394
|
-
)
|
395
|
-
yield resource
|
396
|
-
resource.clean()
|
397
|
-
|
398
|
-
|
399
|
-
@pytest.fixture(scope="function")
|
400
|
-
def partition_settings():
|
401
|
-
settings.replica_number = 1
|
402
|
-
settings.total_replicas = 4
|
403
|
-
|
404
|
-
yield settings
|
405
|
-
|
406
|
-
|
407
|
-
def broker_resource(
|
408
|
-
knowledgebox: str, rid: Optional[str] = None, slug: Optional[str] = None
|
409
|
-
) -> BrokerMessage:
|
410
|
-
if rid is None:
|
411
|
-
rid = str(uuid.uuid4())
|
412
|
-
if slug is None:
|
413
|
-
slug = f"{rid}slug1"
|
414
|
-
|
415
|
-
message1: BrokerMessage = BrokerMessage(
|
416
|
-
kbid=knowledgebox,
|
417
|
-
uuid=rid,
|
418
|
-
slug=slug,
|
419
|
-
type=BrokerMessage.AUTOCOMMIT,
|
420
|
-
)
|
421
|
-
|
422
|
-
message1.basic.icon = "text/plain"
|
423
|
-
message1.basic.title = "Title Resource"
|
424
|
-
message1.basic.summary = "Summary of document"
|
425
|
-
message1.basic.thumbnail = "doc"
|
426
|
-
message1.basic.layout = "default"
|
427
|
-
message1.basic.metadata.useful = True
|
428
|
-
message1.basic.metadata.language = "es"
|
429
|
-
message1.basic.created.FromDatetime(datetime.now())
|
430
|
-
message1.basic.modified.FromDatetime(datetime.now())
|
431
|
-
message1.origin.source = rpb.Origin.Source.WEB
|
432
|
-
|
433
|
-
message1.files["file"].file.uri = "http://nofile"
|
434
|
-
message1.files["file"].file.size = 0
|
435
|
-
message1.files["file"].file.source = rpb.CloudFile.Source.LOCAL
|
436
|
-
|
437
|
-
etw = rpb.ExtractedTextWrapper()
|
438
|
-
etw.body.text = "My own text Ramon. This is great to be here. \n Where is my beer?"
|
439
|
-
etw.field.field = "file"
|
440
|
-
etw.field.field_type = rpb.FieldType.FILE
|
441
|
-
message1.extracted_text.append(etw)
|
442
|
-
|
443
|
-
etw = rpb.ExtractedTextWrapper()
|
444
|
-
etw.body.text = "Summary of document"
|
445
|
-
etw.field.field = "summary"
|
446
|
-
etw.field.field_type = rpb.FieldType.GENERIC
|
447
|
-
message1.extracted_text.append(etw)
|
448
|
-
|
449
|
-
etw = rpb.ExtractedTextWrapper()
|
450
|
-
etw.body.text = "Title Resource"
|
451
|
-
etw.field.field = "title"
|
452
|
-
etw.field.field_type = rpb.FieldType.GENERIC
|
453
|
-
message1.extracted_text.append(etw)
|
454
|
-
|
455
|
-
fcm = rpb.FieldComputedMetadataWrapper()
|
456
|
-
fcm.field.field = "file"
|
457
|
-
fcm.field.field_type = rpb.FieldType.FILE
|
458
|
-
p1 = rpb.Paragraph(
|
459
|
-
start=0,
|
460
|
-
end=45,
|
461
|
-
)
|
462
|
-
p1.start_seconds.append(0)
|
463
|
-
p1.end_seconds.append(10)
|
464
|
-
p2 = rpb.Paragraph(
|
465
|
-
start=47,
|
466
|
-
end=64,
|
467
|
-
)
|
468
|
-
p2.start_seconds.append(10)
|
469
|
-
p2.end_seconds.append(20)
|
470
|
-
p2.start_seconds.append(20)
|
471
|
-
p2.end_seconds.append(30)
|
472
|
-
|
473
|
-
fcm.metadata.metadata.paragraphs.append(p1)
|
474
|
-
fcm.metadata.metadata.paragraphs.append(p2)
|
475
|
-
fcm.metadata.metadata.last_index.FromDatetime(datetime.now())
|
476
|
-
fcm.metadata.metadata.last_understanding.FromDatetime(datetime.now())
|
477
|
-
fcm.metadata.metadata.last_extract.FromDatetime(datetime.now())
|
478
|
-
fcm.metadata.metadata.ner["Ramon"] = "PERSON"
|
479
|
-
|
480
|
-
c1 = rpb.Classification()
|
481
|
-
c1.label = "label1"
|
482
|
-
c1.labelset = "labelset1"
|
483
|
-
fcm.metadata.metadata.classifications.append(c1)
|
484
|
-
message1.field_metadata.append(fcm)
|
485
|
-
|
486
|
-
ev = rpb.ExtractedVectorsWrapper()
|
487
|
-
ev.field.field = "file"
|
488
|
-
ev.field.field_type = rpb.FieldType.FILE
|
489
|
-
|
490
|
-
v1 = rpb.Vector()
|
491
|
-
v1.start = 0
|
492
|
-
v1.end = 19
|
493
|
-
v1.start_paragraph = 0
|
494
|
-
v1.end_paragraph = 45
|
495
|
-
v1.vector.extend(V1)
|
496
|
-
ev.vectors.vectors.vectors.append(v1)
|
497
|
-
|
498
|
-
v2 = rpb.Vector()
|
499
|
-
v2.start = 20
|
500
|
-
v2.end = 45
|
501
|
-
v2.start_paragraph = 0
|
502
|
-
v2.end_paragraph = 45
|
503
|
-
v2.vector.extend(V2)
|
504
|
-
ev.vectors.vectors.vectors.append(v2)
|
505
|
-
|
506
|
-
v3 = rpb.Vector()
|
507
|
-
v3.start = 48
|
508
|
-
v3.end = 65
|
509
|
-
v3.start_paragraph = 47
|
510
|
-
v3.end_paragraph = 64
|
511
|
-
v3.vector.extend(V3)
|
512
|
-
ev.vectors.vectors.vectors.append(v3)
|
513
|
-
|
514
|
-
message1.field_vectors.append(ev)
|
515
|
-
message1.source = BrokerMessage.MessageSource.WRITER
|
516
|
-
return message1
|
517
|
-
|
518
|
-
|
519
|
-
async def create_resource(
|
520
|
-
storage: Storage, driver: Driver, knowledgebox_ingest: str
|
521
|
-
) -> Resource:
|
522
|
-
txn = await driver.begin()
|
523
|
-
|
524
|
-
rid = str(uuid.uuid4())
|
525
|
-
kb_obj = KnowledgeBox(txn, storage, kbid=knowledgebox_ingest)
|
526
|
-
test_resource = await kb_obj.add_resource(uuid=rid, slug="slug")
|
527
|
-
await test_resource.set_slug()
|
528
|
-
|
529
|
-
# 1. ROOT ELEMENTS
|
530
|
-
# 1.1 BASIC
|
531
|
-
|
532
|
-
basic = rpb.Basic(
|
533
|
-
title="My title",
|
534
|
-
summary="My summary",
|
535
|
-
icon="text/plain",
|
536
|
-
layout="basic",
|
537
|
-
thumbnail="/file",
|
538
|
-
last_seqid=1,
|
539
|
-
last_account_seq=2,
|
540
|
-
)
|
541
|
-
basic.metadata.metadata["key"] = "value"
|
542
|
-
basic.metadata.language = "ca"
|
543
|
-
basic.metadata.useful = True
|
544
|
-
basic.metadata.status = rpb.Metadata.Status.PROCESSED
|
545
|
-
|
546
|
-
cl1 = rpb.Classification(labelset="labelset1", label="label1")
|
547
|
-
basic.usermetadata.classifications.append(cl1)
|
548
|
-
|
549
|
-
r1 = upb.Relation(
|
550
|
-
relation=upb.Relation.CHILD,
|
551
|
-
source=upb.RelationNode(value=rid, ntype=upb.RelationNode.NodeType.RESOURCE),
|
552
|
-
to=upb.RelationNode(value="000001", ntype=upb.RelationNode.NodeType.RESOURCE),
|
553
|
-
)
|
554
|
-
|
555
|
-
basic.usermetadata.relations.append(r1)
|
556
|
-
|
557
|
-
ufm1 = rpb.UserFieldMetadata(
|
558
|
-
token=[rpb.TokenSplit(token="My home", klass="Location")],
|
559
|
-
field=rpb.FieldID(field_type=rpb.FieldType.TEXT, field="text1"),
|
560
|
-
)
|
561
|
-
|
562
|
-
basic.fieldmetadata.append(ufm1)
|
563
|
-
basic.created.FromDatetime(datetime.utcnow())
|
564
|
-
basic.modified.FromDatetime(datetime.utcnow())
|
565
|
-
|
566
|
-
await test_resource.set_basic(basic)
|
567
|
-
|
568
|
-
# 1.2 RELATIONS
|
569
|
-
|
570
|
-
rels = []
|
571
|
-
r1 = upb.Relation(
|
572
|
-
relation=upb.Relation.CHILD,
|
573
|
-
source=upb.RelationNode(value=rid, ntype=upb.RelationNode.NodeType.RESOURCE),
|
574
|
-
to=upb.RelationNode(value="000001", ntype=upb.RelationNode.NodeType.RESOURCE),
|
575
|
-
)
|
576
|
-
|
577
|
-
rels.append(r1)
|
578
|
-
await test_resource.set_relations(rels)
|
579
|
-
|
580
|
-
# 1.3 ORIGIN
|
581
|
-
|
582
|
-
o2 = rpb.Origin()
|
583
|
-
o2.source = rpb.Origin.Source.API
|
584
|
-
o2.source_id = "My Source"
|
585
|
-
o2.created.FromDatetime(datetime.now())
|
586
|
-
o2.modified.FromDatetime(datetime.now())
|
587
|
-
|
588
|
-
await test_resource.set_origin(o2)
|
589
|
-
|
590
|
-
# 2. FIELDS
|
591
|
-
#
|
592
|
-
# Add an example of each of the files, containing all possible metadata
|
593
|
-
|
594
|
-
# Title
|
595
|
-
title_field = await test_resource.get_field(
|
596
|
-
"title", rpb.FieldType.GENERIC, load=False
|
597
|
-
)
|
598
|
-
await make_field(title_field, "MyText")
|
599
|
-
|
600
|
-
# Summary
|
601
|
-
summary_field = await test_resource.get_field(
|
602
|
-
"summary", rpb.FieldType.GENERIC, load=False
|
603
|
-
)
|
604
|
-
await make_field(summary_field, "MyText")
|
605
|
-
|
606
|
-
# 2.1 FILE FIELD
|
607
|
-
|
608
|
-
t2 = rpb.FieldFile(
|
609
|
-
language="es",
|
610
|
-
)
|
611
|
-
t2.added.FromDatetime(datetime.now())
|
612
|
-
t2.file.CopyFrom(TEST_CLOUDFILE)
|
613
|
-
|
614
|
-
file_field = await test_resource.set_field(rpb.FieldType.FILE, "file1", t2)
|
615
|
-
await add_field_id(test_resource, file_field)
|
616
|
-
await make_field(file_field, "MyText")
|
617
|
-
|
618
|
-
# 2.2 LINK FIELD
|
619
|
-
li2 = rpb.FieldLink(
|
620
|
-
uri="htts://nuclia.cloud",
|
621
|
-
language="ca",
|
622
|
-
)
|
623
|
-
li2.added.FromDatetime(datetime.now())
|
624
|
-
li2.headers["AUTHORIZATION"] = "Bearer xxxxx"
|
625
|
-
linkfield = await test_resource.set_field(rpb.FieldType.LINK, "link1", li2)
|
626
|
-
|
627
|
-
ex1 = rpb.LinkExtractedData()
|
628
|
-
ex1.date.FromDatetime(datetime.now())
|
629
|
-
ex1.language = "ca"
|
630
|
-
ex1.title = "My Title"
|
631
|
-
ex1.field = "link1"
|
632
|
-
|
633
|
-
ex1.link_preview.CopyFrom(THUMBNAIL)
|
634
|
-
ex1.link_thumbnail.CopyFrom(THUMBNAIL)
|
635
|
-
|
636
|
-
await linkfield.set_link_extracted_data(ex1)
|
637
|
-
await add_field_id(test_resource, linkfield)
|
638
|
-
await make_field(linkfield, "MyText")
|
639
|
-
|
640
|
-
# 2.3 TEXT FIELDS
|
641
|
-
|
642
|
-
t23 = rpb.FieldText(body="This is my text field", format=rpb.FieldText.Format.PLAIN)
|
643
|
-
textfield = await test_resource.set_field(rpb.FieldType.TEXT, "text1", t23)
|
644
|
-
await add_field_id(test_resource, textfield)
|
645
|
-
await make_field(textfield, "MyText")
|
646
|
-
|
647
|
-
# 2.4 LAYOUT FIELD
|
648
|
-
|
649
|
-
l2 = rpb.FieldLayout(format=rpb.FieldLayout.Format.NUCLIAv1)
|
650
|
-
l2.body.blocks["field1"].x = 0
|
651
|
-
l2.body.blocks["field1"].y = 0
|
652
|
-
l2.body.blocks["field1"].cols = 1
|
653
|
-
l2.body.blocks["field1"].rows = 1
|
654
|
-
l2.body.blocks["field1"].type = rpb.Block.TypeBlock.TITLE
|
655
|
-
l2.body.blocks["field1"].payload = "{}"
|
656
|
-
l2.body.blocks["field1"].file.CopyFrom(TEST_CLOUDFILE)
|
657
|
-
|
658
|
-
layoutfield = await test_resource.set_field(rpb.FieldType.LAYOUT, "layout1", l2)
|
659
|
-
await add_field_id(test_resource, layoutfield)
|
660
|
-
|
661
|
-
await layoutfield.set_extracted_text(
|
662
|
-
make_extracted_text(layoutfield.id, body="MyText")
|
663
|
-
)
|
664
|
-
await layoutfield.set_field_metadata(make_field_metadata(layoutfield.id))
|
665
|
-
await layoutfield.set_large_field_metadata(
|
666
|
-
make_field_large_metadata(layoutfield.id)
|
667
|
-
)
|
668
|
-
await layoutfield.set_vectors(make_extracted_vectors(layoutfield.id))
|
669
|
-
|
670
|
-
# 2.5 CONVERSATION FIELD
|
671
|
-
|
672
|
-
def make_message(
|
673
|
-
text: str, files: Optional[list[rpb.CloudFile]] = None
|
674
|
-
) -> rpb.Message:
|
675
|
-
msg = rpb.Message(
|
676
|
-
who="myself",
|
677
|
-
)
|
678
|
-
msg.timestamp.FromDatetime(datetime.now())
|
679
|
-
msg.content.text = text
|
680
|
-
msg.content.format = rpb.MessageContent.Format.PLAIN
|
681
|
-
|
682
|
-
if files:
|
683
|
-
for file in files:
|
684
|
-
msg.content.attachments.append(file)
|
685
|
-
return msg
|
686
|
-
|
687
|
-
c2 = rpb.Conversation()
|
688
|
-
|
689
|
-
for i in range(300):
|
690
|
-
new_message = make_message(f"{i} hello")
|
691
|
-
if i == 33:
|
692
|
-
new_message = make_message(f"{i} hello", files=[TEST_CLOUDFILE, THUMBNAIL])
|
693
|
-
c2.messages.append(new_message)
|
694
|
-
|
695
|
-
convfield = await test_resource.set_field(rpb.FieldType.CONVERSATION, "conv1", c2)
|
696
|
-
await add_field_id(test_resource, convfield)
|
697
|
-
await make_field(convfield, extracted_text="MyText")
|
698
|
-
|
699
|
-
# 2.6 KEYWORDSET FIELD
|
700
|
-
|
701
|
-
k2 = rpb.FieldKeywordset(
|
702
|
-
keywords=[rpb.Keyword(value="kw1"), rpb.Keyword(value="kw2")]
|
703
|
-
)
|
704
|
-
kws_field = await test_resource.set_field(
|
705
|
-
rpb.FieldType.KEYWORDSET, "keywordset1", k2
|
706
|
-
)
|
707
|
-
await add_field_id(test_resource, kws_field)
|
708
|
-
await make_field(kws_field, "MyText")
|
709
|
-
|
710
|
-
# 2.7 DATETIMES FIELD
|
711
|
-
|
712
|
-
d2 = rpb.FieldDatetime()
|
713
|
-
d2.value.FromDatetime(datetime.now())
|
714
|
-
datetime_field = await test_resource.set_field(
|
715
|
-
rpb.FieldType.DATETIME, "datetime1", d2
|
716
|
-
)
|
717
|
-
await add_field_id(test_resource, datetime_field)
|
718
|
-
await make_field(datetime_field, "MyText")
|
719
|
-
|
720
|
-
field_obj = await test_resource.get_field("datetime1", type=rpb.FieldType.DATETIME)
|
721
|
-
|
722
|
-
# Q/A
|
723
|
-
question_answers = rpb.FieldQuestionAnswerWrapper()
|
724
|
-
for i in range(10):
|
725
|
-
qa = rpb.QuestionAnswer()
|
726
|
-
|
727
|
-
qa.question.text = f"My question {i}"
|
728
|
-
qa.question.language = "catalan"
|
729
|
-
qa.question.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
|
730
|
-
|
731
|
-
answer = rpb.Answers()
|
732
|
-
answer.text = f"My answer {i}"
|
733
|
-
answer.language = "catalan"
|
734
|
-
answer.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
|
735
|
-
qa.answers.append(answer)
|
736
|
-
question_answers.question_answers.question_answer.append(qa)
|
737
|
-
|
738
|
-
await field_obj.set_question_answers(question_answers)
|
739
|
-
|
740
|
-
await txn.commit()
|
741
|
-
return test_resource
|
742
|
-
|
743
|
-
|
744
|
-
async def add_field_id(resource: Resource, field: Field):
|
745
|
-
field_type = KB_REVERSE[field.type]
|
746
|
-
field_id = rpb.FieldID(field_type=field_type, field=field.id)
|
747
|
-
await resource.update_all_field_ids(updated=[field_id])
|
748
|
-
|
749
|
-
|
750
|
-
@pytest.fixture(scope="function")
|
751
|
-
async def entities_manager_mock():
|
752
|
-
"""EntitiesManager mock for ingest gRPC API disabling indexed entities
|
753
|
-
functionality. As tests doesn't startup a node, with this mock we allow
|
754
|
-
testing ingest's gRPC API while the whole entities functionality is properly
|
755
|
-
tested in tests nos using this fixture.
|
756
|
-
|
757
|
-
"""
|
758
|
-
klass = "nucliadb.ingest.service.writer.EntitiesManager"
|
759
|
-
with patch(f"{klass}.get_indexed_entities_group", AsyncMock(return_value=None)):
|
760
|
-
with patch(
|
761
|
-
"nucliadb.common.cluster.manager.KBShardManager.apply_for_all_shards",
|
762
|
-
AsyncMock(return_value=[]),
|
763
|
-
):
|
764
|
-
yield
|