nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,684 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
import base64
|
21
|
-
import uuid
|
22
|
-
from datetime import datetime
|
23
|
-
from os.path import dirname, getsize
|
24
|
-
from unittest.mock import patch
|
25
|
-
from uuid import uuid4
|
26
|
-
|
27
|
-
import nats
|
28
|
-
import pytest
|
29
|
-
from nats.aio.client import Client
|
30
|
-
from nats.js import JetStreamContext
|
31
|
-
from nucliadb_protos.audit_pb2 import AuditField, AuditRequest
|
32
|
-
from nucliadb_protos.resources_pb2 import (
|
33
|
-
TEXT,
|
34
|
-
Answers,
|
35
|
-
Classification,
|
36
|
-
CloudFile,
|
37
|
-
Entity,
|
38
|
-
ExtractedTextWrapper,
|
39
|
-
ExtractedVectorsWrapper,
|
40
|
-
FieldComputedMetadataWrapper,
|
41
|
-
FieldID,
|
42
|
-
FieldQuestionAnswerWrapper,
|
43
|
-
FieldType,
|
44
|
-
FileExtractedData,
|
45
|
-
LargeComputedMetadataWrapper,
|
46
|
-
)
|
47
|
-
from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
|
48
|
-
from nucliadb_protos.resources_pb2 import Origin, Paragraph, QuestionAnswer
|
49
|
-
from nucliadb_protos.utils_pb2 import Vector
|
50
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
51
|
-
|
52
|
-
from nucliadb.common.maindb.driver import Driver
|
53
|
-
from nucliadb.ingest import SERVICE_NAME
|
54
|
-
from nucliadb.ingest.consumer.auditing import (
|
55
|
-
IndexAuditHandler,
|
56
|
-
ResourceWritesAuditHandler,
|
57
|
-
)
|
58
|
-
from nucliadb.ingest.orm.exceptions import DeadletteredError
|
59
|
-
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
60
|
-
from nucliadb.ingest.orm.processor import Processor
|
61
|
-
from nucliadb.ingest.orm.resource import Resource
|
62
|
-
from nucliadb_utils.audit.stream import StreamAuditStorage
|
63
|
-
from nucliadb_utils.storages.storage import Storage
|
64
|
-
from nucliadb_utils.utilities import Utility, get_indexing, get_storage, set_utility
|
65
|
-
|
66
|
-
EXAMPLE_VECTOR = base64.b64decode(
|
67
|
-
"k05VTVBZAQB2AHsnZGVzY3InOiAnPGY0JywgJ2ZvcnRyYW5fb3JkZXInOiBGYWxzZSwgJ3NoYXBlJzogKDc2OCwpLCB9ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogKIq+lgYUvvf7bTx39oo+fFaXPbx2a71RfiK/FBroPVTawr4UEm2+AgtuvWzTTT5ipjg9JflLvMdfub6fBIE+d7gmvnJPl75alUQ+n68Hv2BYJL20+74+bHDsPV/wTT63h4E9hes9PqxHXT6h/J079HfVPF/7BD3BSMO+PuA9PjJhtD4W6pq+rmwjPp+Fzz6xUfa+FMZtOutIBT4mPik+EbsAPyRePb41IVW+i+0RPT7GtL51USY+GRjRvjWD1z4+wq+9j9kqvmq/074hHBM+kh+ZPoRfmb6R0yi/kuirvlcqLj+Ss64+0cMBP2UKsD2LtpI9927BvtCfHb5KY7U+8s64vkcGX778+NY+2pMxPNowJD7R39u+dbmfPqbrL73bIby+Nbu8voH3kr4gps6+f3L6PuJFAb3PFWA+99BPPjkLzD0vc8m79JmtvWYnbL6W+6A+WUWEveVVED0V0h8+3zPWvv19Dr2igdC9JcGRPV568z41ZVu8mRxRvdkBQr73JHO+PFxkvtHatLzVgN49NEgav0l7ab276hK+ABMDvrRrJj4akbO++zFnPRzXoDyecdi+pGq4viUgiL4XXwK+tvcOPivvgD7PV0w+D7CwPmfoiL0REec+tsx1Pe2xkD6S9Jm+ZW09P1Obiz2Ov/Q+OtsBP8Xicj7WJpi9szGJvqvWvz4hFqG++ZuGvIAmMb0r+T2+wj9RPgZN0z7KwGI+ezogPgI78D6aUrW8etzkPHpSqb7c4Sg+b6BZvXlSrr6un6a8uUCrvhbBgb7PtwA+CsSwvQzyz73G1eq+plYZP/6I7r6BRsu992/gPuIBJj9aT8u94saNvdIDG76Zar4+GeRxvncSZz3citO+ILq6vmS3D78JHk6/NdeIPWYQwb0WZJW9OnwJPhdIQL7Gta6+MZWevpRNvr0ZH/c9B//hPtNUlL1pWhu/VliNvshFjT6laVS9EpjovQBHdb4HWMe+e/rfvrcSDz620/I+krapPlnIDz5uR1Y+znjqPTFM+T1+kK8+VMcevDegSjvM7fw+e0yKPbDoVz56wk4+EeoGvnq3rT76dbW+ghE6vvos0b6CqQu+p6JDPvzn2bogOui+oZU5v6/Pvr4siDI9Kv6Dvt6TQj51LqW+qYLsPmyjZT45DkG9MQivPgIHBT/qeRW/ghXOPkcJtL6MwhA/9F5PvbR7Jr4ftKA+mdkePwm2A77WpNU+Ho/NvsWEfL75zPS9v8ycvtXFVD5ONFI7mVkOPlFd7bzacZK7aSyRPkRrhz6e8+k+glJ5Pq9mmD0X95y+APOjPveVBb9yOgM+DLlMPkqCRb7CKwW8N+TevpZtmD5lbpq+n+tdPr4+m7661Wg+gd66ve5dzr2ZH7k+x/aNPo+0Kz4PMMa+voMGv+ud8r4Nape892YZPWlDL76twQi/RC2QPk8juz1uTwC9yf3rvn8RmD5LO0e+7t5CvvYTbb5O8UA/yrZqO7aZib6FBEe+n/xAv08BGr15Vxs/FNIevkbN1r3f2Hw+oj18PJwOnb3SDpo+wf67vvy3sj6qvRM/BrljPtrlBr4w2Ck9Jh6fPv6Vn75qa7w+eWShvj6bYj56q46+x41KPvQtqb2qXVm+DTmTPpvXWz5hUnC9f7ptPAu1tsDAcUa+ckyGvTeaIz3FcaC9Zu/cvYvjzD7WUdQ+P2DFvrbdHz4CfVe+HxwAP3HZy775Q7w+eg+svcccuTwLBFW+QkVhPuSSjLymH6g+DFBKviDgWz0wxyK+1C+3PSKk975Hkxi8FKzVvRnykD5lFCa/bqnBPRACHL5uUS+/Zb3FvoK66j5CHUu+vq4TvkxWfT5wv7o+wW79PJHsrD42Aau+SuQFvdzUnz50dEe+qZNjvmZ1LLxvt529oeHDPsv3dT5O+z69vOoevm/1Cz5O7NU9i6uHPibkEr6g5d2+LobFPn+KAT/gLsY+2jm4vlpyhD48l4g+yqx3Pql7yr5sIYK+7awLPlnODb+3e7i+t9RVPQC99z6SQJk+lbXoPbyAI7mKcCu/4kX9vFuhtL61fhq+UjGgvYxSvDzCzfw+24xfvs+Sjr782jy+kTzaPmEqtD6sN2c9otXavSqTiT5hM/q+MjAFP4kflT5JOe280NUmvrQtkT4f55m9CyFwPr8GF7wNzBm+x05SvsFJtz0MG9w+HCf/vn4mkr7iMiw+DmhCPUDI/j3PrVe+glX3vlpDPz8ucKG+MexCPgoBDD+FMn68BMDnvCf+UT3bgq4+srqvvYF8H7+1VKq9qbQTvY1tBL8epwC85PUdviSEhD7hg7e9jMUzvVuFz71qCf29IudEPsAwH767q809fL0uvrk+Mr7OTVy9TNcWvhnV3T4hOwq/F/E3P4UOXz4Vade+fK8TP5v4sr4Amf8+HCqPvmYV7Lo3UMK+0urYPrSH3zw/8oq9tAHCvvs5GD91e6w9GsqJPNRo7j5ffH6+X++MvKFQxj17Es6+TA5OvW8tAz8C4nU8tiHDvm5FZL5Kv3O+fuZ0Php/9j0Gyua+mSKVvs+pDT8+TwC/qS8Gvl/z0r5iVLq+a8e/vIXlIT4r7Ty+dqrXPmn9Db4p4PS+Kv0nPfnVUz7avj0+KVOTPkG3Kz68dQa+LSKGvXnRvjxnzyM92moTvy9SnD4F9Dw+mWoyvXpXRD8nm7I9O245P6KlZT4zCxc+baKLPsyE0rw8YHO+coGePfcAYT300Jw+UoeUvlvHFD7CjpC+p9KpvlteKrvgzwG8Sbg2vn8NDz5MDtW99URGvoaaxb0svk+9+cajvUvAab1qXpS91FSbvszYlj6f9oI+Ge5yPDdVxr45qV2+WmuxPcx+qj5l88W9ApSIvsFrwT4GT8c+Vg/0PjkNT745ezC/9ogqPm7bE7/Wh1O9b7NrvlVU/j4u3ga9mv+xvaHTtD76O40+LIyTvssUDT73Q5y+QO5TvX7bgj3gY5S+YTSfvpYeIL6a+Y29CLmZvda6xz6cC9Q+9sQSPwnG+j3RS927zvaAvq7iLz0CqPw9Fir+vNr7VT5qEgM+yhqtupy5q76uVtE7eZ+Nvi/7h75rkLq9vOW0O7QhFj7JCbc+3tp7vlpEOT9+aPc+hwnnvkqLPr0Ry/4+8zOaPfE0O70OJ6I9eQlJvbAU/T0KcaK8gS2Kvulxdj0u2JY+u4mxPN4vXj7B6xQ+LjBLvuTgJ77vq7M7KbcIvnbIdD0UQd++ZyuHvlaAPr4SeMw++sRuvZ7sXz3yJ5O9cSmPvZ8mRL7X2JM9trN4PpzLt70C3Og94uwLv4pACb8LWoY9Uz+ZPvE1Ij4R8HG9JVyJvvFOZz6XkIU+had5PvoQKT7h3CK+IzATv1U3qrxUum68B1bDviBzhz7u5XI9KXwkPoszXr6en5I9VNxMPAKusT5XGTg8Ne9GvC6yBz/EidM+V8T8u3LO1D7qSJa+AlsUPeb9pb0vNFK+lFCevTGrR70aeSu+zihyvOLan77CaxE/5ZnaPUv8Nr/hBhs+oCZBPttGqr5ZrwO+O0DGPU7JOD7FxdK+pw6CPWumgz6VB7++Gjb1vq6Ns7uZ1FI9VmTLPsl2iz7h5YI8CJYXvh6MSz6ucvc9qx1bPovgpT7ZWyO+Z+d1vrXkrz3VC8s+dmievuxuHb7MOXE+ewUCvJcPuT6n2Rc8mQyYvl45Gr1ER3c9LCZYvmqQhb1lVJu+V1acPZp63z5Cfmu+4NFZPvmBJb6cmAI+J0U7PsLkSb16KrO9wj4JPo4Fq7563+09jAw8vkYbbD7/Z5q7TH1kvnJrLb1mqkS+R+a9vX0ODD4p9ak+un8VO6mSp71C66w+FlLVPr/0Wb0eLR2+AneHvVTFHD/P0X0+TsQ4vlWQQzzP8no6VtEOPHLiG78Foyg+Un5OP/fFeL3uVxc+C1VzP9IInL2Zbbo8bw2Lvt5f0b4LY9w9LyaMvIcBc70K3bs+9lz5vTSTC7770MG+B4dHvvRFSz3lO6w9ENACv5NLBz20vSk+MuMQPLQYZr/2+6o+gzANvXGTjL259Qy9ZUMKPnyCC7498ww8oGGSvouNujyvJVW+TjmIvvI8KT667mq9MC6fvVUcvz0=" # noqa
|
68
|
-
)
|
69
|
-
|
70
|
-
|
71
|
-
@pytest.fixture(autouse=True)
|
72
|
-
async def audit_consumers(storage, pubsub, stream_audit: StreamAuditStorage):
|
73
|
-
index_auditor = IndexAuditHandler(
|
74
|
-
audit=stream_audit,
|
75
|
-
pubsub=pubsub,
|
76
|
-
)
|
77
|
-
resource_writes_auditor = ResourceWritesAuditHandler(
|
78
|
-
storage=storage,
|
79
|
-
audit=stream_audit,
|
80
|
-
pubsub=pubsub,
|
81
|
-
)
|
82
|
-
|
83
|
-
await index_auditor.initialize()
|
84
|
-
await resource_writes_auditor.initialize()
|
85
|
-
yield
|
86
|
-
await index_auditor.finalize()
|
87
|
-
await resource_writes_auditor.finalize()
|
88
|
-
|
89
|
-
|
90
|
-
@pytest.fixture()
|
91
|
-
def kbid(
|
92
|
-
local_files,
|
93
|
-
storage: Storage,
|
94
|
-
txn,
|
95
|
-
cache,
|
96
|
-
fake_node,
|
97
|
-
processor,
|
98
|
-
knowledgebox_ingest,
|
99
|
-
):
|
100
|
-
yield knowledgebox_ingest
|
101
|
-
|
102
|
-
|
103
|
-
@pytest.mark.asyncio
|
104
|
-
async def test_ingest_messages_autocommit(kbid: str, processor):
|
105
|
-
rid = str(uuid.uuid4())
|
106
|
-
message1: BrokerMessage = BrokerMessage(
|
107
|
-
kbid=kbid,
|
108
|
-
uuid=rid,
|
109
|
-
slug="slug1",
|
110
|
-
type=BrokerMessage.AUTOCOMMIT,
|
111
|
-
)
|
112
|
-
filename = f"{dirname(__file__)}/assets/file.png"
|
113
|
-
cf1 = CloudFile(
|
114
|
-
uri="file.png",
|
115
|
-
source=CloudFile.Source.LOCAL,
|
116
|
-
bucket_name="/integration/ingest/assets",
|
117
|
-
size=getsize(filename),
|
118
|
-
content_type="image/png",
|
119
|
-
filename="file.png",
|
120
|
-
)
|
121
|
-
message1.basic.icon = "text/plain"
|
122
|
-
message1.basic.title = "Title Resource"
|
123
|
-
message1.basic.summary = "Summary of Document"
|
124
|
-
message1.basic.thumbnail = "doc"
|
125
|
-
message1.basic.layout = "default"
|
126
|
-
message1.basic.metadata.language = "es"
|
127
|
-
message1.basic.created.FromDatetime(datetime.now())
|
128
|
-
message1.basic.modified.FromDatetime(datetime.now())
|
129
|
-
message1.origin.source = Origin.Source.WEB
|
130
|
-
message1.files["file"].file.CopyFrom(cf1)
|
131
|
-
|
132
|
-
fed = FileExtractedData()
|
133
|
-
fed.file_pages_previews.pages.append(cf1)
|
134
|
-
fed.language = "ca"
|
135
|
-
fed.md5 = "asdsadsad"
|
136
|
-
fed.metadata["key1"] = "ca"
|
137
|
-
fed.nested["key2"] = "ca"
|
138
|
-
fed.file_generated["subfile1"].CopyFrom(cf1)
|
139
|
-
fed.file_preview.CopyFrom(cf1)
|
140
|
-
fed.file_thumbnail.CopyFrom(cf1)
|
141
|
-
message1.file_extracted_data.append(fed)
|
142
|
-
|
143
|
-
etw = ExtractedTextWrapper()
|
144
|
-
etw.body.text = "My own text"
|
145
|
-
etw.field.field = "file"
|
146
|
-
etw.field.field_type = FieldType.FILE
|
147
|
-
message1.extracted_text.append(etw)
|
148
|
-
etw = ExtractedTextWrapper()
|
149
|
-
etw.body.text = "My summary"
|
150
|
-
etw.field.field = "summary"
|
151
|
-
etw.field.field_type = FieldType.GENERIC
|
152
|
-
message1.extracted_text.append(etw)
|
153
|
-
|
154
|
-
fcm = FieldComputedMetadataWrapper()
|
155
|
-
fcm.field.field = "file"
|
156
|
-
fcm.field.field_type = FieldType.FILE
|
157
|
-
p1 = Paragraph(
|
158
|
-
start=1,
|
159
|
-
end=20,
|
160
|
-
)
|
161
|
-
fcm.metadata.metadata.paragraphs.append(p1)
|
162
|
-
fcm.metadata.metadata.last_index.FromDatetime(datetime.now())
|
163
|
-
fcm.metadata.metadata.last_understanding.FromDatetime(datetime.now())
|
164
|
-
fcm.metadata.metadata.last_extract.FromDatetime(datetime.now())
|
165
|
-
fcm.metadata.metadata.ner["Ramon"] = "PERSON"
|
166
|
-
|
167
|
-
c1 = Classification()
|
168
|
-
c1.label = "label1"
|
169
|
-
c1.labelset = "labelset1"
|
170
|
-
fcm.metadata.metadata.classifications.append(c1)
|
171
|
-
message1.field_metadata.append(fcm)
|
172
|
-
|
173
|
-
lcmw = LargeComputedMetadataWrapper()
|
174
|
-
lcmw.field.field = "file"
|
175
|
-
lcmw.field.field_type = FieldType.FILE
|
176
|
-
lcmw.real.metadata.tokens["asd"] = 4
|
177
|
-
lcmw.real.metadata.entities.append(Entity(token="token", root="tok", type="PERSON"))
|
178
|
-
message1.field_large_metadata.append(lcmw)
|
179
|
-
|
180
|
-
ev = ExtractedVectorsWrapper()
|
181
|
-
ev.field.field = "file"
|
182
|
-
ev.field.field_type = FieldType.FILE
|
183
|
-
v1 = Vector(
|
184
|
-
start=1, end=10, start_paragraph=1, end_paragraph=20, vector=EXAMPLE_VECTOR
|
185
|
-
)
|
186
|
-
ev.vectors.vectors.vectors.append(v1)
|
187
|
-
message1.field_vectors.append(ev)
|
188
|
-
|
189
|
-
message1.source = BrokerMessage.MessageSource.WRITER
|
190
|
-
await processor.process(message=message1, seqid=1)
|
191
|
-
|
192
|
-
index = get_indexing()
|
193
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
194
|
-
|
195
|
-
pb = await storage.get_indexing(index._calls[0][1])
|
196
|
-
assert pb.texts["a/summary"].text == "My summary" # type: ignore
|
197
|
-
|
198
|
-
pb = await storage.get_indexing(index._calls[1][1])
|
199
|
-
assert pb.texts["a/summary"].text == "My summary" # type: ignore
|
200
|
-
|
201
|
-
|
202
|
-
@pytest.mark.asyncio
|
203
|
-
async def test_ingest_error_message(
|
204
|
-
kbid: str, storage: Storage, processor, maindb_driver: Driver
|
205
|
-
):
|
206
|
-
filename = f"{dirname(__file__)}/assets/resource.pb"
|
207
|
-
with open(filename, "r") as f:
|
208
|
-
data = base64.b64decode(f.read())
|
209
|
-
message0: BrokerMessage = BrokerMessage()
|
210
|
-
message0.ParseFromString(data)
|
211
|
-
message0.kbid = kbid
|
212
|
-
message0.source = BrokerMessage.MessageSource.WRITER
|
213
|
-
|
214
|
-
await processor.process(message=message0, seqid=1)
|
215
|
-
|
216
|
-
filename = f"{dirname(__file__)}/assets/error.pb"
|
217
|
-
with open(filename, "r") as f:
|
218
|
-
data = base64.b64decode(f.read())
|
219
|
-
message1: BrokerMessage = BrokerMessage()
|
220
|
-
message1.ParseFromString(data)
|
221
|
-
message1.kbid = kbid
|
222
|
-
message1.ClearField("field_vectors")
|
223
|
-
message1.source = BrokerMessage.MessageSource.WRITER
|
224
|
-
|
225
|
-
await processor.process(message=message1, seqid=2)
|
226
|
-
|
227
|
-
async with maindb_driver.transaction() as txn:
|
228
|
-
kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
|
229
|
-
r = await kb_obj.get(message1.uuid)
|
230
|
-
assert r is not None
|
231
|
-
field_obj = await r.get_field("wikipedia_ml", TEXT)
|
232
|
-
ext1 = await field_obj.get_extracted_text()
|
233
|
-
lfm1 = await field_obj.get_large_field_metadata()
|
234
|
-
fm1 = await field_obj.get_field_metadata()
|
235
|
-
basic = await r.get_basic()
|
236
|
-
assert basic is not None
|
237
|
-
assert basic.slug == message1.slug
|
238
|
-
assert basic.summary == message0.basic.summary
|
239
|
-
|
240
|
-
assert ext1.text == message1.extracted_text[0].body.text
|
241
|
-
|
242
|
-
assert lfm1 is not None
|
243
|
-
assert fm1 is not None
|
244
|
-
assert field_obj.value.body == message0.texts["wikipedia_ml"].body
|
245
|
-
|
246
|
-
|
247
|
-
@pytest.mark.asyncio
|
248
|
-
async def test_ingest_messages_origin(
|
249
|
-
local_files,
|
250
|
-
storage: Storage,
|
251
|
-
fake_node,
|
252
|
-
processor,
|
253
|
-
knowledgebox_ingest,
|
254
|
-
):
|
255
|
-
rid = "43ece3e4-b706-4c74-b41b-3637f6d28197"
|
256
|
-
message1: BrokerMessage = BrokerMessage(
|
257
|
-
kbid=knowledgebox_ingest,
|
258
|
-
uuid=rid,
|
259
|
-
slug="slug1",
|
260
|
-
type=BrokerMessage.AUTOCOMMIT,
|
261
|
-
)
|
262
|
-
message1.source = BrokerMessage.MessageSource.WRITER
|
263
|
-
await processor.process(message=message1, seqid=1)
|
264
|
-
|
265
|
-
async with processor.driver.transaction() as txn:
|
266
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
267
|
-
kb = KnowledgeBox(txn, storage, knowledgebox_ingest)
|
268
|
-
res = Resource(txn, storage, kb, rid)
|
269
|
-
origin = await res.get_origin()
|
270
|
-
|
271
|
-
# should not be set
|
272
|
-
assert origin is None
|
273
|
-
|
274
|
-
# now set the origin
|
275
|
-
message1.origin.CopyFrom(
|
276
|
-
Origin(
|
277
|
-
source=Origin.Source.API,
|
278
|
-
filename="file.png",
|
279
|
-
url="http://www.google.com",
|
280
|
-
)
|
281
|
-
)
|
282
|
-
await processor.process(message=message1, seqid=2)
|
283
|
-
|
284
|
-
async with processor.driver.transaction() as txn:
|
285
|
-
kb = KnowledgeBox(txn, storage, knowledgebox_ingest)
|
286
|
-
res = Resource(txn, storage, kb, rid)
|
287
|
-
origin = await res.get_origin()
|
288
|
-
|
289
|
-
assert origin is not None
|
290
|
-
assert origin.url == "http://www.google.com"
|
291
|
-
assert origin.source == Origin.Source.API
|
292
|
-
assert origin.filename == "file.png"
|
293
|
-
|
294
|
-
|
295
|
-
def add_filefields(message, items=None):
|
296
|
-
items = items or []
|
297
|
-
for fieldid, filename in items:
|
298
|
-
file_path = f"{dirname(__file__)}/assets/{filename}"
|
299
|
-
cf1 = CloudFile(
|
300
|
-
uri=filename,
|
301
|
-
source=CloudFile.Source.LOCAL,
|
302
|
-
bucket_name="/integration/ingest/assets",
|
303
|
-
size=getsize(file_path),
|
304
|
-
content_type="application/octet-stream",
|
305
|
-
filename=filename,
|
306
|
-
)
|
307
|
-
message.files[fieldid].file.CopyFrom(cf1)
|
308
|
-
|
309
|
-
|
310
|
-
def add_textfields(message, items=None):
|
311
|
-
items = items or []
|
312
|
-
for fieldid in items:
|
313
|
-
message.texts[fieldid].body = "some random text"
|
314
|
-
|
315
|
-
|
316
|
-
def make_message(
|
317
|
-
kbid: str, rid: str, slug: str = "resource", message_type=BrokerMessage.AUTOCOMMIT
|
318
|
-
):
|
319
|
-
message: BrokerMessage = BrokerMessage(
|
320
|
-
kbid=kbid,
|
321
|
-
uuid=rid,
|
322
|
-
slug=slug,
|
323
|
-
type=message_type,
|
324
|
-
)
|
325
|
-
message.basic.icon = "text/plain"
|
326
|
-
message.basic.title = "Title Resource"
|
327
|
-
message.basic.summary = "Summary of document"
|
328
|
-
message.basic.thumbnail = "doc"
|
329
|
-
message.basic.layout = "default"
|
330
|
-
message.basic.metadata.language = "es"
|
331
|
-
message.basic.created.FromDatetime(datetime.now())
|
332
|
-
message.basic.modified.FromDatetime(datetime.now())
|
333
|
-
message.origin.source = Origin.Source.WEB
|
334
|
-
|
335
|
-
return message
|
336
|
-
|
337
|
-
|
338
|
-
async def get_audit_messages(sub):
|
339
|
-
msg = await sub.fetch(1)
|
340
|
-
auditreq = AuditRequest()
|
341
|
-
auditreq.ParseFromString(msg[0].data)
|
342
|
-
return auditreq
|
343
|
-
|
344
|
-
|
345
|
-
@pytest.mark.asyncio
|
346
|
-
async def test_ingest_audit_stream_files_only(
|
347
|
-
local_files,
|
348
|
-
storage: Storage,
|
349
|
-
txn,
|
350
|
-
cache,
|
351
|
-
fake_node,
|
352
|
-
knowledgebox_ingest,
|
353
|
-
stream_processor,
|
354
|
-
stream_audit: StreamAuditStorage,
|
355
|
-
maindb_driver: Driver,
|
356
|
-
):
|
357
|
-
from nucliadb_utils.settings import audit_settings
|
358
|
-
|
359
|
-
# Prepare a test audit stream to receive our messages
|
360
|
-
partition = stream_audit.get_partition(knowledgebox_ingest)
|
361
|
-
client: Client = await nats.connect(stream_audit.nats_servers)
|
362
|
-
jetstream: JetStreamContext = client.jetstream()
|
363
|
-
if audit_settings.audit_jetstream_target is None:
|
364
|
-
assert False, "Missing jetstream target in audit settings"
|
365
|
-
subject = audit_settings.audit_jetstream_target.format(
|
366
|
-
partition=partition, type="*"
|
367
|
-
)
|
368
|
-
try:
|
369
|
-
await jetstream.delete_stream(name=audit_settings.audit_stream)
|
370
|
-
except nats.js.errors.NotFoundError:
|
371
|
-
pass
|
372
|
-
await jetstream.add_stream(name=audit_settings.audit_stream, subjects=[subject])
|
373
|
-
psub = await jetstream.pull_subscribe(subject, "psub")
|
374
|
-
|
375
|
-
rid = str(uuid.uuid4())
|
376
|
-
|
377
|
-
# We use the same file multiple times, so the size will be the same
|
378
|
-
test_png_size = getsize(f"{dirname(__file__)}/assets/file.png")
|
379
|
-
test_text_size = getsize(f"{dirname(__file__)}/assets/text.pb")
|
380
|
-
test_vectors_size = getsize(f"{dirname(__file__)}/assets/vectors.pb")
|
381
|
-
|
382
|
-
#
|
383
|
-
# Test 1: add a resource with some files
|
384
|
-
#
|
385
|
-
message = make_message(knowledgebox_ingest, rid)
|
386
|
-
add_filefields(
|
387
|
-
message,
|
388
|
-
[("file_1", "file.png"), ("file_2", "text.pb"), ("file_3", "vectors.pb")],
|
389
|
-
)
|
390
|
-
await stream_processor.process(message=message, seqid=1)
|
391
|
-
|
392
|
-
auditreq = await get_audit_messages(psub)
|
393
|
-
|
394
|
-
# Minimal assert to make sure we get the information from the node on the audit
|
395
|
-
# gets from the sidecar to the audit report when adding or modifying a resource
|
396
|
-
# The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
|
397
|
-
|
398
|
-
assert auditreq.kbid == knowledgebox_ingest
|
399
|
-
assert auditreq.rid == rid
|
400
|
-
assert auditreq.type == AuditRequest.AuditType.NEW
|
401
|
-
|
402
|
-
try:
|
403
|
-
int(auditreq.trace_id)
|
404
|
-
except ValueError:
|
405
|
-
assert False, "Invalid trace ID"
|
406
|
-
|
407
|
-
audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
|
408
|
-
assert audit_by_fieldid["file_1"].action == AuditField.FieldAction.MODIFIED
|
409
|
-
assert audit_by_fieldid["file_1"].size == test_png_size
|
410
|
-
assert audit_by_fieldid["file_2"].action == AuditField.FieldAction.MODIFIED
|
411
|
-
assert audit_by_fieldid["file_2"].size == test_text_size
|
412
|
-
assert audit_by_fieldid["file_3"].action == AuditField.FieldAction.MODIFIED
|
413
|
-
assert audit_by_fieldid["file_3"].size == test_vectors_size
|
414
|
-
|
415
|
-
#
|
416
|
-
# Test 2: delete one of the previous field on the same resource
|
417
|
-
#
|
418
|
-
|
419
|
-
message.files.clear()
|
420
|
-
fieldid = FieldID(field="file_1", field_type=FieldType.FILE)
|
421
|
-
message.delete_fields.append(fieldid)
|
422
|
-
|
423
|
-
await stream_processor.process(message=message, seqid=2)
|
424
|
-
auditreq = await get_audit_messages(psub)
|
425
|
-
|
426
|
-
# Minimal assert to make sure we get the information from the node on the audit
|
427
|
-
# gets from the sidecar to the audit report when adding or modifying a resource
|
428
|
-
# The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
|
429
|
-
|
430
|
-
assert auditreq.kbid == knowledgebox_ingest
|
431
|
-
assert auditreq.rid == rid
|
432
|
-
assert auditreq.type == AuditRequest.AuditType.MODIFIED
|
433
|
-
|
434
|
-
#
|
435
|
-
# Test 3: modify a file while adding and deleting other files
|
436
|
-
#
|
437
|
-
|
438
|
-
message = make_message(knowledgebox_ingest, rid)
|
439
|
-
add_filefields(message, [("file_2", "file.png"), ("file_4", "text.pb")])
|
440
|
-
fieldid = FieldID(field="file_3", field_type=FieldType.FILE)
|
441
|
-
message.delete_fields.append(fieldid)
|
442
|
-
|
443
|
-
await stream_processor.process(message=message, seqid=3)
|
444
|
-
auditreq = await get_audit_messages(psub)
|
445
|
-
|
446
|
-
# Minimal assert to make sure we get the information from the node on the audit
|
447
|
-
# gets from the sidecar to the audit report when adding or modifying a resource
|
448
|
-
# The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
|
449
|
-
|
450
|
-
assert auditreq.kbid == knowledgebox_ingest
|
451
|
-
assert auditreq.rid == rid
|
452
|
-
assert auditreq.type == AuditRequest.AuditType.MODIFIED
|
453
|
-
|
454
|
-
audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
|
455
|
-
assert audit_by_fieldid["file_2"].action == AuditField.FieldAction.MODIFIED
|
456
|
-
assert audit_by_fieldid["file_2"].size == test_png_size
|
457
|
-
assert audit_by_fieldid["file_4"].action == AuditField.FieldAction.MODIFIED
|
458
|
-
assert audit_by_fieldid["file_4"].size == test_text_size
|
459
|
-
assert audit_by_fieldid["file_3"].action == AuditField.FieldAction.DELETED
|
460
|
-
assert audit_by_fieldid["file_3"].size == 0
|
461
|
-
|
462
|
-
#
|
463
|
-
# Test 4: delete resource
|
464
|
-
#
|
465
|
-
|
466
|
-
message = make_message(
|
467
|
-
knowledgebox_ingest, rid, message_type=BrokerMessage.MessageType.DELETE
|
468
|
-
)
|
469
|
-
await stream_processor.process(message=message, seqid=4)
|
470
|
-
auditreq = await get_audit_messages(psub)
|
471
|
-
|
472
|
-
assert auditreq.type == AuditRequest.AuditType.DELETED
|
473
|
-
|
474
|
-
# Test 5: Delete knowledgebox
|
475
|
-
|
476
|
-
async with maindb_driver.transaction() as txn:
|
477
|
-
set_utility(Utility.AUDIT, stream_audit)
|
478
|
-
await KnowledgeBox.delete_kb(txn, knowledgebox_ingest) # type: ignore
|
479
|
-
|
480
|
-
auditreq = await get_audit_messages(psub)
|
481
|
-
assert auditreq.kbid == knowledgebox_ingest
|
482
|
-
assert auditreq.type == AuditRequest.AuditType.KB_DELETED
|
483
|
-
|
484
|
-
try:
|
485
|
-
int(auditreq.trace_id)
|
486
|
-
except ValueError:
|
487
|
-
assert False, "Invalid trace ID"
|
488
|
-
|
489
|
-
# Currently where not updating audit counters on delete operations
|
490
|
-
assert not auditreq.HasField("kb_counter")
|
491
|
-
|
492
|
-
await client.drain()
|
493
|
-
await client.close()
|
494
|
-
|
495
|
-
|
496
|
-
@pytest.mark.asyncio
|
497
|
-
async def test_qa(
|
498
|
-
local_files,
|
499
|
-
storage: Storage,
|
500
|
-
cache,
|
501
|
-
fake_node,
|
502
|
-
stream_processor,
|
503
|
-
stream_audit: StreamAuditStorage,
|
504
|
-
test_resource: Resource,
|
505
|
-
):
|
506
|
-
kbid = test_resource.kb.kbid
|
507
|
-
rid = test_resource.uuid
|
508
|
-
driver = stream_processor.driver
|
509
|
-
message = make_message(kbid, rid)
|
510
|
-
message.account_seq = 2
|
511
|
-
message.files["qa"].file.uri = "http://something"
|
512
|
-
message.files["qa"].file.size = 123
|
513
|
-
message.files["qa"].file.source = CloudFile.Source.LOCAL
|
514
|
-
|
515
|
-
qaw = FieldQuestionAnswerWrapper()
|
516
|
-
qaw.field.field_type = FieldType.FILE
|
517
|
-
qaw.field.field = "qa"
|
518
|
-
|
519
|
-
for i in range(10):
|
520
|
-
qa = QuestionAnswer()
|
521
|
-
|
522
|
-
qa.question.text = f"My question {i}"
|
523
|
-
qa.question.language = "catalan"
|
524
|
-
qa.question.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
|
525
|
-
|
526
|
-
answer = Answers()
|
527
|
-
answer.text = f"My answer {i}"
|
528
|
-
answer.language = "catalan"
|
529
|
-
answer.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
|
530
|
-
qa.answers.append(answer)
|
531
|
-
qaw.question_answers.question_answer.append(qa)
|
532
|
-
|
533
|
-
message.question_answers.append(qaw)
|
534
|
-
|
535
|
-
await stream_processor.process(message=message, seqid=1)
|
536
|
-
|
537
|
-
async with driver.transaction() as txn:
|
538
|
-
kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
|
539
|
-
r = await kb_obj.get(message.uuid)
|
540
|
-
assert r is not None
|
541
|
-
res = await r.get_field(key="qa", type=FieldType.FILE)
|
542
|
-
res_qa = await res.get_question_answers()
|
543
|
-
|
544
|
-
assert qaw.question_answers == res_qa
|
545
|
-
|
546
|
-
# delete op
|
547
|
-
message = make_message(kbid, rid, message_type=BrokerMessage.MessageType.DELETE)
|
548
|
-
await stream_processor.process(message=message, seqid=2)
|
549
|
-
|
550
|
-
|
551
|
-
@pytest.mark.asyncio
|
552
|
-
async def test_ingest_audit_stream_mixed(
|
553
|
-
local_files,
|
554
|
-
storage: Storage,
|
555
|
-
cache,
|
556
|
-
fake_node,
|
557
|
-
stream_processor,
|
558
|
-
stream_audit: StreamAuditStorage,
|
559
|
-
test_resource: Resource,
|
560
|
-
):
|
561
|
-
from nucliadb_utils.settings import audit_settings
|
562
|
-
|
563
|
-
kbid = test_resource.kb.kbid
|
564
|
-
rid = test_resource.uuid
|
565
|
-
# Prepare a test audit stream to receive our messages
|
566
|
-
partition = stream_audit.get_partition(kbid)
|
567
|
-
client: Client = await nats.connect(stream_audit.nats_servers)
|
568
|
-
jetstream: JetStreamContext = client.jetstream()
|
569
|
-
if audit_settings.audit_jetstream_target is None:
|
570
|
-
assert False, "Missing jetstream target in audit settings"
|
571
|
-
subject = audit_settings.audit_jetstream_target.format(
|
572
|
-
partition=partition, type="*"
|
573
|
-
)
|
574
|
-
try:
|
575
|
-
await jetstream.delete_stream(name=audit_settings.audit_stream)
|
576
|
-
except nats.js.errors.NotFoundError:
|
577
|
-
pass
|
578
|
-
await jetstream.add_stream(name=audit_settings.audit_stream, subjects=[subject])
|
579
|
-
psub = await jetstream.pull_subscribe(subject, "psub")
|
580
|
-
|
581
|
-
#
|
582
|
-
# Test 1: starting with a complete resource, do one of heac add, mod, del field
|
583
|
-
#
|
584
|
-
message = make_message(kbid, rid)
|
585
|
-
add_filefields(message, [("file_1", "file.png")])
|
586
|
-
add_textfields(message, ["text1"])
|
587
|
-
fieldid = FieldID(field="conv1", field_type=FieldType.CONVERSATION)
|
588
|
-
message.delete_fields.append(fieldid)
|
589
|
-
await stream_processor.process(message=message, seqid=1)
|
590
|
-
|
591
|
-
auditreq = await get_audit_messages(psub)
|
592
|
-
|
593
|
-
# Minimal assert to make sure we get the information from the node on the audit
|
594
|
-
# gets from the sidecar to the audit report when adding or modifying a resource
|
595
|
-
# The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
|
596
|
-
|
597
|
-
assert auditreq.kbid == kbid
|
598
|
-
assert auditreq.rid == rid
|
599
|
-
assert auditreq.type == AuditRequest.AuditType.MODIFIED
|
600
|
-
|
601
|
-
assert len(auditreq.fields_audit) == 4
|
602
|
-
audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
|
603
|
-
assert audit_by_fieldid["file_1"].action == AuditField.FieldAction.MODIFIED
|
604
|
-
assert audit_by_fieldid["text1"].action == AuditField.FieldAction.MODIFIED
|
605
|
-
assert audit_by_fieldid["conv1"].action == AuditField.FieldAction.DELETED
|
606
|
-
|
607
|
-
#
|
608
|
-
# Test 2: delete resource
|
609
|
-
#
|
610
|
-
|
611
|
-
message = make_message(kbid, rid, message_type=BrokerMessage.MessageType.DELETE)
|
612
|
-
await stream_processor.process(message=message, seqid=2)
|
613
|
-
auditreq = await get_audit_messages(psub)
|
614
|
-
|
615
|
-
assert auditreq.type == AuditRequest.AuditType.DELETED
|
616
|
-
|
617
|
-
await client.drain()
|
618
|
-
await client.close()
|
619
|
-
|
620
|
-
|
621
|
-
@pytest.mark.asyncio
|
622
|
-
async def test_ingest_account_seq_stored(
|
623
|
-
local_files,
|
624
|
-
storage: Storage,
|
625
|
-
fake_node,
|
626
|
-
stream_processor,
|
627
|
-
test_resource: Resource,
|
628
|
-
):
|
629
|
-
driver = stream_processor.driver
|
630
|
-
kbid = test_resource.kb.kbid
|
631
|
-
rid = test_resource.uuid
|
632
|
-
|
633
|
-
message = make_message(kbid, rid)
|
634
|
-
message.account_seq = 2
|
635
|
-
add_filefields(message, [("file_1", "file.png")])
|
636
|
-
await stream_processor.process(message=message, seqid=1)
|
637
|
-
|
638
|
-
async with driver.transaction() as txn:
|
639
|
-
kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
|
640
|
-
r = await kb_obj.get(message.uuid)
|
641
|
-
assert r is not None
|
642
|
-
basic = await r.get_basic()
|
643
|
-
|
644
|
-
assert basic is not None
|
645
|
-
assert basic.last_account_seq == 2
|
646
|
-
assert basic.queue == 0
|
647
|
-
|
648
|
-
|
649
|
-
@pytest.mark.asyncio
|
650
|
-
async def test_ingest_processor_handles_missing_kb(
|
651
|
-
local_files,
|
652
|
-
storage: Storage,
|
653
|
-
fake_node,
|
654
|
-
stream_processor,
|
655
|
-
test_resource: Resource,
|
656
|
-
):
|
657
|
-
kbid = str(uuid4())
|
658
|
-
rid = str(uuid4())
|
659
|
-
message = make_message(kbid, rid)
|
660
|
-
message.account_seq = 1
|
661
|
-
await stream_processor.process(message=message, seqid=1)
|
662
|
-
|
663
|
-
|
664
|
-
@pytest.mark.asyncio
|
665
|
-
async def test_ingest_autocommit_deadletter_marks_resource(
|
666
|
-
kbid: str, processor: Processor, storage, maindb_driver: Driver
|
667
|
-
):
|
668
|
-
rid = str(uuid.uuid4())
|
669
|
-
message = make_message(kbid, rid)
|
670
|
-
|
671
|
-
with (
|
672
|
-
patch.object(processor, "notify_commit") as mock_notify,
|
673
|
-
pytest.raises(DeadletteredError),
|
674
|
-
):
|
675
|
-
# cause an error to force deadletter handling
|
676
|
-
mock_notify.side_effect = Exception("test")
|
677
|
-
await processor.process(message=message, seqid=1)
|
678
|
-
|
679
|
-
async with maindb_driver.transaction() as txn:
|
680
|
-
kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
|
681
|
-
resource = await kb_obj.get(message.uuid)
|
682
|
-
|
683
|
-
mock_notify.assert_called_once()
|
684
|
-
assert resource.basic.metadata.status == PBMetadata.Status.ERROR # type: ignore
|