nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -1,140 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
import asyncio
|
22
|
-
from unittest.mock import AsyncMock, MagicMock, patch
|
23
|
-
|
24
|
-
import pytest
|
25
|
-
from nucliadb_protos.writer_pb2 import Notification, ShardObject, Shards
|
26
|
-
|
27
|
-
from nucliadb.common.cluster.settings import settings
|
28
|
-
from nucliadb.ingest.consumer import shard_creator
|
29
|
-
from nucliadb_protos import nodereader_pb2
|
30
|
-
|
31
|
-
pytestmark = pytest.mark.asyncio
|
32
|
-
|
33
|
-
|
34
|
-
@pytest.fixture()
|
35
|
-
def pubsub():
|
36
|
-
mock = AsyncMock()
|
37
|
-
mock.parse = lambda x: x
|
38
|
-
yield mock
|
39
|
-
|
40
|
-
|
41
|
-
@pytest.fixture()
|
42
|
-
def reader():
|
43
|
-
yield AsyncMock()
|
44
|
-
|
45
|
-
|
46
|
-
@pytest.fixture()
|
47
|
-
def kbdm():
|
48
|
-
mock = MagicMock()
|
49
|
-
mock.get_model_metadata = AsyncMock(return_value="model")
|
50
|
-
with patch("nucliadb.common.cluster.manager.datamanagers.kb", return_value=mock):
|
51
|
-
yield mock
|
52
|
-
|
53
|
-
|
54
|
-
@pytest.fixture()
|
55
|
-
def shard_manager(reader):
|
56
|
-
sm = MagicMock()
|
57
|
-
node = MagicMock(reader=reader)
|
58
|
-
shards = Shards(shards=[ShardObject(read_only=False)], actual=0)
|
59
|
-
sm.get_current_active_shard = AsyncMock(return_value=shards.shards[0])
|
60
|
-
sm.maybe_create_new_shard = AsyncMock()
|
61
|
-
with (
|
62
|
-
patch(
|
63
|
-
"nucliadb.ingest.consumer.shard_creator.get_shard_manager", return_value=sm
|
64
|
-
),
|
65
|
-
patch(
|
66
|
-
"nucliadb.ingest.consumer.shard_creator.choose_node",
|
67
|
-
return_value=(node, "shard_id"),
|
68
|
-
),
|
69
|
-
patch(
|
70
|
-
"nucliadb.ingest.consumer.shard_creator.locking.distributed_lock",
|
71
|
-
return_value=AsyncMock(),
|
72
|
-
),
|
73
|
-
):
|
74
|
-
yield sm
|
75
|
-
|
76
|
-
|
77
|
-
@pytest.fixture()
|
78
|
-
async def shard_creator_handler(pubsub, shard_manager):
|
79
|
-
sc = shard_creator.ShardCreatorHandler(
|
80
|
-
driver=AsyncMock(transaction=MagicMock(return_value=AsyncMock())),
|
81
|
-
storage=AsyncMock(),
|
82
|
-
pubsub=pubsub,
|
83
|
-
check_delay=0.05,
|
84
|
-
)
|
85
|
-
await sc.initialize()
|
86
|
-
yield sc
|
87
|
-
await sc.finalize()
|
88
|
-
|
89
|
-
|
90
|
-
async def test_handle_message_create_new_shard(
|
91
|
-
shard_creator_handler: shard_creator.ShardCreatorHandler,
|
92
|
-
reader,
|
93
|
-
kbdm,
|
94
|
-
shard_manager,
|
95
|
-
):
|
96
|
-
reader.GetShard.return_value = nodereader_pb2.Shard(
|
97
|
-
paragraphs=settings.max_shard_paragraphs + 1
|
98
|
-
)
|
99
|
-
|
100
|
-
notif = Notification(
|
101
|
-
kbid="kbid",
|
102
|
-
action=Notification.Action.INDEXED,
|
103
|
-
)
|
104
|
-
await shard_creator_handler.handle_message(notif.SerializeToString())
|
105
|
-
await asyncio.sleep(0.06)
|
106
|
-
shard_manager.maybe_create_new_shard.assert_called_with(
|
107
|
-
"kbid", settings.max_shard_paragraphs + 1
|
108
|
-
)
|
109
|
-
|
110
|
-
|
111
|
-
async def test_handle_message_do_not_create(
|
112
|
-
shard_creator_handler: shard_creator.ShardCreatorHandler, reader, shard_manager
|
113
|
-
):
|
114
|
-
reader.GetShard.return_value = nodereader_pb2.Shard(
|
115
|
-
paragraphs=settings.max_shard_paragraphs - 1
|
116
|
-
)
|
117
|
-
|
118
|
-
notif = Notification(
|
119
|
-
kbid="kbid",
|
120
|
-
action=Notification.Action.INDEXED,
|
121
|
-
)
|
122
|
-
await shard_creator_handler.handle_message(notif.SerializeToString())
|
123
|
-
|
124
|
-
await shard_creator_handler.finalize()
|
125
|
-
|
126
|
-
shard_manager.create_shard_by_kbid.assert_not_called()
|
127
|
-
|
128
|
-
|
129
|
-
async def test_handle_message_ignore_not_indexed(
|
130
|
-
shard_creator_handler: shard_creator.ShardCreatorHandler, shard_manager
|
131
|
-
):
|
132
|
-
notif = Notification(
|
133
|
-
kbid="kbid",
|
134
|
-
action=Notification.Action.COMMIT,
|
135
|
-
)
|
136
|
-
await shard_creator_handler.handle_message(notif.SerializeToString())
|
137
|
-
|
138
|
-
await shard_creator_handler.finalize()
|
139
|
-
|
140
|
-
shard_manager.create_shard_by_kbid.assert_not_called()
|
@@ -1,67 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
import asyncio
|
22
|
-
|
23
|
-
import pytest
|
24
|
-
|
25
|
-
from nucliadb.ingest.consumer import utils
|
26
|
-
|
27
|
-
pytestmark = pytest.mark.asyncio
|
28
|
-
|
29
|
-
|
30
|
-
async def test_delay_task_handler():
|
31
|
-
dth = utils.DelayedTaskHandler(0.05)
|
32
|
-
await dth.initialize()
|
33
|
-
|
34
|
-
counter = 0
|
35
|
-
|
36
|
-
async def handler():
|
37
|
-
await asyncio.sleep(0.1)
|
38
|
-
nonlocal counter
|
39
|
-
counter += 1
|
40
|
-
|
41
|
-
dth.schedule("key1", handler)
|
42
|
-
dth.schedule("key1", handler)
|
43
|
-
dth.schedule("key1", handler)
|
44
|
-
dth.schedule("key2", handler)
|
45
|
-
dth.schedule("key3", handler)
|
46
|
-
dth.schedule("key4", handler)
|
47
|
-
|
48
|
-
# all should be scheduled and duplicates ignored
|
49
|
-
assert len(dth.to_process) == 4
|
50
|
-
|
51
|
-
await asyncio.sleep(0.06)
|
52
|
-
# they should all be running now
|
53
|
-
assert len(dth.outstanding_tasks) == 4
|
54
|
-
|
55
|
-
# schedule a couple more
|
56
|
-
dth.schedule("key1", handler) # duplicate key, should get rescheduled at end
|
57
|
-
dth.schedule("key5", handler)
|
58
|
-
dth.schedule("key6", handler)
|
59
|
-
|
60
|
-
await asyncio.sleep(0.1)
|
61
|
-
# original set should be finished now
|
62
|
-
assert counter == 4
|
63
|
-
|
64
|
-
# finish everything now
|
65
|
-
await dth.finalize()
|
66
|
-
|
67
|
-
assert counter == 7
|
@@ -1,19 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
@@ -1,247 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
from uuid import uuid4
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
from nucliadb_protos.noderesources_pb2 import Resource as PBResource
|
24
|
-
from nucliadb_protos.resources_pb2 import (
|
25
|
-
Basic,
|
26
|
-
ExtractedText,
|
27
|
-
FieldComputedMetadataWrapper,
|
28
|
-
FieldID,
|
29
|
-
FieldType,
|
30
|
-
Metadata,
|
31
|
-
Paragraph,
|
32
|
-
Sentence,
|
33
|
-
)
|
34
|
-
|
35
|
-
from nucliadb.ingest.orm.brain import ParagraphPages, ResourceBrain
|
36
|
-
from nucliadb_protos import resources_pb2
|
37
|
-
|
38
|
-
|
39
|
-
def test_apply_field_metadata_marks_duplicated_paragraphs():
|
40
|
-
# Simulate a field with two paragraphs that contain the same text
|
41
|
-
br = ResourceBrain(rid=str(uuid4()))
|
42
|
-
field_key = "text1"
|
43
|
-
fcmw = FieldComputedMetadataWrapper()
|
44
|
-
fcmw.field.CopyFrom(FieldID(field_type=FieldType.TEXT, field=field_key))
|
45
|
-
paragraph = "Some paragraph here. "
|
46
|
-
text_1 = f"{paragraph}{paragraph}"
|
47
|
-
first_occurrence = [0, len(paragraph)]
|
48
|
-
second_occurrence = [len(paragraph), len(paragraph) * 2]
|
49
|
-
|
50
|
-
et = ExtractedText(text=text_1)
|
51
|
-
p1 = Paragraph(start=first_occurrence[0], end=first_occurrence[1])
|
52
|
-
p1.sentences.append(
|
53
|
-
Sentence(start=first_occurrence[0], end=first_occurrence[1], key="test")
|
54
|
-
)
|
55
|
-
p2 = Paragraph(start=second_occurrence[0], end=second_occurrence[1])
|
56
|
-
p2.sentences.append(
|
57
|
-
Sentence(start=second_occurrence[0], end=second_occurrence[1], key="test")
|
58
|
-
)
|
59
|
-
fcmw.metadata.metadata.paragraphs.append(p1)
|
60
|
-
fcmw.metadata.metadata.paragraphs.append(p2)
|
61
|
-
|
62
|
-
br.apply_field_metadata(
|
63
|
-
field_key,
|
64
|
-
fcmw.metadata,
|
65
|
-
replace_field=[],
|
66
|
-
replace_splits={},
|
67
|
-
page_positions={},
|
68
|
-
extracted_text=et,
|
69
|
-
)
|
70
|
-
|
71
|
-
assert len(br.brain.paragraphs[field_key].paragraphs) == 2
|
72
|
-
for key, paragraph in br.brain.paragraphs[field_key].paragraphs.items():
|
73
|
-
if f"{first_occurrence[0]}-{first_occurrence[1]}" in key:
|
74
|
-
# Only the first time that a paragraph is found should be set to false
|
75
|
-
assert paragraph.repeated_in_field is False
|
76
|
-
else:
|
77
|
-
assert paragraph.repeated_in_field is True
|
78
|
-
|
79
|
-
|
80
|
-
def test_apply_field_metadata_marks_duplicated_paragraphs_on_split_metadata():
|
81
|
-
# # Test now the split text path
|
82
|
-
br = ResourceBrain(rid=str(uuid4()))
|
83
|
-
field_key = "text1"
|
84
|
-
split_key = "subfield"
|
85
|
-
fcmw = FieldComputedMetadataWrapper()
|
86
|
-
fcmw.field.CopyFrom(FieldID(field_type=FieldType.TEXT, field=field_key))
|
87
|
-
paragraph = "Some paragraph here. "
|
88
|
-
text_1 = f"{paragraph}{paragraph}"
|
89
|
-
first_occurrence = [0, len(paragraph)]
|
90
|
-
second_occurrence = [len(paragraph), len(paragraph) * 2]
|
91
|
-
|
92
|
-
et = ExtractedText()
|
93
|
-
et.split_text[split_key] = text_1
|
94
|
-
p1 = Paragraph(start=first_occurrence[0], end=first_occurrence[1])
|
95
|
-
p1.sentences.append(
|
96
|
-
Sentence(start=first_occurrence[0], end=first_occurrence[1], key="test")
|
97
|
-
)
|
98
|
-
p2 = Paragraph(start=second_occurrence[0], end=second_occurrence[1])
|
99
|
-
p2.sentences.append(
|
100
|
-
Sentence(start=second_occurrence[0], end=second_occurrence[1], key="test")
|
101
|
-
)
|
102
|
-
fcmw.metadata.split_metadata[split_key].paragraphs.append(p1)
|
103
|
-
fcmw.metadata.split_metadata[split_key].paragraphs.append(p2)
|
104
|
-
|
105
|
-
br.apply_field_metadata(
|
106
|
-
field_key,
|
107
|
-
fcmw.metadata,
|
108
|
-
replace_field=[],
|
109
|
-
replace_splits={},
|
110
|
-
page_positions={},
|
111
|
-
extracted_text=et,
|
112
|
-
)
|
113
|
-
|
114
|
-
assert len(br.brain.paragraphs[field_key].paragraphs) == 2
|
115
|
-
for key, paragraph in br.brain.paragraphs[field_key].paragraphs.items():
|
116
|
-
if f"{first_occurrence[0]}-{first_occurrence[1]}" in key:
|
117
|
-
# Only the first time that a paragraph is found should be set to false
|
118
|
-
assert paragraph.repeated_in_field is False
|
119
|
-
else:
|
120
|
-
assert paragraph.repeated_in_field is True
|
121
|
-
|
122
|
-
|
123
|
-
def test_get_page_number():
|
124
|
-
page_numbers = ParagraphPages(
|
125
|
-
{
|
126
|
-
0: (0, 99),
|
127
|
-
1: (100, 199),
|
128
|
-
2: (200, 299),
|
129
|
-
}
|
130
|
-
)
|
131
|
-
assert page_numbers.get(10) == 0
|
132
|
-
assert page_numbers.get(100) == 1
|
133
|
-
assert page_numbers.get(500) == 2
|
134
|
-
|
135
|
-
|
136
|
-
@pytest.mark.parametrize(
|
137
|
-
"new_status,previous_status,expected_brain_status",
|
138
|
-
[
|
139
|
-
# No previous_status
|
140
|
-
(Metadata.Status.PENDING, None, PBResource.PENDING),
|
141
|
-
(Metadata.Status.PROCESSED, None, PBResource.PROCESSED),
|
142
|
-
(Metadata.Status.ERROR, None, PBResource.PROCESSED),
|
143
|
-
(Metadata.Status.BLOCKED, None, PBResource.PROCESSED),
|
144
|
-
(Metadata.Status.EXPIRED, None, PBResource.PROCESSED),
|
145
|
-
# previous_status = PENDING
|
146
|
-
(Metadata.Status.PENDING, Metadata.Status.PENDING, PBResource.PENDING),
|
147
|
-
(Metadata.Status.PROCESSED, Metadata.Status.PENDING, PBResource.PROCESSED),
|
148
|
-
(Metadata.Status.ERROR, Metadata.Status.PENDING, PBResource.PROCESSED),
|
149
|
-
(Metadata.Status.BLOCKED, Metadata.Status.PENDING, PBResource.PROCESSED),
|
150
|
-
(Metadata.Status.EXPIRED, Metadata.Status.PENDING, PBResource.PROCESSED),
|
151
|
-
# previous_status = PROCESSED
|
152
|
-
(Metadata.Status.PROCESSED, Metadata.Status.PROCESSED, PBResource.PROCESSED),
|
153
|
-
(Metadata.Status.ERROR, Metadata.Status.PROCESSED, PBResource.PROCESSED),
|
154
|
-
(Metadata.Status.BLOCKED, Metadata.Status.PROCESSED, PBResource.PROCESSED),
|
155
|
-
(Metadata.Status.PENDING, Metadata.Status.PROCESSED, PBResource.PROCESSED),
|
156
|
-
(Metadata.Status.EXPIRED, Metadata.Status.PROCESSED, PBResource.PROCESSED),
|
157
|
-
# previous_status = ERROR
|
158
|
-
(Metadata.Status.PENDING, Metadata.Status.ERROR, PBResource.PROCESSED),
|
159
|
-
(Metadata.Status.PROCESSED, Metadata.Status.ERROR, PBResource.PROCESSED),
|
160
|
-
(Metadata.Status.ERROR, Metadata.Status.ERROR, PBResource.PROCESSED),
|
161
|
-
(Metadata.Status.BLOCKED, Metadata.Status.ERROR, PBResource.PROCESSED),
|
162
|
-
(Metadata.Status.EXPIRED, Metadata.Status.ERROR, PBResource.PROCESSED),
|
163
|
-
# previous_status = BLOCKED
|
164
|
-
(Metadata.Status.PENDING, Metadata.Status.BLOCKED, PBResource.PROCESSED),
|
165
|
-
(Metadata.Status.PROCESSED, Metadata.Status.BLOCKED, PBResource.PROCESSED),
|
166
|
-
(Metadata.Status.ERROR, Metadata.Status.BLOCKED, PBResource.PROCESSED),
|
167
|
-
(Metadata.Status.BLOCKED, Metadata.Status.BLOCKED, PBResource.PROCESSED),
|
168
|
-
(Metadata.Status.EXPIRED, Metadata.Status.BLOCKED, PBResource.PROCESSED),
|
169
|
-
# previous_status = EXPIRED
|
170
|
-
(Metadata.Status.PENDING, Metadata.Status.EXPIRED, PBResource.PROCESSED),
|
171
|
-
(Metadata.Status.PROCESSED, Metadata.Status.EXPIRED, PBResource.PROCESSED),
|
172
|
-
(Metadata.Status.ERROR, Metadata.Status.EXPIRED, PBResource.PROCESSED),
|
173
|
-
(Metadata.Status.BLOCKED, Metadata.Status.EXPIRED, PBResource.PROCESSED),
|
174
|
-
(Metadata.Status.EXPIRED, Metadata.Status.EXPIRED, PBResource.PROCESSED),
|
175
|
-
],
|
176
|
-
)
|
177
|
-
def test_set_processing_status(new_status, previous_status, expected_brain_status):
|
178
|
-
br = ResourceBrain(rid="foo")
|
179
|
-
basic = Basic()
|
180
|
-
basic.metadata.status = new_status
|
181
|
-
br.set_processing_status(basic, previous_status)
|
182
|
-
assert br.brain.status == expected_brain_status
|
183
|
-
|
184
|
-
|
185
|
-
def test_apply_field_metadata_populates_page_number():
|
186
|
-
br = ResourceBrain(rid="foo")
|
187
|
-
field_key = "text1"
|
188
|
-
|
189
|
-
fcmw = FieldComputedMetadataWrapper()
|
190
|
-
fcmw.field.CopyFrom(FieldID(field_type=FieldType.TEXT, field=field_key))
|
191
|
-
|
192
|
-
p1 = Paragraph(
|
193
|
-
start=40, end=54, start_seconds=[0], end_seconds=[10], text="Some text here"
|
194
|
-
)
|
195
|
-
p1.sentences.append(Sentence(start=40, end=54, key="test"))
|
196
|
-
fcmw.metadata.metadata.paragraphs.append(p1)
|
197
|
-
|
198
|
-
# Add it to the split too
|
199
|
-
fcmw.metadata.split_metadata["subfield"].paragraphs.append(p1)
|
200
|
-
|
201
|
-
page_positions = {
|
202
|
-
0: (0, 20),
|
203
|
-
1: (21, 39),
|
204
|
-
2: (40, 100),
|
205
|
-
}
|
206
|
-
br.apply_field_metadata(
|
207
|
-
field_key,
|
208
|
-
fcmw.metadata,
|
209
|
-
replace_field=[],
|
210
|
-
replace_splits={},
|
211
|
-
page_positions=page_positions,
|
212
|
-
extracted_text=None,
|
213
|
-
)
|
214
|
-
|
215
|
-
assert len(br.brain.paragraphs[field_key].paragraphs) == 2
|
216
|
-
for paragraph in br.brain.paragraphs[field_key].paragraphs.values():
|
217
|
-
assert paragraph.metadata.position.page_number == 2
|
218
|
-
assert paragraph.metadata.position.start == 40
|
219
|
-
assert paragraph.metadata.position.end == 54
|
220
|
-
assert paragraph.metadata.position.start_seconds == [0]
|
221
|
-
assert paragraph.metadata.position.end_seconds == [10]
|
222
|
-
|
223
|
-
|
224
|
-
def test_set_resource_metadata_promotes_origin_dates():
|
225
|
-
resource_brain = ResourceBrain("rid")
|
226
|
-
basic = Basic()
|
227
|
-
basic.created.seconds = 1
|
228
|
-
basic.modified.seconds = 2
|
229
|
-
origin = resources_pb2.Origin()
|
230
|
-
origin.created.seconds = 3
|
231
|
-
origin.modified.seconds = 4
|
232
|
-
|
233
|
-
resource_brain.set_resource_metadata(basic, origin)
|
234
|
-
|
235
|
-
assert resource_brain.brain.metadata.created.seconds == 3
|
236
|
-
assert resource_brain.brain.metadata.modified.seconds == 4
|
237
|
-
|
238
|
-
|
239
|
-
def test_set_resource_metadata_handles_timestamp_not_present():
|
240
|
-
resource_brain = ResourceBrain("rid")
|
241
|
-
basic = Basic()
|
242
|
-
resource_brain.set_resource_metadata(basic, None)
|
243
|
-
created = resource_brain.brain.metadata.created.seconds
|
244
|
-
modified = resource_brain.brain.metadata.modified.seconds
|
245
|
-
assert created > 0
|
246
|
-
assert modified > 0
|
247
|
-
assert modified >= created
|
@@ -1,74 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
#
|
20
|
-
|
21
|
-
import uuid
|
22
|
-
|
23
|
-
from nucliadb.ingest.orm.brain import FIELD_PARAGRAPH_ID, FIELD_VECTOR_ID, ResourceBrain
|
24
|
-
from nucliadb_protos import utils_pb2
|
25
|
-
|
26
|
-
|
27
|
-
def test_apply_field_vectors_for_matryoshka_embeddings():
|
28
|
-
STORED_VECTOR_DIMENSION = 100
|
29
|
-
MATRYOSHKA_DIMENSION = 10
|
30
|
-
|
31
|
-
rid = uuid.uuid4().hex
|
32
|
-
field_id = uuid.uuid4().hex
|
33
|
-
vectors = utils_pb2.VectorObject(
|
34
|
-
vectors=utils_pb2.Vectors(
|
35
|
-
vectors=[
|
36
|
-
utils_pb2.Vector(
|
37
|
-
start=0,
|
38
|
-
end=10,
|
39
|
-
start_paragraph=0,
|
40
|
-
end_paragraph=10,
|
41
|
-
vector=[1.0] * STORED_VECTOR_DIMENSION,
|
42
|
-
)
|
43
|
-
]
|
44
|
-
)
|
45
|
-
)
|
46
|
-
paragraph_key = FIELD_PARAGRAPH_ID.format(
|
47
|
-
rid=rid,
|
48
|
-
field_id=field_id,
|
49
|
-
paragraph_start=0,
|
50
|
-
paragraph_end=10,
|
51
|
-
)
|
52
|
-
vector_key = FIELD_VECTOR_ID.format(
|
53
|
-
rid=rid,
|
54
|
-
field_id=field_id,
|
55
|
-
index=0,
|
56
|
-
vector_start=0,
|
57
|
-
vector_end=10,
|
58
|
-
)
|
59
|
-
|
60
|
-
brain = ResourceBrain(rid=rid)
|
61
|
-
brain.apply_field_vectors(field_id, vectors, matryoshka_vector_dimension=None)
|
62
|
-
vector = (
|
63
|
-
brain.brain.paragraphs[field_id].paragraphs[paragraph_key].sentences[vector_key]
|
64
|
-
)
|
65
|
-
assert len(vector.vector) == STORED_VECTOR_DIMENSION
|
66
|
-
|
67
|
-
brain = ResourceBrain(rid=rid)
|
68
|
-
brain.apply_field_vectors(
|
69
|
-
field_id, vectors, matryoshka_vector_dimension=MATRYOSHKA_DIMENSION
|
70
|
-
)
|
71
|
-
vector = (
|
72
|
-
brain.brain.paragraphs[field_id].paragraphs[paragraph_key].sentences[vector_key]
|
73
|
-
)
|
74
|
-
assert len(vector.vector) == MATRYOSHKA_DIMENSION
|
@@ -1,131 +0,0 @@
|
|
1
|
-
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
-
#
|
3
|
-
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
-
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
-
#
|
6
|
-
# AGPL:
|
7
|
-
# This program is free software: you can redistribute it and/or modify
|
8
|
-
# it under the terms of the GNU Affero General Public License as
|
9
|
-
# published by the Free Software Foundation, either version 3 of the
|
10
|
-
# License, or (at your option) any later version.
|
11
|
-
#
|
12
|
-
# This program is distributed in the hope that it will be useful,
|
13
|
-
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
-
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
-
# GNU Affero General Public License for more details.
|
16
|
-
#
|
17
|
-
# You should have received a copy of the GNU Affero General Public License
|
18
|
-
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
-
|
20
|
-
from unittest.mock import AsyncMock, MagicMock, Mock, patch
|
21
|
-
|
22
|
-
import pytest
|
23
|
-
|
24
|
-
from nucliadb.common.cluster.settings import settings as cluster_settings
|
25
|
-
from nucliadb.ingest.orm.exceptions import ResourceNotIndexable
|
26
|
-
from nucliadb.ingest.orm.processor import Processor, validate_indexable_resource
|
27
|
-
from nucliadb_protos import noderesources_pb2
|
28
|
-
|
29
|
-
|
30
|
-
@pytest.fixture()
|
31
|
-
def txn():
|
32
|
-
yield AsyncMock()
|
33
|
-
|
34
|
-
|
35
|
-
@pytest.fixture()
|
36
|
-
def driver(txn):
|
37
|
-
mock = MagicMock()
|
38
|
-
mock.transaction.return_value.__aenter__.return_value = txn
|
39
|
-
yield mock
|
40
|
-
|
41
|
-
|
42
|
-
@pytest.fixture()
|
43
|
-
def sm():
|
44
|
-
mock = AsyncMock()
|
45
|
-
mock.add_resource = AsyncMock()
|
46
|
-
with patch("nucliadb.ingest.orm.processor.get_shard_manager", return_value=mock):
|
47
|
-
yield mock
|
48
|
-
|
49
|
-
|
50
|
-
@pytest.fixture()
|
51
|
-
def processor(driver, sm):
|
52
|
-
yield Processor(driver, None)
|
53
|
-
|
54
|
-
|
55
|
-
@pytest.fixture()
|
56
|
-
def resource():
|
57
|
-
mock = MagicMock()
|
58
|
-
mock.set_basic = AsyncMock()
|
59
|
-
yield mock
|
60
|
-
|
61
|
-
|
62
|
-
@pytest.fixture()
|
63
|
-
def kb():
|
64
|
-
mock = MagicMock(kbid="kbid")
|
65
|
-
mock.get_resource_shard_id = AsyncMock()
|
66
|
-
mock.get_resource_shard = AsyncMock()
|
67
|
-
yield mock
|
68
|
-
|
69
|
-
|
70
|
-
async def test_commit_slug(processor: Processor, txn, resource):
|
71
|
-
another_txn = Mock()
|
72
|
-
resource.txn = another_txn
|
73
|
-
resource.set_slug = AsyncMock()
|
74
|
-
|
75
|
-
await processor.commit_slug(resource)
|
76
|
-
|
77
|
-
resource.set_slug.assert_awaited_once()
|
78
|
-
txn.commit.assert_awaited_once()
|
79
|
-
assert resource.txn is another_txn
|
80
|
-
|
81
|
-
|
82
|
-
async def test_mark_resource_error(processor: Processor, txn, resource, kb, sm):
|
83
|
-
await processor._mark_resource_error(kb, resource, partition="partition", seqid=1)
|
84
|
-
txn.commit.assert_called_once()
|
85
|
-
resource.set_basic.assert_awaited_once()
|
86
|
-
sm.add_resource.assert_awaited_once_with(
|
87
|
-
kb.get_resource_shard.return_value,
|
88
|
-
resource.indexer.brain,
|
89
|
-
1,
|
90
|
-
partition="partition",
|
91
|
-
kb="kbid",
|
92
|
-
)
|
93
|
-
|
94
|
-
|
95
|
-
async def test_mark_resource_error_handle_error(
|
96
|
-
processor: Processor, kb, resource, txn
|
97
|
-
):
|
98
|
-
resource.set_basic.side_effect = Exception("test")
|
99
|
-
await processor._mark_resource_error(kb, resource, partition="partition", seqid=1)
|
100
|
-
txn.commit.assert_not_called()
|
101
|
-
|
102
|
-
|
103
|
-
async def test_mark_resource_error_skip_no_shard(
|
104
|
-
processor: Processor, resource, driver, kb, txn
|
105
|
-
):
|
106
|
-
kb.get_resource_shard.return_value = None
|
107
|
-
await processor._mark_resource_error(kb, resource, partition="partition", seqid=1)
|
108
|
-
txn.commit.assert_not_called()
|
109
|
-
|
110
|
-
|
111
|
-
async def test_mark_resource_error_skip_no_resource(
|
112
|
-
processor: Processor, kb, driver, txn
|
113
|
-
):
|
114
|
-
await processor._mark_resource_error(kb, None, partition="partition", seqid=1)
|
115
|
-
txn.commit.assert_not_called()
|
116
|
-
|
117
|
-
|
118
|
-
def test_validate_indexable_resource():
|
119
|
-
resource = noderesources_pb2.Resource()
|
120
|
-
resource.paragraphs["test"].paragraphs["test"].sentences["test"].vector.append(1.0)
|
121
|
-
validate_indexable_resource(resource)
|
122
|
-
|
123
|
-
|
124
|
-
def test_validate_indexable_resource_throws_error_for_max():
|
125
|
-
resource = noderesources_pb2.Resource()
|
126
|
-
for i in range(cluster_settings.max_resource_paragraphs + 1):
|
127
|
-
resource.paragraphs["test"].paragraphs[f"test{i}"].sentences[
|
128
|
-
"test"
|
129
|
-
].vector.append(1.0)
|
130
|
-
with pytest.raises(ResourceNotIndexable):
|
131
|
-
validate_indexable_resource(resource)
|