nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/ingest/serialize.py
CHANGED
@@ -23,28 +23,23 @@ from typing import Optional
|
|
23
23
|
import nucliadb_models as models
|
24
24
|
from nucliadb.common.maindb.driver import Transaction
|
25
25
|
from nucliadb.common.maindb.utils import get_driver
|
26
|
+
from nucliadb.common.models_utils import from_proto
|
26
27
|
from nucliadb.ingest.fields.base import Field
|
27
28
|
from nucliadb.ingest.fields.conversation import Conversation
|
28
29
|
from nucliadb.ingest.fields.file import File
|
29
30
|
from nucliadb.ingest.fields.link import Link
|
30
31
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
31
32
|
from nucliadb.ingest.orm.resource import Resource as ORMResource
|
32
|
-
from nucliadb_models.common import
|
33
|
+
from nucliadb_models.common import FieldTypeName
|
33
34
|
from nucliadb_models.resource import (
|
34
35
|
ConversationFieldData,
|
35
36
|
ConversationFieldExtractedData,
|
36
|
-
DatetimeFieldData,
|
37
|
-
DatetimeFieldExtractedData,
|
38
37
|
Error,
|
39
38
|
ExtractedDataType,
|
40
39
|
ExtractedDataTypeName,
|
41
40
|
FileFieldData,
|
42
41
|
FileFieldExtractedData,
|
43
42
|
GenericFieldData,
|
44
|
-
KeywordsetFieldData,
|
45
|
-
KeywordsetFieldExtractedData,
|
46
|
-
LayoutFieldData,
|
47
|
-
LayoutFieldExtractedData,
|
48
43
|
LinkFieldData,
|
49
44
|
LinkFieldExtractedData,
|
50
45
|
QueueType,
|
@@ -70,36 +65,32 @@ async def set_resource_field_extracted_data(
|
|
70
65
|
if ExtractedDataTypeName.TEXT in wanted_extracted_data:
|
71
66
|
data_et = await field.get_extracted_text()
|
72
67
|
if data_et is not None:
|
73
|
-
field_data.text =
|
68
|
+
field_data.text = from_proto.extracted_text(data_et)
|
74
69
|
|
75
70
|
metadata_wanted = ExtractedDataTypeName.METADATA in wanted_extracted_data
|
76
|
-
shortened_metadata_wanted =
|
77
|
-
ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
|
78
|
-
)
|
71
|
+
shortened_metadata_wanted = ExtractedDataTypeName.SHORTENED_METADATA in wanted_extracted_data
|
79
72
|
if metadata_wanted or shortened_metadata_wanted:
|
80
73
|
data_fcm = await field.get_field_metadata()
|
81
74
|
|
82
75
|
if data_fcm is not None:
|
83
|
-
field_data.metadata =
|
76
|
+
field_data.metadata = from_proto.field_computed_metadata(
|
84
77
|
data_fcm, shortened=shortened_metadata_wanted and not metadata_wanted
|
85
78
|
)
|
86
79
|
|
87
80
|
if ExtractedDataTypeName.LARGE_METADATA in wanted_extracted_data:
|
88
81
|
data_lcm = await field.get_large_field_metadata()
|
89
82
|
if data_lcm is not None:
|
90
|
-
field_data.large_metadata =
|
91
|
-
data_lcm
|
92
|
-
)
|
83
|
+
field_data.large_metadata = from_proto.large_computed_metadata(data_lcm)
|
93
84
|
|
94
85
|
if ExtractedDataTypeName.VECTOR in wanted_extracted_data:
|
95
86
|
data_vec = await field.get_vectors()
|
96
87
|
if data_vec is not None:
|
97
|
-
field_data.vectors =
|
88
|
+
field_data.vectors = from_proto.vector_object(data_vec)
|
98
89
|
|
99
90
|
if ExtractedDataTypeName.QA in wanted_extracted_data:
|
100
91
|
qa = await field.get_question_answers()
|
101
92
|
if qa is not None:
|
102
|
-
field_data.question_answers =
|
93
|
+
field_data.question_answers = from_proto.field_question_answers(qa)
|
103
94
|
|
104
95
|
if (
|
105
96
|
isinstance(field, File)
|
@@ -108,7 +99,7 @@ async def set_resource_field_extracted_data(
|
|
108
99
|
):
|
109
100
|
data_fed = await field.get_file_extracted_data()
|
110
101
|
if data_fed is not None:
|
111
|
-
field_data.file =
|
102
|
+
field_data.file = from_proto.file_extracted_data(data_fed)
|
112
103
|
|
113
104
|
if (
|
114
105
|
isinstance(field, Link)
|
@@ -117,7 +108,7 @@ async def set_resource_field_extracted_data(
|
|
117
108
|
):
|
118
109
|
data_led = await field.get_link_extracted_data()
|
119
110
|
if data_led is not None:
|
120
|
-
field_data.link =
|
111
|
+
field_data.link = from_proto.link_extracted_data(data_led)
|
121
112
|
|
122
113
|
|
123
114
|
async def serialize(
|
@@ -130,7 +121,7 @@ async def serialize(
|
|
130
121
|
slug: Optional[str] = None,
|
131
122
|
) -> Optional[Resource]:
|
132
123
|
driver = get_driver()
|
133
|
-
async with driver.transaction(
|
124
|
+
async with driver.transaction(read_only=True) as txn:
|
134
125
|
return await managed_serialize(
|
135
126
|
txn,
|
136
127
|
kbid,
|
@@ -153,9 +144,7 @@ async def managed_serialize(
|
|
153
144
|
service_name: Optional[str] = None,
|
154
145
|
slug: Optional[str] = None,
|
155
146
|
) -> Optional[Resource]:
|
156
|
-
orm_resource = await get_orm_resource(
|
157
|
-
txn, kbid, rid=rid, slug=slug, service_name=service_name
|
158
|
-
)
|
147
|
+
orm_resource = await get_orm_resource(txn, kbid, rid=rid, slug=slug, service_name=service_name)
|
159
148
|
if orm_resource is None:
|
160
149
|
return None
|
161
150
|
|
@@ -163,9 +152,7 @@ async def managed_serialize(
|
|
163
152
|
|
164
153
|
include_values = ResourceProperties.VALUES in show
|
165
154
|
|
166
|
-
include_extracted_data =
|
167
|
-
ResourceProperties.EXTRACTED in show and extracted is not []
|
168
|
-
)
|
155
|
+
include_extracted_data = ResourceProperties.EXTRACTED in show and extracted is not []
|
169
156
|
|
170
157
|
if ResourceProperties.BASIC in show:
|
171
158
|
await orm_resource.get_basic()
|
@@ -175,8 +162,8 @@ async def managed_serialize(
|
|
175
162
|
resource.title = orm_resource.basic.title
|
176
163
|
resource.summary = orm_resource.basic.summary
|
177
164
|
resource.icon = orm_resource.basic.icon
|
178
|
-
resource.layout = orm_resource.basic.layout
|
179
165
|
resource.thumbnail = orm_resource.basic.thumbnail
|
166
|
+
resource.hidden = orm_resource.basic.hidden
|
180
167
|
resource.created = (
|
181
168
|
orm_resource.basic.created.ToDatetime()
|
182
169
|
if orm_resource.basic.HasField("created")
|
@@ -188,49 +175,37 @@ async def managed_serialize(
|
|
188
175
|
else None
|
189
176
|
)
|
190
177
|
|
191
|
-
resource.metadata =
|
192
|
-
|
193
|
-
)
|
194
|
-
resource.usermetadata = models.UserMetadata.from_message(
|
195
|
-
orm_resource.basic.usermetadata
|
196
|
-
)
|
178
|
+
resource.metadata = from_proto.metadata(orm_resource.basic.metadata)
|
179
|
+
resource.usermetadata = from_proto.user_metadata(orm_resource.basic.usermetadata)
|
197
180
|
resource.fieldmetadata = [
|
198
|
-
|
199
|
-
for fm in orm_resource.basic.fieldmetadata
|
181
|
+
from_proto.user_field_metadata(fm) for fm in orm_resource.basic.fieldmetadata
|
200
182
|
]
|
201
|
-
resource.computedmetadata =
|
202
|
-
orm_resource.basic.computedmetadata
|
203
|
-
)
|
183
|
+
resource.computedmetadata = from_proto.computed_metadata(orm_resource.basic.computedmetadata)
|
204
184
|
|
205
185
|
resource.last_seqid = orm_resource.basic.last_seqid
|
206
186
|
|
207
187
|
# 0 on the proto means it was never set, as the first valid value for this field will always be 1
|
208
188
|
resource.last_account_seq = (
|
209
|
-
orm_resource.basic.last_account_seq
|
210
|
-
if orm_resource.basic.last_account_seq != 0
|
211
|
-
else None
|
189
|
+
orm_resource.basic.last_account_seq if orm_resource.basic.last_account_seq != 0 else None
|
212
190
|
)
|
213
|
-
resource.queue = QueueType[
|
214
|
-
orm_resource.basic.QueueType.Name(orm_resource.basic.queue)
|
215
|
-
]
|
191
|
+
resource.queue = QueueType[orm_resource.basic.QueueType.Name(orm_resource.basic.queue)]
|
216
192
|
|
217
193
|
if ResourceProperties.RELATIONS in show:
|
218
194
|
await orm_resource.get_relations()
|
219
195
|
if orm_resource.relations is not None:
|
220
196
|
resource.relations = [
|
221
|
-
|
222
|
-
for relation in orm_resource.relations.relations
|
197
|
+
from_proto.relation(relation) for relation in orm_resource.relations.relations
|
223
198
|
]
|
224
199
|
|
225
200
|
if ResourceProperties.ORIGIN in show:
|
226
201
|
await orm_resource.get_origin()
|
227
202
|
if orm_resource.origin is not None:
|
228
|
-
resource.origin =
|
203
|
+
resource.origin = from_proto.origin(orm_resource.origin)
|
229
204
|
|
230
205
|
if ResourceProperties.EXTRA in show:
|
231
206
|
await orm_resource.get_extra()
|
232
207
|
if orm_resource.extra is not None:
|
233
|
-
resource.extra =
|
208
|
+
resource.extra = from_proto.extra(orm_resource.extra)
|
234
209
|
|
235
210
|
include_errors = ResourceProperties.ERRORS in show
|
236
211
|
|
@@ -241,11 +216,11 @@ async def managed_serialize(
|
|
241
216
|
for gid in orm_resource.security.access_groups:
|
242
217
|
resource.security.access_groups.append(gid)
|
243
218
|
|
244
|
-
if field_type_filter and (include_values or include_extracted_data):
|
219
|
+
if (field_type_filter and (include_values or include_extracted_data)) or include_errors:
|
245
220
|
await orm_resource.get_fields()
|
246
221
|
resource.data = ResourceData()
|
247
|
-
for (field_type,
|
248
|
-
field_type_name =
|
222
|
+
for (field_type, _), field in orm_resource.fields.items():
|
223
|
+
field_type_name = from_proto.field_type_name(field_type)
|
249
224
|
if field_type_name not in field_type_filter:
|
250
225
|
continue
|
251
226
|
|
@@ -260,18 +235,12 @@ async def managed_serialize(
|
|
260
235
|
if field.id not in resource.data.texts:
|
261
236
|
resource.data.texts[field.id] = TextFieldData()
|
262
237
|
if include_value:
|
263
|
-
serialized_value = (
|
264
|
-
models.FieldText.from_message(value)
|
265
|
-
if value is not None
|
266
|
-
else None
|
267
|
-
)
|
238
|
+
serialized_value = from_proto.field_text(value) if value is not None else None
|
268
239
|
resource.data.texts[field.id].value = serialized_value
|
269
240
|
if include_errors:
|
270
241
|
error = await field.get_error()
|
271
242
|
if error is not None:
|
272
|
-
resource.data.texts[field.id].error = Error(
|
273
|
-
body=error.error, code=error.code
|
274
|
-
)
|
243
|
+
resource.data.texts[field.id].error = Error(body=error.error, code=error.code)
|
275
244
|
if include_extracted_data:
|
276
245
|
resource.data.texts[field.id].extracted = TextFieldExtractedData()
|
277
246
|
await set_resource_field_extracted_data(
|
@@ -287,20 +256,14 @@ async def managed_serialize(
|
|
287
256
|
resource.data.files[field.id] = FileFieldData()
|
288
257
|
if include_value:
|
289
258
|
if value is not None:
|
290
|
-
resource.data.files[
|
291
|
-
field.id
|
292
|
-
].value = models.FieldFile.from_message(
|
293
|
-
value # type: ignore
|
294
|
-
)
|
259
|
+
resource.data.files[field.id].value = from_proto.field_file(value)
|
295
260
|
else:
|
296
261
|
resource.data.files[field.id].value = None
|
297
262
|
|
298
263
|
if include_errors:
|
299
264
|
error = await field.get_error()
|
300
265
|
if error is not None:
|
301
|
-
resource.data.files[field.id].error = Error(
|
302
|
-
body=error.error, code=error.code
|
303
|
-
)
|
266
|
+
resource.data.files[field.id].error = Error(body=error.error, code=error.code)
|
304
267
|
|
305
268
|
if include_extracted_data:
|
306
269
|
resource.data.files[field.id].extracted = FileFieldExtractedData()
|
@@ -316,16 +279,12 @@ async def managed_serialize(
|
|
316
279
|
if field.id not in resource.data.links:
|
317
280
|
resource.data.links[field.id] = LinkFieldData()
|
318
281
|
if include_value and value is not None:
|
319
|
-
resource.data.links[field.id].value = models.FieldLink.from_message(
|
320
|
-
value
|
321
|
-
)
|
282
|
+
resource.data.links[field.id].value = from_proto.field_link(value)
|
322
283
|
|
323
284
|
if include_errors:
|
324
285
|
error = await field.get_error()
|
325
286
|
if error is not None:
|
326
|
-
resource.data.links[field.id].error = Error(
|
327
|
-
body=error.error, code=error.code
|
328
|
-
)
|
287
|
+
resource.data.links[field.id].error = Error(body=error.error, code=error.code)
|
329
288
|
|
330
289
|
if include_extracted_data:
|
331
290
|
resource.data.links[field.id].extracted = LinkFieldExtractedData()
|
@@ -335,33 +294,6 @@ async def managed_serialize(
|
|
335
294
|
field_type_name,
|
336
295
|
extracted,
|
337
296
|
)
|
338
|
-
elif field_type_name is FieldTypeName.LAYOUT:
|
339
|
-
if resource.data.layouts is None:
|
340
|
-
resource.data.layouts = {}
|
341
|
-
if field.id not in resource.data.layouts:
|
342
|
-
resource.data.layouts[field.id] = LayoutFieldData()
|
343
|
-
if include_value:
|
344
|
-
resource.data.layouts[
|
345
|
-
field.id
|
346
|
-
].value = models.FieldLayout.from_message(
|
347
|
-
value # type: ignore
|
348
|
-
)
|
349
|
-
if include_errors:
|
350
|
-
error = await field.get_error()
|
351
|
-
if error is not None:
|
352
|
-
resource.data.layouts[field.id].error = Error(
|
353
|
-
body=error.error, code=error.code
|
354
|
-
)
|
355
|
-
if include_extracted_data:
|
356
|
-
resource.data.layouts[field.id].extracted = (
|
357
|
-
LayoutFieldExtractedData()
|
358
|
-
)
|
359
|
-
await set_resource_field_extracted_data(
|
360
|
-
field,
|
361
|
-
resource.data.layouts[field.id].extracted,
|
362
|
-
field_type_name,
|
363
|
-
extracted,
|
364
|
-
)
|
365
297
|
elif field_type_name is FieldTypeName.CONVERSATION:
|
366
298
|
if resource.data.conversations is None:
|
367
299
|
resource.data.conversations = {}
|
@@ -375,73 +307,15 @@ async def managed_serialize(
|
|
375
307
|
)
|
376
308
|
if include_value and isinstance(field, Conversation):
|
377
309
|
value = await field.get_metadata()
|
378
|
-
resource.data.conversations[field.id].value = (
|
379
|
-
models.FieldConversation.from_message(value)
|
380
|
-
)
|
310
|
+
resource.data.conversations[field.id].value = from_proto.field_conversation(value)
|
381
311
|
if include_extracted_data:
|
382
|
-
resource.data.conversations[field.id].extracted = (
|
383
|
-
ConversationFieldExtractedData()
|
384
|
-
)
|
312
|
+
resource.data.conversations[field.id].extracted = ConversationFieldExtractedData()
|
385
313
|
await set_resource_field_extracted_data(
|
386
314
|
field,
|
387
315
|
resource.data.conversations[field.id].extracted,
|
388
316
|
field_type_name,
|
389
317
|
extracted,
|
390
318
|
)
|
391
|
-
elif field_type_name is FieldTypeName.DATETIME:
|
392
|
-
if resource.data.datetimes is None:
|
393
|
-
resource.data.datetimes = {}
|
394
|
-
if field.id not in resource.data.datetimes:
|
395
|
-
resource.data.datetimes[field.id] = DatetimeFieldData()
|
396
|
-
if include_errors:
|
397
|
-
error = await field.get_error()
|
398
|
-
if error is not None:
|
399
|
-
resource.data.datetimes[field.id].error = Error(
|
400
|
-
body=error.error, code=error.code
|
401
|
-
)
|
402
|
-
if include_value:
|
403
|
-
resource.data.datetimes[
|
404
|
-
field.id
|
405
|
-
].value = models.FieldDatetime.from_message(
|
406
|
-
value # type: ignore
|
407
|
-
)
|
408
|
-
if include_extracted_data:
|
409
|
-
resource.data.datetimes[field.id].extracted = (
|
410
|
-
DatetimeFieldExtractedData()
|
411
|
-
)
|
412
|
-
await set_resource_field_extracted_data(
|
413
|
-
field,
|
414
|
-
resource.data.datetimes[field.id].extracted,
|
415
|
-
field_type_name,
|
416
|
-
extracted,
|
417
|
-
)
|
418
|
-
elif field_type_name is FieldTypeName.KEYWORDSET:
|
419
|
-
if resource.data.keywordsets is None:
|
420
|
-
resource.data.keywordsets = {field.id: KeywordsetFieldData()}
|
421
|
-
if field.id not in resource.data.keywordsets:
|
422
|
-
resource.data.keywordsets[field.id] = KeywordsetFieldData()
|
423
|
-
if include_errors:
|
424
|
-
error = await field.get_error()
|
425
|
-
if error is not None:
|
426
|
-
resource.data.keywordsets[field.id].error = Error(
|
427
|
-
body=error.error, code=error.code
|
428
|
-
)
|
429
|
-
if include_value:
|
430
|
-
resource.data.keywordsets[
|
431
|
-
field.id
|
432
|
-
].value = models.FieldKeywordset.from_message(
|
433
|
-
value # type: ignore
|
434
|
-
)
|
435
|
-
if include_extracted_data:
|
436
|
-
resource.data.keywordsets[field.id].extracted = (
|
437
|
-
KeywordsetFieldExtractedData()
|
438
|
-
)
|
439
|
-
await set_resource_field_extracted_data(
|
440
|
-
field,
|
441
|
-
resource.data.keywordsets[field.id].extracted,
|
442
|
-
field_type_name,
|
443
|
-
extracted,
|
444
|
-
)
|
445
319
|
elif field_type_name is FieldTypeName.GENERIC:
|
446
320
|
if resource.data.generics is None:
|
447
321
|
resource.data.generics = {}
|
@@ -452,14 +326,10 @@ async def managed_serialize(
|
|
452
326
|
if include_errors:
|
453
327
|
error = await field.get_error()
|
454
328
|
if error is not None:
|
455
|
-
resource.data.generics[field.id].error = Error(
|
456
|
-
body=error.error, code=error.code
|
457
|
-
)
|
329
|
+
resource.data.generics[field.id].error = Error(body=error.error, code=error.code)
|
458
330
|
if include_extracted_data:
|
459
331
|
resource.data.generics[field.id].extracted = TextFieldExtractedData(
|
460
|
-
text=models.ExtractedText(
|
461
|
-
text=resource.data.generics[field.id].value
|
462
|
-
)
|
332
|
+
text=models.ExtractedText(text=resource.data.generics[field.id].value)
|
463
333
|
)
|
464
334
|
return resource
|
465
335
|
|
@@ -496,6 +366,6 @@ async def get_resource_uuid_by_slug(
|
|
496
366
|
) -> Optional[str]:
|
497
367
|
storage = await get_storage(service_name=service_name)
|
498
368
|
driver = get_driver()
|
499
|
-
async with driver.transaction() as txn:
|
369
|
+
async with driver.transaction(read_only=True) as txn:
|
500
370
|
kb = KnowledgeBox(txn, storage, kbid)
|
501
371
|
return await kb.get_resource_uuid_by_slug(slug)
|
@@ -48,9 +48,7 @@ async def start_grpc(service_name: Optional[str] = None):
|
|
48
48
|
|
49
49
|
await server.start()
|
50
50
|
|
51
|
-
logger.info(
|
52
|
-
f"======= Ingest GRPC running on http://0.0.0.0:{settings.grpc_port}/ ======"
|
53
|
-
)
|
51
|
+
logger.info(f"======= Ingest GRPC running on http://0.0.0.0:{settings.grpc_port}/ ======")
|
54
52
|
|
55
53
|
async def finalizer():
|
56
54
|
await health_check_finalizer()
|