nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -403
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +70 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +143 -117
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +37 -128
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/writer/back_pressure.py
CHANGED
@@ -28,7 +28,6 @@ from typing import Optional
|
|
28
28
|
from async_lru import alru_cache
|
29
29
|
from cachetools import TTLCache
|
30
30
|
from fastapi import HTTPException, Request
|
31
|
-
from nucliadb_protos.writer_pb2 import ShardObject
|
32
31
|
|
33
32
|
from nucliadb.common import datamanagers
|
34
33
|
from nucliadb.common.cluster.manager import get_index_nodes
|
@@ -37,6 +36,7 @@ from nucliadb.common.context.fastapi import get_app_context
|
|
37
36
|
from nucliadb.common.http_clients.processing import ProcessingHTTPClient
|
38
37
|
from nucliadb.writer import logger
|
39
38
|
from nucliadb.writer.settings import back_pressure_settings as settings
|
39
|
+
from nucliadb_protos.writer_pb2 import ShardObject
|
40
40
|
from nucliadb_telemetry import metrics
|
41
41
|
from nucliadb_utils import const
|
42
42
|
from nucliadb_utils.nats import NatsConnectionManager
|
@@ -112,9 +112,7 @@ def cached_back_pressure(kbid: str, resource_uuid: Optional[str] = None):
|
|
112
112
|
if data is not None:
|
113
113
|
try_after = data.try_after
|
114
114
|
back_pressure_type = data.type
|
115
|
-
RATE_LIMITED_REQUESTS_COUNTER.inc(
|
116
|
-
{"type": back_pressure_type, "cached": "true"}
|
117
|
-
)
|
115
|
+
RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "true"})
|
118
116
|
logger.info(
|
119
117
|
"Back pressure applied from cache",
|
120
118
|
extra={
|
@@ -137,9 +135,7 @@ def cached_back_pressure(kbid: str, resource_uuid: Optional[str] = None):
|
|
137
135
|
except BackPressureException as exc:
|
138
136
|
try_after = exc.data.try_after
|
139
137
|
back_pressure_type = exc.data.type
|
140
|
-
RATE_LIMITED_REQUESTS_COUNTER.inc(
|
141
|
-
{"type": back_pressure_type, "cached": "false"}
|
142
|
-
)
|
138
|
+
RATE_LIMITED_REQUESTS_COUNTER.inc({"type": back_pressure_type, "cached": "false"})
|
143
139
|
_cache.set(cache_key, exc.data)
|
144
140
|
raise HTTPException(
|
145
141
|
status_code=429,
|
@@ -248,14 +244,10 @@ class Materializer:
|
|
248
244
|
for node in get_index_nodes():
|
249
245
|
try:
|
250
246
|
with back_pressure_observer({"type": "get_indexing_pending"}):
|
251
|
-
self.indexing_pending[node.id] = (
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
consumer=const.Streams.INDEX.group.format(
|
256
|
-
node=node.id
|
257
|
-
),
|
258
|
-
)
|
247
|
+
self.indexing_pending[node.id] = await get_nats_consumer_pending_messages(
|
248
|
+
self.nats_manager,
|
249
|
+
stream=const.Streams.INDEX.name,
|
250
|
+
consumer=const.Streams.INDEX.group.format(node=node.id),
|
259
251
|
)
|
260
252
|
except Exception:
|
261
253
|
logger.exception(
|
@@ -336,9 +328,7 @@ def get_materializer() -> Materializer:
|
|
336
328
|
return MATERIALIZER
|
337
329
|
|
338
330
|
|
339
|
-
async def maybe_back_pressure(
|
340
|
-
request: Request, kbid: str, resource_uuid: Optional[str] = None
|
341
|
-
) -> None:
|
331
|
+
async def maybe_back_pressure(request: Request, kbid: str, resource_uuid: Optional[str] = None) -> None:
|
342
332
|
"""
|
343
333
|
This function does system checks to see if we need to put back pressure on writes.
|
344
334
|
In that case, a HTTP 429 will be raised with the estimated time to try again.
|
@@ -348,9 +338,7 @@ async def maybe_back_pressure(
|
|
348
338
|
await back_pressure_checks(request, kbid, resource_uuid)
|
349
339
|
|
350
340
|
|
351
|
-
async def back_pressure_checks(
|
352
|
-
request: Request, kbid: str, resource_uuid: Optional[str] = None
|
353
|
-
):
|
341
|
+
async def back_pressure_checks(request: Request, kbid: str, resource_uuid: Optional[str] = None):
|
354
342
|
"""
|
355
343
|
Will raise a 429 if back pressure is needed:
|
356
344
|
- If the processing engine is behind.
|
@@ -361,9 +349,7 @@ async def back_pressure_checks(
|
|
361
349
|
materializer = get_materializer()
|
362
350
|
with cached_back_pressure(kbid, resource_uuid):
|
363
351
|
check_ingest_behind(materializer.get_ingest_pending())
|
364
|
-
await check_indexing_behind(
|
365
|
-
context, kbid, resource_uuid, materializer.get_indexing_pending()
|
366
|
-
)
|
352
|
+
await check_indexing_behind(context, kbid, resource_uuid, materializer.get_indexing_pending())
|
367
353
|
await check_processing_behind(materializer, kbid)
|
368
354
|
|
369
355
|
|
@@ -418,9 +404,7 @@ async def check_indexing_behind(
|
|
418
404
|
|
419
405
|
# Get nodes that are involved in the indexing of the request
|
420
406
|
if resource_uuid is not None:
|
421
|
-
nodes_to_check = await get_nodes_for_resource_shard(
|
422
|
-
context, kbid, resource_uuid
|
423
|
-
)
|
407
|
+
nodes_to_check = await get_nodes_for_resource_shard(context, kbid, resource_uuid)
|
424
408
|
else:
|
425
409
|
nodes_to_check = await get_nodes_for_kb_active_shards(context, kbid)
|
426
410
|
|
@@ -488,9 +472,7 @@ def estimate_try_after(rate: float, pending: int, max_wait: int) -> datetime:
|
|
488
472
|
|
489
473
|
|
490
474
|
@alru_cache(maxsize=1024, ttl=60 * 15)
|
491
|
-
async def get_nodes_for_kb_active_shards(
|
492
|
-
context: ApplicationContext, kbid: str
|
493
|
-
) -> list[str]:
|
475
|
+
async def get_nodes_for_kb_active_shards(context: ApplicationContext, kbid: str) -> list[str]:
|
494
476
|
with back_pressure_observer({"type": "get_kb_active_shard"}):
|
495
477
|
active_shard = await get_kb_active_shard(context, kbid)
|
496
478
|
if active_shard is None:
|
@@ -521,20 +503,16 @@ async def get_nats_consumer_pending_messages(
|
|
521
503
|
return consumer_info.num_pending
|
522
504
|
|
523
505
|
|
524
|
-
async def get_kb_active_shard(
|
525
|
-
context
|
526
|
-
) -> Optional[ShardObject]:
|
527
|
-
async with context.kv_driver.transaction() as txn:
|
506
|
+
async def get_kb_active_shard(context: ApplicationContext, kbid: str) -> Optional[ShardObject]:
|
507
|
+
async with context.kv_driver.transaction(read_only=True) as txn:
|
528
508
|
return await context.shard_manager.get_current_active_shard(txn, kbid)
|
529
509
|
|
530
510
|
|
531
511
|
async def get_resource_shard(
|
532
512
|
context: ApplicationContext, kbid: str, resource_uuid: str
|
533
513
|
) -> Optional[ShardObject]:
|
534
|
-
async with datamanagers.
|
535
|
-
shard_id = await datamanagers.resources.get_resource_shard_id(
|
536
|
-
txn, kbid=kbid, rid=resource_uuid
|
537
|
-
)
|
514
|
+
async with datamanagers.with_ro_transaction() as txn:
|
515
|
+
shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=resource_uuid)
|
538
516
|
if shard_id is None:
|
539
517
|
# Resource does not exist
|
540
518
|
logger.debug(
|
nucliadb/writer/exceptions.py
CHANGED
nucliadb/writer/lifecycle.py
CHANGED
@@ -17,13 +17,20 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from
|
20
|
+
from contextlib import asynccontextmanager
|
21
|
+
|
22
|
+
from fastapi import FastAPI
|
23
|
+
|
24
|
+
from nucliadb.common.context.fastapi import inject_app_context
|
25
|
+
from nucliadb.ingest.processing import start_processing_engine, stop_processing_engine
|
21
26
|
from nucliadb.ingest.utils import start_ingest, stop_ingest
|
22
27
|
from nucliadb.writer import SERVICE_NAME
|
28
|
+
from nucliadb.writer.back_pressure import start_materializer, stop_materializer
|
29
|
+
from nucliadb.writer.settings import back_pressure_settings
|
23
30
|
from nucliadb.writer.tus import finalize as storage_finalize
|
24
31
|
from nucliadb.writer.tus import initialize as storage_initialize
|
25
|
-
from nucliadb.writer.utilities import get_processing
|
26
32
|
from nucliadb_telemetry.utils import clean_telemetry, setup_telemetry
|
33
|
+
from nucliadb_utils.settings import is_onprem_nucliadb
|
27
34
|
from nucliadb_utils.utilities import (
|
28
35
|
finalize_utilities,
|
29
36
|
start_partitioning_utility,
|
@@ -32,29 +39,28 @@ from nucliadb_utils.utilities import (
|
|
32
39
|
)
|
33
40
|
|
34
41
|
|
35
|
-
|
36
|
-
|
42
|
+
@asynccontextmanager
|
43
|
+
async def lifespan(app: FastAPI):
|
44
|
+
back_pressure_enabled = back_pressure_settings.enabled and not is_onprem_nucliadb()
|
37
45
|
|
46
|
+
await setup_telemetry(SERVICE_NAME)
|
38
47
|
await start_ingest(SERVICE_NAME)
|
39
|
-
|
40
48
|
await start_processing_engine()
|
41
|
-
|
42
49
|
start_partitioning_utility()
|
43
|
-
|
44
50
|
await start_transaction_utility(SERVICE_NAME)
|
45
51
|
await storage_initialize()
|
46
52
|
|
53
|
+
# Inject application context into the fastapi app's state
|
54
|
+
async with inject_app_context(app) as context:
|
55
|
+
if back_pressure_enabled:
|
56
|
+
await start_materializer(context)
|
57
|
+
yield
|
47
58
|
|
48
|
-
|
59
|
+
if back_pressure_enabled:
|
60
|
+
await stop_materializer()
|
49
61
|
await stop_transaction_utility()
|
50
|
-
|
51
62
|
await stop_ingest()
|
52
|
-
|
53
|
-
if processing is not None:
|
54
|
-
await processing.finalize()
|
55
|
-
|
63
|
+
await stop_processing_engine()
|
56
64
|
await storage_finalize()
|
57
|
-
|
58
65
|
await clean_telemetry(SERVICE_NAME)
|
59
|
-
|
60
66
|
await finalize_utilities()
|
nucliadb/writer/py.typed
ADDED
File without changes
|
@@ -19,9 +19,10 @@
|
|
19
19
|
#
|
20
20
|
from datetime import datetime
|
21
21
|
|
22
|
-
from nucliadb_protos.writer_pb2 import Audit
|
23
22
|
from starlette.requests import Request
|
24
23
|
|
24
|
+
from nucliadb_protos.writer_pb2 import Audit
|
25
|
+
|
25
26
|
|
26
27
|
def parse_audit(audit: Audit, request: Request):
|
27
28
|
audit.user = request.headers.get("X-NUCLIADB-USER", "")
|
@@ -18,8 +18,31 @@
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
20
|
from datetime import datetime
|
21
|
+
from typing import Optional
|
21
22
|
|
22
23
|
from fastapi import HTTPException
|
24
|
+
|
25
|
+
from nucliadb.common.models_utils import to_proto
|
26
|
+
from nucliadb.common.models_utils.from_proto import (
|
27
|
+
RelationNodeTypeMap,
|
28
|
+
RelationTypeMap,
|
29
|
+
)
|
30
|
+
from nucliadb.ingest.orm.utils import set_title
|
31
|
+
from nucliadb.ingest.processing import PushPayload
|
32
|
+
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
33
|
+
from nucliadb_models.file import FileField
|
34
|
+
from nucliadb_models.link import LinkField
|
35
|
+
from nucliadb_models.metadata import (
|
36
|
+
ParagraphAnnotation,
|
37
|
+
QuestionAnswerAnnotation,
|
38
|
+
)
|
39
|
+
from nucliadb_models.text import TEXT_FORMAT_TO_MIMETYPE, PushTextFormat, Text
|
40
|
+
from nucliadb_models.writer import (
|
41
|
+
ComingResourcePayload,
|
42
|
+
CreateResourcePayload,
|
43
|
+
UpdateResourcePayload,
|
44
|
+
)
|
45
|
+
from nucliadb_protos.knowledgebox_pb2 import KnowledgeBoxConfig
|
23
46
|
from nucliadb_protos.resources_pb2 import (
|
24
47
|
Answers,
|
25
48
|
Basic,
|
@@ -30,38 +53,19 @@ from nucliadb_protos.resources_pb2 import (
|
|
30
53
|
Metadata,
|
31
54
|
PageSelections,
|
32
55
|
Paragraph,
|
56
|
+
TokenSplit,
|
57
|
+
UserFieldMetadata,
|
58
|
+
VisualSelection,
|
33
59
|
)
|
34
60
|
from nucliadb_protos.resources_pb2 import ParagraphAnnotation as PBParagraphAnnotation
|
35
61
|
from nucliadb_protos.resources_pb2 import (
|
36
62
|
QuestionAnswerAnnotation as PBQuestionAnswerAnnotation,
|
37
63
|
)
|
38
|
-
from nucliadb_protos.resources_pb2 import TokenSplit, UserFieldMetadata, VisualSelection
|
39
64
|
from nucliadb_protos.utils_pb2 import Relation, RelationNode
|
40
65
|
from nucliadb_protos.writer_pb2 import BrokerMessage
|
41
66
|
|
42
|
-
from nucliadb.ingest.orm.utils import set_title
|
43
|
-
from nucliadb.ingest.processing import PushPayload
|
44
|
-
from nucliadb_models.common import FIELD_TYPES_MAP_REVERSE
|
45
|
-
from nucliadb_models.file import FileField
|
46
|
-
from nucliadb_models.link import LinkField
|
47
|
-
from nucliadb_models.metadata import (
|
48
|
-
ParagraphAnnotation,
|
49
|
-
QuestionAnswerAnnotation,
|
50
|
-
RelationNodeTypeMap,
|
51
|
-
RelationTypeMap,
|
52
|
-
)
|
53
|
-
from nucliadb_models.text import TEXT_FORMAT_TO_MIMETYPE, PushTextFormat, Text
|
54
|
-
from nucliadb_models.writer import (
|
55
|
-
GENERIC_MIME_TYPE,
|
56
|
-
ComingResourcePayload,
|
57
|
-
CreateResourcePayload,
|
58
|
-
UpdateResourcePayload,
|
59
|
-
)
|
60
|
-
|
61
67
|
|
62
|
-
def parse_basic_modify(
|
63
|
-
bm: BrokerMessage, item: ComingResourcePayload, toprocess: PushPayload
|
64
|
-
):
|
68
|
+
def parse_basic_modify(bm: BrokerMessage, item: ComingResourcePayload, toprocess: PushPayload):
|
65
69
|
bm.basic.modified.FromDatetime(datetime.now())
|
66
70
|
if item.title:
|
67
71
|
set_title(bm, toprocess, item.title)
|
@@ -83,14 +87,9 @@ def parse_basic_modify(
|
|
83
87
|
bm.basic.metadata.useful = True
|
84
88
|
bm.basic.metadata.status = Metadata.Status.PENDING
|
85
89
|
|
86
|
-
toprocess.genericfield["summary"] = Text(
|
87
|
-
body=item.summary, format=PushTextFormat.PLAIN
|
88
|
-
)
|
90
|
+
toprocess.genericfield["summary"] = Text(body=item.summary, format=PushTextFormat.PLAIN)
|
89
91
|
if item.thumbnail:
|
90
92
|
bm.basic.thumbnail = item.thumbnail
|
91
|
-
if item.layout:
|
92
|
-
bm.basic.layout = item.layout
|
93
|
-
|
94
93
|
if item.metadata is not None:
|
95
94
|
bm.basic.metadata.metadata.update(item.metadata.metadata)
|
96
95
|
if item.metadata.language:
|
@@ -147,9 +146,8 @@ def parse_basic_modify(
|
|
147
146
|
userfieldmetadata.question_answers.append(qa_annotation_pb)
|
148
147
|
|
149
148
|
userfieldmetadata.field.field = fieldmetadata.field.field
|
150
|
-
|
151
|
-
|
152
|
-
]
|
149
|
+
|
150
|
+
userfieldmetadata.field.field_type = to_proto.field_type(fieldmetadata.field.field_type)
|
153
151
|
|
154
152
|
bm.basic.fieldmetadata.append(userfieldmetadata)
|
155
153
|
|
@@ -167,9 +165,7 @@ def parse_basic_modify(
|
|
167
165
|
]
|
168
166
|
)
|
169
167
|
|
170
|
-
relation_node_resource = RelationNode(
|
171
|
-
value=bm.uuid, ntype=RelationNode.NodeType.RESOURCE
|
172
|
-
)
|
168
|
+
relation_node_resource = RelationNode(value=bm.uuid, ntype=RelationNode.NodeType.RESOURCE)
|
173
169
|
relations = []
|
174
170
|
for relation in item.usermetadata.relations:
|
175
171
|
if relation.from_ is None:
|
@@ -205,8 +201,16 @@ def parse_basic_modify(
|
|
205
201
|
unique_groups = list(set(item.security.access_groups))
|
206
202
|
bm.security.access_groups.extend(unique_groups)
|
207
203
|
|
204
|
+
if item.hidden is not None:
|
205
|
+
bm.basic.hidden = item.hidden
|
206
|
+
|
208
207
|
|
209
|
-
def parse_basic(bm: BrokerMessage, item: CreateResourcePayload, toprocess: PushPayload):
|
208
|
+
def parse_basic_creation(
|
209
|
+
bm: BrokerMessage,
|
210
|
+
item: CreateResourcePayload,
|
211
|
+
toprocess: PushPayload,
|
212
|
+
kb_config: Optional[KnowledgeBoxConfig],
|
213
|
+
):
|
210
214
|
bm.basic.created.FromDatetime(datetime.now())
|
211
215
|
|
212
216
|
if item.title is None:
|
@@ -215,6 +219,10 @@ def parse_basic(bm: BrokerMessage, item: CreateResourcePayload, toprocess: PushP
|
|
215
219
|
|
216
220
|
parse_basic_modify(bm, item, toprocess)
|
217
221
|
|
222
|
+
if item.hidden is None:
|
223
|
+
if kb_config and kb_config.hidden_resources_hide_on_creation:
|
224
|
+
bm.basic.hidden = True
|
225
|
+
|
218
226
|
|
219
227
|
def set_status(basic: Basic, item: CreateResourcePayload):
|
220
228
|
basic.metadata.status = Metadata.Status.PENDING
|
@@ -227,15 +235,11 @@ def set_status_modify(basic: Basic, item: UpdateResourcePayload):
|
|
227
235
|
def validate_classifications(paragraph: ParagraphAnnotation):
|
228
236
|
classifications = paragraph.classifications
|
229
237
|
if len(classifications) == 0:
|
230
|
-
raise HTTPException(
|
231
|
-
status_code=422, detail="ensure classifications has at least 1 items"
|
232
|
-
)
|
238
|
+
raise HTTPException(status_code=422, detail="ensure classifications has at least 1 items")
|
233
239
|
|
234
|
-
unique_classifications = {tuple(cf.
|
240
|
+
unique_classifications = {tuple(cf.model_dump().values()) for cf in classifications}
|
235
241
|
if len(unique_classifications) != len(classifications):
|
236
|
-
raise HTTPException(
|
237
|
-
status_code=422, detail="Paragraph classifications need to be unique"
|
238
|
-
)
|
242
|
+
raise HTTPException(status_code=422, detail="Paragraph classifications need to be unique")
|
239
243
|
|
240
244
|
|
241
245
|
def compute_title(item: CreateResourcePayload, rid: str) -> str:
|
@@ -273,9 +277,7 @@ def build_question_answer_annotation_pb(
|
|
273
277
|
pb.cancelled_by_user = qa_annotation.cancelled_by_user
|
274
278
|
pb.question_answer.question.text = qa_annotation.question_answer.question.text
|
275
279
|
if qa_annotation.question_answer.question.language is not None:
|
276
|
-
pb.question_answer.question.language = (
|
277
|
-
qa_annotation.question_answer.question.language
|
278
|
-
)
|
280
|
+
pb.question_answer.question.language = qa_annotation.question_answer.question.language
|
279
281
|
pb.question_answer.question.ids_paragraphs.extend(
|
280
282
|
qa_annotation.question_answer.question.ids_paragraphs
|
281
283
|
)
|
@@ -21,23 +21,23 @@ from datetime import datetime
|
|
21
21
|
from typing import Optional, Union
|
22
22
|
|
23
23
|
from google.protobuf.json_format import MessageToDict
|
24
|
-
from nucliadb_protos.writer_pb2 import BrokerMessage
|
25
24
|
|
26
25
|
import nucliadb_models as models
|
26
|
+
from nucliadb.common.models_utils import from_proto, to_proto
|
27
27
|
from nucliadb.ingest.fields.conversation import Conversation
|
28
28
|
from nucliadb.ingest.orm.resource import Resource as ORMResource
|
29
29
|
from nucliadb.ingest.processing import PushPayload
|
30
30
|
from nucliadb.writer import SERVICE_NAME
|
31
|
-
from nucliadb.writer.layouts import serialize_blocks
|
32
31
|
from nucliadb.writer.utilities import get_processing
|
33
|
-
from nucliadb_models.common import
|
32
|
+
from nucliadb_models.common import FieldTypeName
|
33
|
+
from nucliadb_models.content_types import GENERIC_MIME_TYPE
|
34
34
|
from nucliadb_models.conversation import PushConversation
|
35
35
|
from nucliadb_models.writer import (
|
36
|
-
GENERIC_MIME_TYPE,
|
37
36
|
CreateResourcePayload,
|
38
37
|
UpdateResourcePayload,
|
39
38
|
)
|
40
39
|
from nucliadb_protos import resources_pb2
|
40
|
+
from nucliadb_protos.writer_pb2 import BrokerMessage
|
41
41
|
from nucliadb_utils.storages.storage import StorageField
|
42
42
|
from nucliadb_utils.utilities import get_storage
|
43
43
|
|
@@ -50,6 +50,7 @@ async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
|
|
50
50
|
language=field_pb.language,
|
51
51
|
password=field_pb.password,
|
52
52
|
file=models.File(payload=None, uri=field_pb.file.uri),
|
53
|
+
extract_strategy=field_pb.extract_strategy,
|
53
54
|
)
|
54
55
|
return processing.convert_external_filefield_to_str(file_field)
|
55
56
|
else:
|
@@ -80,12 +81,11 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
80
81
|
storage = await get_storage(service_name=SERVICE_NAME)
|
81
82
|
await resource.get_fields()
|
82
83
|
for (field_type, field_id), field in resource.fields.items():
|
83
|
-
field_type_name =
|
84
|
+
field_type_name = from_proto.field_type_name(field_type)
|
84
85
|
|
85
86
|
if field_type_name not in {
|
86
87
|
FieldTypeName.TEXT,
|
87
88
|
FieldTypeName.FILE,
|
88
|
-
FieldTypeName.LAYOUT,
|
89
89
|
FieldTypeName.CONVERSATION,
|
90
90
|
FieldTypeName.LINK,
|
91
91
|
}:
|
@@ -114,28 +114,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
114
114
|
parsed_text["format"] = models.PushTextFormat[parsed_text["format"]]
|
115
115
|
toprocess.textfield[field_id] = models.Text(**parsed_text)
|
116
116
|
|
117
|
-
if field_type_name is FieldTypeName.LAYOUT:
|
118
|
-
parsed_layout = MessageToDict(
|
119
|
-
field_pb,
|
120
|
-
preserving_proto_field_name=True,
|
121
|
-
including_default_value_fields=True,
|
122
|
-
)
|
123
|
-
parsed_layout["format"] = resources_pb2.FieldLayout.Format.Value(
|
124
|
-
parsed_layout["format"]
|
125
|
-
)
|
126
|
-
|
127
|
-
for blockid, block in parsed_layout["body"]["blocks"].items():
|
128
|
-
cf = field_pb.body.blocks[blockid].file
|
129
|
-
block["file"] = await processing.convert_internal_cf_to_str(cf, storage)
|
130
|
-
|
131
|
-
parsed_layout["blocks"] = parsed_layout.get("body", {}).get("blocks", {})
|
132
|
-
del parsed_layout["body"]
|
133
|
-
|
134
|
-
toprocess.layoutfield[field_id] = models.LayoutDiff(**parsed_layout)
|
135
|
-
|
136
|
-
if field_type_name is FieldTypeName.CONVERSATION and isinstance(
|
137
|
-
field, Conversation
|
138
|
-
):
|
117
|
+
if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
|
139
118
|
metadata = await field.get_metadata()
|
140
119
|
if metadata.pages == 0:
|
141
120
|
continue
|
@@ -156,14 +135,13 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
156
135
|
await processing.convert_internal_cf_to_str(cf, storage)
|
157
136
|
for cf in message.content.attachments
|
158
137
|
]
|
159
|
-
parsed_message["content"]
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
full_conversation.messages.append(
|
165
|
-
models.PushMessage(**parsed_message)
|
138
|
+
if "attachments_fields" in parsed_message["content"]:
|
139
|
+
# Not defined on the push payload
|
140
|
+
del parsed_message["content"]["attachments_fields"]
|
141
|
+
parsed_message["content"]["format"] = resources_pb2.MessageContent.Format.Value(
|
142
|
+
parsed_message["content"]["format"]
|
166
143
|
)
|
144
|
+
full_conversation.messages.append(models.PushMessage(**parsed_message))
|
167
145
|
toprocess.conversationfield[field_id] = full_conversation
|
168
146
|
|
169
147
|
|
@@ -176,9 +154,7 @@ async def parse_fields(
|
|
176
154
|
x_skip_store: bool,
|
177
155
|
):
|
178
156
|
for key, file_field in item.files.items():
|
179
|
-
await parse_file_field(
|
180
|
-
key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store
|
181
|
-
)
|
157
|
+
await parse_file_field(key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store)
|
182
158
|
|
183
159
|
for key, link_field in item.links.items():
|
184
160
|
parse_link_field(key, link_field, writer, toprocess)
|
@@ -186,19 +162,8 @@ async def parse_fields(
|
|
186
162
|
for key, text_field in item.texts.items():
|
187
163
|
parse_text_field(key, text_field, writer, toprocess)
|
188
164
|
|
189
|
-
for key, layout_field in item.layouts.items():
|
190
|
-
await parse_layout_field(key, layout_field, writer, toprocess, kbid, uuid)
|
191
|
-
|
192
165
|
for key, conversation_field in item.conversations.items():
|
193
|
-
await parse_conversation_field(
|
194
|
-
key, conversation_field, writer, toprocess, kbid, uuid
|
195
|
-
)
|
196
|
-
|
197
|
-
for key, datetime_field in item.datetimes.items():
|
198
|
-
parse_datetime_field(key, datetime_field, writer, toprocess)
|
199
|
-
|
200
|
-
for key, keywordset_field in item.keywordsets.items():
|
201
|
-
parse_keywordset_field(key, keywordset_field, writer, toprocess)
|
166
|
+
await parse_conversation_field(key, conversation_field, writer, toprocess, kbid, uuid)
|
202
167
|
|
203
168
|
|
204
169
|
def parse_text_field(
|
@@ -207,10 +172,10 @@ def parse_text_field(
|
|
207
172
|
writer: BrokerMessage,
|
208
173
|
toprocess: PushPayload,
|
209
174
|
) -> None:
|
175
|
+
if text_field.extract_strategy is not None:
|
176
|
+
writer.texts[key].extract_strategy = text_field.extract_strategy
|
210
177
|
writer.texts[key].body = text_field.body
|
211
|
-
writer.texts[key].format = resources_pb2.FieldText.Format.Value(
|
212
|
-
text_field.format.value
|
213
|
-
)
|
178
|
+
writer.texts[key].format = resources_pb2.FieldText.Format.Value(text_field.format.value)
|
214
179
|
etw = resources_pb2.ExtractedTextWrapper()
|
215
180
|
etw.field.field = key
|
216
181
|
etw.field.field_type = resources_pb2.FieldType.TEXT
|
@@ -219,6 +184,7 @@ def parse_text_field(
|
|
219
184
|
toprocess.textfield[key] = models.Text(
|
220
185
|
body=text_field.body,
|
221
186
|
format=getattr(models.PushTextFormat, text_field.format.value),
|
187
|
+
extract_strategy=text_field.extract_strategy,
|
222
188
|
)
|
223
189
|
|
224
190
|
|
@@ -251,6 +217,8 @@ async def parse_internal_file_field(
|
|
251
217
|
writer.files[key].added.FromDatetime(datetime.now())
|
252
218
|
if file_field.language:
|
253
219
|
writer.files[key].language = file_field.language
|
220
|
+
if file_field.extract_strategy is not None:
|
221
|
+
writer.files[key].extract_strategy = file_field.extract_strategy
|
254
222
|
|
255
223
|
processing = get_processing()
|
256
224
|
|
@@ -286,6 +254,8 @@ def parse_external_file_field(
|
|
286
254
|
writer.files[key].added.FromDatetime(datetime.now())
|
287
255
|
if file_field.language:
|
288
256
|
writer.files[key].language = file_field.language
|
257
|
+
if file_field.extract_strategy is not None:
|
258
|
+
writer.files[key].extract_strategy = file_field.extract_strategy
|
289
259
|
uri = file_field.file.uri
|
290
260
|
writer.files[key].url = uri # type: ignore
|
291
261
|
writer.files[key].file.uri = uri # type: ignore
|
@@ -328,6 +298,9 @@ def parse_link_field(
|
|
328
298
|
if link_field.xpath is not None:
|
329
299
|
writer.links[key].xpath = link_field.xpath
|
330
300
|
|
301
|
+
if link_field.extract_strategy is not None:
|
302
|
+
writer.links[key].extract_strategy = link_field.extract_strategy
|
303
|
+
|
331
304
|
toprocess.linkfield[key] = models.LinkUpload(
|
332
305
|
link=link_field.uri,
|
333
306
|
headers=link_field.headers or {},
|
@@ -335,78 +308,7 @@ def parse_link_field(
|
|
335
308
|
localstorage=link_field.localstorage or {},
|
336
309
|
css_selector=link_field.css_selector,
|
337
310
|
xpath=link_field.xpath,
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
def parse_keywordset_field(
|
342
|
-
key: str,
|
343
|
-
keywordset_field: models.FieldKeywordset,
|
344
|
-
writer: BrokerMessage,
|
345
|
-
toprocess: PushPayload,
|
346
|
-
) -> None:
|
347
|
-
if keywordset_field.keywords is None:
|
348
|
-
return
|
349
|
-
|
350
|
-
for keyword in keywordset_field.keywords:
|
351
|
-
fieldpb = resources_pb2.Keyword()
|
352
|
-
fieldpb.value = keyword.value
|
353
|
-
writer.keywordsets[key].keywords.append(fieldpb)
|
354
|
-
|
355
|
-
|
356
|
-
def parse_datetime_field(
|
357
|
-
key: str,
|
358
|
-
datetime_field: models.FieldDatetime,
|
359
|
-
writer: BrokerMessage,
|
360
|
-
toprocess: PushPayload,
|
361
|
-
) -> None:
|
362
|
-
if datetime_field.value is None:
|
363
|
-
return
|
364
|
-
|
365
|
-
writer.datetimes[key].value.FromDatetime(datetime_field.value)
|
366
|
-
|
367
|
-
|
368
|
-
async def parse_layout_field(
|
369
|
-
key: str,
|
370
|
-
layout_field: models.InputLayoutField,
|
371
|
-
writer: BrokerMessage,
|
372
|
-
toprocess: PushPayload,
|
373
|
-
kbid: str,
|
374
|
-
uuid: str,
|
375
|
-
) -> None:
|
376
|
-
storage = await get_storage(service_name=SERVICE_NAME)
|
377
|
-
processing = get_processing()
|
378
|
-
|
379
|
-
lc: resources_pb2.FieldLayout = await serialize_blocks(
|
380
|
-
layout_field, kbid, uuid, key, storage
|
381
|
-
)
|
382
|
-
writer.layouts[key].CopyFrom(lc)
|
383
|
-
|
384
|
-
toprocess_blocks = {}
|
385
|
-
for blockid, block in layout_field.body.blocks.items():
|
386
|
-
sf_conv_field: StorageField = storage.layout_field(
|
387
|
-
kbid, uuid, field=key, ident=block.ident
|
388
|
-
)
|
389
|
-
cf_conv_field = await storage.upload_b64file_to_cloudfile(
|
390
|
-
sf_conv_field,
|
391
|
-
block.file.payload.encode(),
|
392
|
-
block.file.filename,
|
393
|
-
block.file.content_type,
|
394
|
-
block.file.md5,
|
395
|
-
)
|
396
|
-
|
397
|
-
toprocess_blocks[blockid] = models.PushLayoutBlock(
|
398
|
-
x=block.x,
|
399
|
-
y=block.y,
|
400
|
-
cols=block.cols,
|
401
|
-
rows=block.rows,
|
402
|
-
type=block.type,
|
403
|
-
ident=block.ident,
|
404
|
-
payload=block.payload,
|
405
|
-
file=await processing.convert_internal_cf_to_str(cf_conv_field, storage),
|
406
|
-
)
|
407
|
-
|
408
|
-
toprocess.layoutfield[key] = models.LayoutDiff(
|
409
|
-
format=lc.format, blocks=toprocess_blocks # type: ignore
|
311
|
+
extract_strategy=link_field.extract_strategy,
|
410
312
|
)
|
411
313
|
|
412
314
|
|
@@ -420,7 +322,6 @@ async def parse_conversation_field(
|
|
420
322
|
) -> None:
|
421
323
|
storage = await get_storage(service_name=SERVICE_NAME)
|
422
324
|
processing = get_processing()
|
423
|
-
|
424
325
|
field_value = resources_pb2.Conversation()
|
425
326
|
convs = models.PushConversation()
|
426
327
|
for message in conversation_field.messages:
|
@@ -441,8 +342,16 @@ async def parse_conversation_field(
|
|
441
342
|
)
|
442
343
|
|
443
344
|
cm.content.text = message.content.text
|
444
|
-
cm.content.format = resources_pb2.MessageContent.Format.Value(
|
445
|
-
|
345
|
+
cm.content.format = resources_pb2.MessageContent.Format.Value(message.content.format.value)
|
346
|
+
cm.content.attachments_fields.extend(
|
347
|
+
[
|
348
|
+
resources_pb2.FieldRef(
|
349
|
+
field_type=to_proto.field_type_name(attachment.field_type),
|
350
|
+
field_id=attachment.field_id,
|
351
|
+
split=attachment.split if attachment.split is not None else "",
|
352
|
+
)
|
353
|
+
for attachment in message.content.attachments_fields
|
354
|
+
]
|
446
355
|
)
|
447
356
|
|
448
357
|
for count, file in enumerate(message.content.attachments):
|