nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0003_allfields_key.py +1 -35
- migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
- migrations/0010_fix_corrupt_indexes.py +10 -10
- migrations/0011_materialize_labelset_ids.py +1 -16
- migrations/0012_rollover_shards.py +5 -10
- migrations/0014_rollover_shards.py +4 -5
- migrations/0015_targeted_rollover.py +5 -10
- migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
- migrations/0017_multiple_writable_shards.py +2 -4
- migrations/0018_purge_orphan_kbslugs.py +5 -7
- migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
- migrations/0020_drain_nodes_from_cluster.py +3 -3
- nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
- nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
- migrations/0023_backfill_pg_catalog.py +80 -0
- migrations/0025_assign_models_to_kbs_v2.py +113 -0
- migrations/0026_fix_high_cardinality_content_types.py +61 -0
- migrations/0027_rollover_texts3.py +73 -0
- nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
- migrations/pg/0002_catalog.py +42 -0
- nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
- nucliadb/common/cluster/base.py +30 -16
- nucliadb/common/cluster/discovery/base.py +6 -14
- nucliadb/common/cluster/discovery/k8s.py +9 -19
- nucliadb/common/cluster/discovery/manual.py +1 -3
- nucliadb/common/cluster/discovery/utils.py +1 -3
- nucliadb/common/cluster/grpc_node_dummy.py +3 -11
- nucliadb/common/cluster/index_node.py +10 -19
- nucliadb/common/cluster/manager.py +174 -59
- nucliadb/common/cluster/rebalance.py +27 -29
- nucliadb/common/cluster/rollover.py +353 -194
- nucliadb/common/cluster/settings.py +6 -0
- nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
- nucliadb/common/cluster/standalone/index_node.py +4 -11
- nucliadb/common/cluster/standalone/service.py +2 -6
- nucliadb/common/cluster/standalone/utils.py +2 -6
- nucliadb/common/cluster/utils.py +29 -22
- nucliadb/common/constants.py +20 -0
- nucliadb/common/context/__init__.py +3 -0
- nucliadb/common/context/fastapi.py +8 -5
- nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
- nucliadb/common/datamanagers/__init__.py +7 -1
- nucliadb/common/datamanagers/atomic.py +22 -4
- nucliadb/common/datamanagers/cluster.py +5 -5
- nucliadb/common/datamanagers/entities.py +6 -16
- nucliadb/common/datamanagers/fields.py +84 -0
- nucliadb/common/datamanagers/kb.py +83 -37
- nucliadb/common/datamanagers/labels.py +26 -56
- nucliadb/common/datamanagers/processing.py +2 -6
- nucliadb/common/datamanagers/resources.py +41 -103
- nucliadb/common/datamanagers/rollover.py +76 -15
- nucliadb/common/datamanagers/synonyms.py +1 -1
- nucliadb/common/datamanagers/utils.py +15 -6
- nucliadb/common/datamanagers/vectorsets.py +110 -0
- nucliadb/common/external_index_providers/base.py +257 -0
- nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
- nucliadb/common/external_index_providers/manager.py +101 -0
- nucliadb/common/external_index_providers/pinecone.py +933 -0
- nucliadb/common/external_index_providers/settings.py +52 -0
- nucliadb/common/http_clients/auth.py +3 -6
- nucliadb/common/http_clients/processing.py +6 -11
- nucliadb/common/http_clients/utils.py +1 -3
- nucliadb/common/ids.py +240 -0
- nucliadb/common/locking.py +29 -7
- nucliadb/common/maindb/driver.py +11 -35
- nucliadb/common/maindb/exceptions.py +3 -0
- nucliadb/common/maindb/local.py +22 -9
- nucliadb/common/maindb/pg.py +206 -111
- nucliadb/common/maindb/utils.py +11 -42
- nucliadb/common/models_utils/from_proto.py +479 -0
- nucliadb/common/models_utils/to_proto.py +60 -0
- nucliadb/common/nidx.py +260 -0
- nucliadb/export_import/datamanager.py +25 -19
- nucliadb/export_import/exporter.py +5 -11
- nucliadb/export_import/importer.py +5 -7
- nucliadb/export_import/models.py +3 -3
- nucliadb/export_import/tasks.py +4 -4
- nucliadb/export_import/utils.py +25 -37
- nucliadb/health.py +1 -3
- nucliadb/ingest/app.py +15 -11
- nucliadb/ingest/consumer/auditing.py +21 -19
- nucliadb/ingest/consumer/consumer.py +82 -47
- nucliadb/ingest/consumer/materializer.py +5 -12
- nucliadb/ingest/consumer/pull.py +12 -27
- nucliadb/ingest/consumer/service.py +19 -17
- nucliadb/ingest/consumer/shard_creator.py +2 -4
- nucliadb/ingest/consumer/utils.py +1 -3
- nucliadb/ingest/fields/base.py +137 -105
- nucliadb/ingest/fields/conversation.py +18 -5
- nucliadb/ingest/fields/exceptions.py +1 -4
- nucliadb/ingest/fields/file.py +7 -16
- nucliadb/ingest/fields/link.py +5 -10
- nucliadb/ingest/fields/text.py +9 -4
- nucliadb/ingest/orm/brain.py +200 -213
- nucliadb/ingest/orm/broker_message.py +181 -0
- nucliadb/ingest/orm/entities.py +36 -51
- nucliadb/ingest/orm/exceptions.py +12 -0
- nucliadb/ingest/orm/knowledgebox.py +322 -197
- nucliadb/ingest/orm/processor/__init__.py +2 -700
- nucliadb/ingest/orm/processor/auditing.py +4 -23
- nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
- nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
- nucliadb/ingest/orm/processor/processor.py +752 -0
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
- nucliadb/ingest/orm/resource.py +249 -402
- nucliadb/ingest/orm/utils.py +4 -4
- nucliadb/ingest/partitions.py +3 -9
- nucliadb/ingest/processing.py +64 -73
- nucliadb/ingest/py.typed +0 -0
- nucliadb/ingest/serialize.py +37 -167
- nucliadb/ingest/service/__init__.py +1 -3
- nucliadb/ingest/service/writer.py +185 -412
- nucliadb/ingest/settings.py +10 -20
- nucliadb/ingest/utils.py +3 -6
- nucliadb/learning_proxy.py +242 -55
- nucliadb/metrics_exporter.py +30 -19
- nucliadb/middleware/__init__.py +1 -3
- nucliadb/migrator/command.py +1 -3
- nucliadb/migrator/datamanager.py +13 -13
- nucliadb/migrator/migrator.py +47 -30
- nucliadb/migrator/utils.py +18 -10
- nucliadb/purge/__init__.py +139 -33
- nucliadb/purge/orphan_shards.py +7 -13
- nucliadb/reader/__init__.py +1 -3
- nucliadb/reader/api/models.py +1 -12
- nucliadb/reader/api/v1/__init__.py +0 -1
- nucliadb/reader/api/v1/download.py +21 -88
- nucliadb/reader/api/v1/export_import.py +1 -1
- nucliadb/reader/api/v1/knowledgebox.py +10 -10
- nucliadb/reader/api/v1/learning_config.py +2 -6
- nucliadb/reader/api/v1/resource.py +62 -88
- nucliadb/reader/api/v1/services.py +64 -83
- nucliadb/reader/app.py +12 -29
- nucliadb/reader/lifecycle.py +18 -4
- nucliadb/reader/py.typed +0 -0
- nucliadb/reader/reader/notifications.py +10 -28
- nucliadb/search/__init__.py +1 -3
- nucliadb/search/api/v1/__init__.py +1 -2
- nucliadb/search/api/v1/ask.py +17 -10
- nucliadb/search/api/v1/catalog.py +184 -0
- nucliadb/search/api/v1/feedback.py +16 -24
- nucliadb/search/api/v1/find.py +36 -36
- nucliadb/search/api/v1/knowledgebox.py +89 -60
- nucliadb/search/api/v1/resource/ask.py +2 -8
- nucliadb/search/api/v1/resource/search.py +49 -70
- nucliadb/search/api/v1/search.py +44 -210
- nucliadb/search/api/v1/suggest.py +39 -54
- nucliadb/search/app.py +12 -32
- nucliadb/search/lifecycle.py +10 -3
- nucliadb/search/predict.py +136 -187
- nucliadb/search/py.typed +0 -0
- nucliadb/search/requesters/utils.py +25 -58
- nucliadb/search/search/cache.py +149 -20
- nucliadb/search/search/chat/ask.py +571 -123
- nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
- nucliadb/search/search/chat/images.py +41 -17
- nucliadb/search/search/chat/prompt.py +817 -266
- nucliadb/search/search/chat/query.py +213 -309
- nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
- nucliadb/search/search/fetch.py +43 -36
- nucliadb/search/search/filters.py +9 -15
- nucliadb/search/search/find.py +214 -53
- nucliadb/search/search/find_merge.py +408 -391
- nucliadb/search/search/hydrator.py +191 -0
- nucliadb/search/search/merge.py +187 -223
- nucliadb/search/search/metrics.py +73 -2
- nucliadb/search/search/paragraphs.py +64 -106
- nucliadb/search/search/pgcatalog.py +233 -0
- nucliadb/search/search/predict_proxy.py +1 -1
- nucliadb/search/search/query.py +305 -150
- nucliadb/search/search/query_parser/exceptions.py +22 -0
- nucliadb/search/search/query_parser/models.py +101 -0
- nucliadb/search/search/query_parser/parser.py +183 -0
- nucliadb/search/search/rank_fusion.py +204 -0
- nucliadb/search/search/rerankers.py +270 -0
- nucliadb/search/search/shards.py +3 -32
- nucliadb/search/search/summarize.py +7 -18
- nucliadb/search/search/utils.py +27 -4
- nucliadb/search/settings.py +15 -1
- nucliadb/standalone/api_router.py +4 -10
- nucliadb/standalone/app.py +8 -14
- nucliadb/standalone/auth.py +7 -21
- nucliadb/standalone/config.py +7 -10
- nucliadb/standalone/lifecycle.py +26 -25
- nucliadb/standalone/migrations.py +1 -3
- nucliadb/standalone/purge.py +1 -1
- nucliadb/standalone/py.typed +0 -0
- nucliadb/standalone/run.py +3 -6
- nucliadb/standalone/settings.py +9 -16
- nucliadb/standalone/versions.py +15 -5
- nucliadb/tasks/consumer.py +8 -12
- nucliadb/tasks/producer.py +7 -6
- nucliadb/tests/config.py +53 -0
- nucliadb/train/__init__.py +1 -3
- nucliadb/train/api/utils.py +1 -2
- nucliadb/train/api/v1/shards.py +1 -1
- nucliadb/train/api/v1/trainset.py +2 -4
- nucliadb/train/app.py +10 -31
- nucliadb/train/generator.py +10 -19
- nucliadb/train/generators/field_classifier.py +7 -19
- nucliadb/train/generators/field_streaming.py +156 -0
- nucliadb/train/generators/image_classifier.py +12 -18
- nucliadb/train/generators/paragraph_classifier.py +5 -9
- nucliadb/train/generators/paragraph_streaming.py +6 -9
- nucliadb/train/generators/question_answer_streaming.py +19 -20
- nucliadb/train/generators/sentence_classifier.py +9 -15
- nucliadb/train/generators/token_classifier.py +48 -39
- nucliadb/train/generators/utils.py +14 -18
- nucliadb/train/lifecycle.py +7 -3
- nucliadb/train/nodes.py +23 -32
- nucliadb/train/py.typed +0 -0
- nucliadb/train/servicer.py +13 -21
- nucliadb/train/settings.py +2 -6
- nucliadb/train/types.py +13 -10
- nucliadb/train/upload.py +3 -6
- nucliadb/train/uploader.py +19 -23
- nucliadb/train/utils.py +1 -1
- nucliadb/writer/__init__.py +1 -3
- nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
- nucliadb/writer/api/v1/export_import.py +67 -14
- nucliadb/writer/api/v1/field.py +16 -269
- nucliadb/writer/api/v1/knowledgebox.py +218 -68
- nucliadb/writer/api/v1/resource.py +68 -88
- nucliadb/writer/api/v1/services.py +51 -70
- nucliadb/writer/api/v1/slug.py +61 -0
- nucliadb/writer/api/v1/transaction.py +67 -0
- nucliadb/writer/api/v1/upload.py +114 -113
- nucliadb/writer/app.py +6 -43
- nucliadb/writer/back_pressure.py +16 -38
- nucliadb/writer/exceptions.py +0 -4
- nucliadb/writer/lifecycle.py +21 -15
- nucliadb/writer/py.typed +0 -0
- nucliadb/writer/resource/audit.py +2 -1
- nucliadb/writer/resource/basic.py +48 -46
- nucliadb/writer/resource/field.py +25 -127
- nucliadb/writer/resource/origin.py +1 -2
- nucliadb/writer/settings.py +6 -2
- nucliadb/writer/tus/__init__.py +17 -15
- nucliadb/writer/tus/azure.py +111 -0
- nucliadb/writer/tus/dm.py +17 -5
- nucliadb/writer/tus/exceptions.py +1 -3
- nucliadb/writer/tus/gcs.py +49 -84
- nucliadb/writer/tus/local.py +21 -37
- nucliadb/writer/tus/s3.py +28 -68
- nucliadb/writer/tus/storage.py +5 -56
- nucliadb/writer/vectorsets.py +125 -0
- nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
- nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
- nucliadb/common/maindb/redis.py +0 -194
- nucliadb/common/maindb/tikv.py +0 -433
- nucliadb/ingest/fields/layout.py +0 -58
- nucliadb/ingest/tests/conftest.py +0 -30
- nucliadb/ingest/tests/fixtures.py +0 -764
- nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
- nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
- nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
- nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
- nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
- nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
- nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
- nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
- nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
- nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
- nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
- nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
- nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
- nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
- nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
- nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
- nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
- nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
- nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
- nucliadb/ingest/tests/unit/test_cache.py +0 -31
- nucliadb/ingest/tests/unit/test_partitions.py +0 -40
- nucliadb/ingest/tests/unit/test_processing.py +0 -171
- nucliadb/middleware/transaction.py +0 -117
- nucliadb/reader/api/v1/learning_collector.py +0 -63
- nucliadb/reader/tests/__init__.py +0 -19
- nucliadb/reader/tests/conftest.py +0 -31
- nucliadb/reader/tests/fixtures.py +0 -136
- nucliadb/reader/tests/test_list_resources.py +0 -75
- nucliadb/reader/tests/test_reader_file_download.py +0 -273
- nucliadb/reader/tests/test_reader_resource.py +0 -353
- nucliadb/reader/tests/test_reader_resource_field.py +0 -219
- nucliadb/search/api/v1/chat.py +0 -263
- nucliadb/search/api/v1/resource/chat.py +0 -174
- nucliadb/search/tests/__init__.py +0 -19
- nucliadb/search/tests/conftest.py +0 -33
- nucliadb/search/tests/fixtures.py +0 -199
- nucliadb/search/tests/node.py +0 -466
- nucliadb/search/tests/unit/__init__.py +0 -18
- nucliadb/search/tests/unit/api/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
- nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
- nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
- nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
- nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
- nucliadb/search/tests/unit/search/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
- nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
- nucliadb/search/tests/unit/search/search/__init__.py +0 -19
- nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
- nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
- nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
- nucliadb/search/tests/unit/search/test_fetch.py +0 -108
- nucliadb/search/tests/unit/search/test_filters.py +0 -125
- nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
- nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
- nucliadb/search/tests/unit/search/test_query.py +0 -153
- nucliadb/search/tests/unit/test_app.py +0 -79
- nucliadb/search/tests/unit/test_find_merge.py +0 -112
- nucliadb/search/tests/unit/test_merge.py +0 -34
- nucliadb/search/tests/unit/test_predict.py +0 -525
- nucliadb/standalone/tests/__init__.py +0 -19
- nucliadb/standalone/tests/conftest.py +0 -33
- nucliadb/standalone/tests/fixtures.py +0 -38
- nucliadb/standalone/tests/unit/__init__.py +0 -18
- nucliadb/standalone/tests/unit/test_api_router.py +0 -61
- nucliadb/standalone/tests/unit/test_auth.py +0 -169
- nucliadb/standalone/tests/unit/test_introspect.py +0 -35
- nucliadb/standalone/tests/unit/test_migrations.py +0 -63
- nucliadb/standalone/tests/unit/test_versions.py +0 -68
- nucliadb/tests/benchmarks/__init__.py +0 -19
- nucliadb/tests/benchmarks/test_search.py +0 -99
- nucliadb/tests/conftest.py +0 -32
- nucliadb/tests/fixtures.py +0 -735
- nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
- nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
- nucliadb/tests/migrations/test_migration_0017.py +0 -76
- nucliadb/tests/migrations/test_migration_0018.py +0 -95
- nucliadb/tests/tikv.py +0 -240
- nucliadb/tests/unit/__init__.py +0 -19
- nucliadb/tests/unit/common/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
- nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
- nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
- nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
- nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
- nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
- nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
- nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
- nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
- nucliadb/tests/unit/common/maindb/__init__.py +0 -18
- nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
- nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
- nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
- nucliadb/tests/unit/common/test_context.py +0 -36
- nucliadb/tests/unit/export_import/__init__.py +0 -19
- nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
- nucliadb/tests/unit/export_import/test_utils.py +0 -301
- nucliadb/tests/unit/migrator/__init__.py +0 -19
- nucliadb/tests/unit/migrator/test_migrator.py +0 -87
- nucliadb/tests/unit/tasks/__init__.py +0 -19
- nucliadb/tests/unit/tasks/conftest.py +0 -42
- nucliadb/tests/unit/tasks/test_consumer.py +0 -92
- nucliadb/tests/unit/tasks/test_producer.py +0 -95
- nucliadb/tests/unit/tasks/test_tasks.py +0 -58
- nucliadb/tests/unit/test_field_ids.py +0 -49
- nucliadb/tests/unit/test_health.py +0 -86
- nucliadb/tests/unit/test_kb_slugs.py +0 -54
- nucliadb/tests/unit/test_learning_proxy.py +0 -252
- nucliadb/tests/unit/test_metrics_exporter.py +0 -77
- nucliadb/tests/unit/test_purge.py +0 -136
- nucliadb/tests/utils/__init__.py +0 -74
- nucliadb/tests/utils/aiohttp_session.py +0 -44
- nucliadb/tests/utils/broker_messages/__init__.py +0 -171
- nucliadb/tests/utils/broker_messages/fields.py +0 -197
- nucliadb/tests/utils/broker_messages/helpers.py +0 -33
- nucliadb/tests/utils/entities.py +0 -78
- nucliadb/train/api/v1/check.py +0 -60
- nucliadb/train/tests/__init__.py +0 -19
- nucliadb/train/tests/conftest.py +0 -29
- nucliadb/train/tests/fixtures.py +0 -342
- nucliadb/train/tests/test_field_classification.py +0 -122
- nucliadb/train/tests/test_get_entities.py +0 -80
- nucliadb/train/tests/test_get_info.py +0 -51
- nucliadb/train/tests/test_get_ontology.py +0 -34
- nucliadb/train/tests/test_get_ontology_count.py +0 -63
- nucliadb/train/tests/test_image_classification.py +0 -221
- nucliadb/train/tests/test_list_fields.py +0 -39
- nucliadb/train/tests/test_list_paragraphs.py +0 -73
- nucliadb/train/tests/test_list_resources.py +0 -39
- nucliadb/train/tests/test_list_sentences.py +0 -71
- nucliadb/train/tests/test_paragraph_classification.py +0 -123
- nucliadb/train/tests/test_paragraph_streaming.py +0 -118
- nucliadb/train/tests/test_question_answer_streaming.py +0 -239
- nucliadb/train/tests/test_sentence_classification.py +0 -143
- nucliadb/train/tests/test_token_classification.py +0 -136
- nucliadb/train/tests/utils.py +0 -101
- nucliadb/writer/layouts/__init__.py +0 -51
- nucliadb/writer/layouts/v1.py +0 -59
- nucliadb/writer/tests/__init__.py +0 -19
- nucliadb/writer/tests/conftest.py +0 -31
- nucliadb/writer/tests/fixtures.py +0 -191
- nucliadb/writer/tests/test_fields.py +0 -475
- nucliadb/writer/tests/test_files.py +0 -740
- nucliadb/writer/tests/test_knowledgebox.py +0 -49
- nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
- nucliadb/writer/tests/test_resources.py +0 -476
- nucliadb/writer/tests/test_service.py +0 -137
- nucliadb/writer/tests/test_tus.py +0 -203
- nucliadb/writer/tests/utils.py +0 -35
- nucliadb/writer/tus/pg.py +0 -125
- nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
- nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
- {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
- /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
- /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
- /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
- {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -17,12 +17,11 @@
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
19
|
#
|
20
|
-
from nucliadb_protos.resources_pb2 import FieldType
|
21
|
-
|
22
20
|
from nucliadb.common.maindb.driver import Driver
|
23
21
|
from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
|
24
22
|
from nucliadb.ingest.orm.resource import Resource
|
25
23
|
from nucliadb_protos import audit_pb2, writer_pb2
|
24
|
+
from nucliadb_protos.resources_pb2 import FieldType
|
26
25
|
from nucliadb_utils.storages.storage import Storage
|
27
26
|
|
28
27
|
|
@@ -35,7 +34,7 @@ async def collect_audit_fields(
|
|
35
34
|
return []
|
36
35
|
|
37
36
|
audit_storage_fields: list[audit_pb2.AuditField] = []
|
38
|
-
async with driver.transaction() as txn:
|
37
|
+
async with driver.transaction(read_only=True) as txn:
|
39
38
|
kb = KnowledgeBox(txn, storage, message.kbid)
|
40
39
|
resource = Resource(txn, storage, kb, message.uuid)
|
41
40
|
field_keys = await resource.get_fields_ids()
|
@@ -55,9 +54,7 @@ async def collect_audit_fields(
|
|
55
54
|
audit_storage_fields.append(auditfield)
|
56
55
|
|
57
56
|
for fieldid in message.delete_fields or []:
|
58
|
-
field = await resource.get_field(
|
59
|
-
fieldid.field, writer_pb2.FieldType.FILE, load=True
|
60
|
-
)
|
57
|
+
field = await resource.get_field(fieldid.field, writer_pb2.FieldType.FILE, load=True)
|
61
58
|
audit_field = audit_pb2.AuditField()
|
62
59
|
audit_field.action = audit_pb2.AuditField.FieldAction.DELETED
|
63
60
|
audit_field.field_id = fieldid.field
|
@@ -94,26 +91,11 @@ def iterate_auditable_fields(
|
|
94
91
|
yield key
|
95
92
|
yielded.add(key)
|
96
93
|
|
97
|
-
for field_id in message.layouts.keys():
|
98
|
-
key = (field_id, writer_pb2.FieldType.LAYOUT)
|
99
|
-
yield key
|
100
|
-
yielded.add(key)
|
101
|
-
|
102
94
|
for field_id in message.texts.keys():
|
103
95
|
key = (field_id, writer_pb2.FieldType.TEXT)
|
104
96
|
yield key
|
105
97
|
yielded.add(key)
|
106
98
|
|
107
|
-
for field_id in message.keywordsets.keys():
|
108
|
-
key = (field_id, writer_pb2.FieldType.KEYWORDSET)
|
109
|
-
yield key
|
110
|
-
yielded.add(key)
|
111
|
-
|
112
|
-
for field_id in message.datetimes.keys():
|
113
|
-
key = (field_id, writer_pb2.FieldType.DATETIME)
|
114
|
-
yield key
|
115
|
-
yielded.add(key)
|
116
|
-
|
117
99
|
for field_id in message.links.keys():
|
118
100
|
key = (field_id, writer_pb2.FieldType.LINK)
|
119
101
|
yield key
|
@@ -124,8 +106,7 @@ def iterate_auditable_fields(
|
|
124
106
|
continue
|
125
107
|
|
126
108
|
if not (
|
127
|
-
field_id in message.files
|
128
|
-
or message.type is writer_pb2.BrokerMessage.MessageType.DELETE
|
109
|
+
field_id in message.files or message.type is writer_pb2.BrokerMessage.MessageType.DELETE
|
129
110
|
):
|
130
111
|
continue
|
131
112
|
|
@@ -0,0 +1,164 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
import logging
|
22
|
+
from dataclasses import dataclass, field
|
23
|
+
from typing import Optional, Sequence
|
24
|
+
|
25
|
+
from nucliadb.ingest.orm.resource import Resource
|
26
|
+
from nucliadb.ingest.processing import ProcessingEngine, PushPayload, Source
|
27
|
+
from nucliadb_models.text import PushTextFormat, Text
|
28
|
+
from nucliadb_protos import resources_pb2, writer_pb2
|
29
|
+
from nucliadb_protos.resources_pb2 import FieldID, FieldType
|
30
|
+
from nucliadb_utils.utilities import Utility, get_partitioning, get_utility
|
31
|
+
|
32
|
+
logger = logging.getLogger("ingest-processor")
|
33
|
+
|
34
|
+
|
35
|
+
@dataclass
|
36
|
+
class GeneratedFields:
|
37
|
+
texts: list[str] = field(default_factory=list)
|
38
|
+
links: list[str] = field(default_factory=list)
|
39
|
+
files: list[str] = field(default_factory=list)
|
40
|
+
conversations: list[str] = field(default_factory=list)
|
41
|
+
|
42
|
+
def is_not_empty(self) -> bool:
|
43
|
+
return (len(self.texts) + len(self.links) + len(self.files) + len(self.conversations)) > 0
|
44
|
+
|
45
|
+
|
46
|
+
async def get_generated_fields(bm: writer_pb2.BrokerMessage, resource: Resource) -> GeneratedFields:
|
47
|
+
"""Processing can send messages with generated fields. Those can be
|
48
|
+
generated with a data augmentation task and, as learning can't queue it to
|
49
|
+
process, nucliadb is responsible to send the generated field to process (and
|
50
|
+
ingest the processed thing later).
|
51
|
+
|
52
|
+
Given a broker message and a resource, this function returns the list of
|
53
|
+
generated fields, that can be empty.
|
54
|
+
|
55
|
+
"""
|
56
|
+
generated_fields = GeneratedFields()
|
57
|
+
|
58
|
+
# only messages from processor can add generated fields
|
59
|
+
if bm.source != writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
|
60
|
+
return generated_fields
|
61
|
+
|
62
|
+
# search all fields
|
63
|
+
|
64
|
+
all_fields = await resource.get_all_field_ids(for_update=False)
|
65
|
+
fields: Sequence[FieldID]
|
66
|
+
if all_fields is None:
|
67
|
+
fields = []
|
68
|
+
else:
|
69
|
+
fields = all_fields.fields
|
70
|
+
|
71
|
+
for field_id in bm.texts:
|
72
|
+
field = FieldID(field_type=FieldType.TEXT, field=field_id)
|
73
|
+
if field not in fields:
|
74
|
+
generated_fields.texts.append(field_id)
|
75
|
+
|
76
|
+
for field_id in bm.links:
|
77
|
+
field = FieldID(field_type=FieldType.LINK, field=field_id)
|
78
|
+
if field not in fields:
|
79
|
+
generated_fields.links.append(field_id)
|
80
|
+
|
81
|
+
for field_id in bm.files:
|
82
|
+
field = FieldID(field_type=FieldType.FILE, field=field_id)
|
83
|
+
if field not in fields:
|
84
|
+
generated_fields.files.append(field_id)
|
85
|
+
|
86
|
+
for field_id in bm.conversations:
|
87
|
+
field = FieldID(field_type=FieldType.CONVERSATION, field=field_id)
|
88
|
+
if field not in fields:
|
89
|
+
generated_fields.conversations.append(field_id)
|
90
|
+
|
91
|
+
return generated_fields
|
92
|
+
|
93
|
+
|
94
|
+
async def send_generated_fields_to_process(
|
95
|
+
kbid: str,
|
96
|
+
resource: Resource,
|
97
|
+
generated_fields: GeneratedFields,
|
98
|
+
bm: writer_pb2.BrokerMessage,
|
99
|
+
):
|
100
|
+
partitioning = get_partitioning()
|
101
|
+
partition = partitioning.generate_partition(kbid, resource.uuid)
|
102
|
+
|
103
|
+
processing: ProcessingEngine = get_utility(Utility.PROCESSING)
|
104
|
+
|
105
|
+
payload = _generate_processing_payload_for_fields(kbid, resource.uuid, generated_fields, bm)
|
106
|
+
if payload is not None:
|
107
|
+
processing_info = await processing.send_to_process(payload, partition)
|
108
|
+
logger.info(
|
109
|
+
"Sent generated fields to process",
|
110
|
+
extra={"processing_info": processing_info},
|
111
|
+
)
|
112
|
+
|
113
|
+
|
114
|
+
def _generate_processing_payload_for_fields(
|
115
|
+
kbid: str,
|
116
|
+
rid: str,
|
117
|
+
fields: GeneratedFields,
|
118
|
+
bm: writer_pb2.BrokerMessage,
|
119
|
+
) -> Optional[PushPayload]:
|
120
|
+
partitioning = get_partitioning()
|
121
|
+
partition = partitioning.generate_partition(kbid, rid)
|
122
|
+
|
123
|
+
payload = PushPayload(kbid=kbid, uuid=rid, userid="nucliadb-ingest", partition=partition)
|
124
|
+
|
125
|
+
payload.kbid = bm.kbid
|
126
|
+
payload.uuid = rid
|
127
|
+
payload.source = Source.INGEST
|
128
|
+
payload.slug = bm.slug
|
129
|
+
|
130
|
+
# populate generated fields
|
131
|
+
|
132
|
+
for text in fields.texts:
|
133
|
+
payload.textfield[text] = _bm_text_field_to_processing(bm.texts[text])
|
134
|
+
|
135
|
+
for file in fields.files:
|
136
|
+
logger.warning(
|
137
|
+
"Ingest received a broker message from processor with a new file field! Skipping",
|
138
|
+
extra={"kbid": kbid, "rid": rid, "field_id": file},
|
139
|
+
)
|
140
|
+
pass
|
141
|
+
|
142
|
+
for link in fields.links:
|
143
|
+
logger.warning(
|
144
|
+
"Ingest received a broker message from processor with a new link field!",
|
145
|
+
extra={"kbid": kbid, "rid": rid, "field_id": link},
|
146
|
+
)
|
147
|
+
pass
|
148
|
+
|
149
|
+
for conversation in fields.conversations:
|
150
|
+
logger.warning(
|
151
|
+
"Ingest received a broker message from processor with a new conversation field! Skipping",
|
152
|
+
extra={"kbid": kbid, "rid": rid, "field_id": conversation},
|
153
|
+
)
|
154
|
+
pass
|
155
|
+
|
156
|
+
if len(fields.texts) > 0:
|
157
|
+
return payload
|
158
|
+
else:
|
159
|
+
# we don't want to send weird empty messages to processing
|
160
|
+
return None
|
161
|
+
|
162
|
+
|
163
|
+
def _bm_text_field_to_processing(text_field: resources_pb2.FieldText) -> Text:
|
164
|
+
return Text(body=text_field.body, format=PushTextFormat(text_field.format))
|
@@ -0,0 +1,84 @@
|
|
1
|
+
# Copyright (C) 2021 Bosutech XXI S.L.
|
2
|
+
#
|
3
|
+
# nucliadb is offered under the AGPL v3.0 and as commercial software.
|
4
|
+
# For commercial licensing, contact us at info@nuclia.com.
|
5
|
+
#
|
6
|
+
# AGPL:
|
7
|
+
# This program is free software: you can redistribute it and/or modify
|
8
|
+
# it under the terms of the GNU Affero General Public License as
|
9
|
+
# published by the Free Software Foundation, either version 3 of the
|
10
|
+
# License, or (at your option) any later version.
|
11
|
+
#
|
12
|
+
# This program is distributed in the hope that it will be useful,
|
13
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
14
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
15
|
+
# GNU Affero General Public License for more details.
|
16
|
+
#
|
17
|
+
# You should have received a copy of the GNU Affero General Public License
|
18
|
+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
19
|
+
#
|
20
|
+
|
21
|
+
from typing import cast
|
22
|
+
|
23
|
+
from nucliadb.common.maindb.driver import Transaction
|
24
|
+
from nucliadb.common.maindb.pg import PGDriver, PGTransaction
|
25
|
+
from nucliadb.common.maindb.utils import get_driver
|
26
|
+
from nucliadb_telemetry import metrics
|
27
|
+
|
28
|
+
from ..resource import Resource
|
29
|
+
|
30
|
+
observer = metrics.Observer("pg_catalog_write", labels={"type": ""})
|
31
|
+
|
32
|
+
|
33
|
+
def _pg_transaction(txn: Transaction) -> PGTransaction:
|
34
|
+
return cast(PGTransaction, txn)
|
35
|
+
|
36
|
+
|
37
|
+
def pgcatalog_enabled(kbid):
|
38
|
+
return isinstance(get_driver(), PGDriver)
|
39
|
+
|
40
|
+
|
41
|
+
@observer.wrap({"type": "update"})
|
42
|
+
async def pgcatalog_update(txn: Transaction, kbid: str, resource: Resource):
|
43
|
+
if not pgcatalog_enabled(kbid):
|
44
|
+
return
|
45
|
+
|
46
|
+
if resource.basic is None:
|
47
|
+
raise ValueError("Cannot index into the catalog a resource without basic metadata ")
|
48
|
+
|
49
|
+
created_at = resource.basic.created.ToDatetime()
|
50
|
+
modified_at = resource.basic.modified.ToDatetime()
|
51
|
+
if modified_at < created_at:
|
52
|
+
modified_at = created_at
|
53
|
+
|
54
|
+
async with _pg_transaction(txn).connection.cursor() as cur:
|
55
|
+
await cur.execute(
|
56
|
+
"""
|
57
|
+
INSERT INTO catalog
|
58
|
+
(kbid, rid, title, created_at, modified_at, labels)
|
59
|
+
VALUES
|
60
|
+
(%(kbid)s, %(rid)s, %(title)s, %(created_at)s, %(modified_at)s, %(labels)s)
|
61
|
+
ON CONFLICT (kbid, rid) DO UPDATE SET
|
62
|
+
title = excluded.title,
|
63
|
+
created_at = excluded.created_at,
|
64
|
+
modified_at = excluded.modified_at,
|
65
|
+
labels = excluded.labels""",
|
66
|
+
{
|
67
|
+
"kbid": resource.kb.kbid,
|
68
|
+
"rid": resource.uuid,
|
69
|
+
"title": resource.basic.title,
|
70
|
+
"created_at": created_at,
|
71
|
+
"modified_at": modified_at,
|
72
|
+
"labels": list(resource.indexer.brain.labels),
|
73
|
+
},
|
74
|
+
)
|
75
|
+
|
76
|
+
|
77
|
+
@observer.wrap({"type": "delete"})
|
78
|
+
async def pgcatalog_delete(txn: Transaction, kbid: str, rid: str):
|
79
|
+
if not pgcatalog_enabled(kbid):
|
80
|
+
return
|
81
|
+
async with _pg_transaction(txn).connection.cursor() as cur:
|
82
|
+
await cur.execute(
|
83
|
+
"DELETE FROM catalog where kbid = %(kbid)s AND rid = %(rid)s", {"kbid": kbid, "rid": rid}
|
84
|
+
)
|