PyPI - nucliadb - Versions diffs - 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl - Mend

nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (418) hide show

migrations/0003_allfields_key.py +1 -35
migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
migrations/0010_fix_corrupt_indexes.py +10 -10
migrations/0011_materialize_labelset_ids.py +1 -16
migrations/0012_rollover_shards.py +5 -10
migrations/0014_rollover_shards.py +4 -5
migrations/0015_targeted_rollover.py +5 -10
migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
migrations/0017_multiple_writable_shards.py +2 -4
migrations/0018_purge_orphan_kbslugs.py +5 -7
migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
migrations/0020_drain_nodes_from_cluster.py +3 -3
nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
migrations/0023_backfill_pg_catalog.py +80 -0
migrations/0025_assign_models_to_kbs_v2.py +113 -0
migrations/0026_fix_high_cardinality_content_types.py +61 -0
migrations/0027_rollover_texts3.py +73 -0
nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
migrations/pg/0002_catalog.py +42 -0
nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
nucliadb/common/cluster/base.py +30 -16
nucliadb/common/cluster/discovery/base.py +6 -14
nucliadb/common/cluster/discovery/k8s.py +9 -19
nucliadb/common/cluster/discovery/manual.py +1 -3
nucliadb/common/cluster/discovery/utils.py +1 -3
nucliadb/common/cluster/grpc_node_dummy.py +3 -11
nucliadb/common/cluster/index_node.py +10 -19
nucliadb/common/cluster/manager.py +174 -59
nucliadb/common/cluster/rebalance.py +27 -29
nucliadb/common/cluster/rollover.py +353 -194
nucliadb/common/cluster/settings.py +6 -0
nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
nucliadb/common/cluster/standalone/index_node.py +4 -11
nucliadb/common/cluster/standalone/service.py +2 -6
nucliadb/common/cluster/standalone/utils.py +2 -6
nucliadb/common/cluster/utils.py +29 -22
nucliadb/common/constants.py +20 -0
nucliadb/common/context/__init__.py +3 -0
nucliadb/common/context/fastapi.py +8 -5
nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
nucliadb/common/datamanagers/__init__.py +7 -1
nucliadb/common/datamanagers/atomic.py +22 -4
nucliadb/common/datamanagers/cluster.py +5 -5
nucliadb/common/datamanagers/entities.py +6 -16
nucliadb/common/datamanagers/fields.py +84 -0
nucliadb/common/datamanagers/kb.py +83 -37
nucliadb/common/datamanagers/labels.py +26 -56
nucliadb/common/datamanagers/processing.py +2 -6
nucliadb/common/datamanagers/resources.py +41 -103
nucliadb/common/datamanagers/rollover.py +76 -15
nucliadb/common/datamanagers/synonyms.py +1 -1
nucliadb/common/datamanagers/utils.py +15 -6
nucliadb/common/datamanagers/vectorsets.py +110 -0
nucliadb/common/external_index_providers/base.py +257 -0
nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
nucliadb/common/external_index_providers/manager.py +101 -0
nucliadb/common/external_index_providers/pinecone.py +933 -0
nucliadb/common/external_index_providers/settings.py +52 -0
nucliadb/common/http_clients/auth.py +3 -6
nucliadb/common/http_clients/processing.py +6 -11
nucliadb/common/http_clients/utils.py +1 -3
nucliadb/common/ids.py +240 -0
nucliadb/common/locking.py +29 -7
nucliadb/common/maindb/driver.py +11 -35
nucliadb/common/maindb/exceptions.py +3 -0
nucliadb/common/maindb/local.py +22 -9
nucliadb/common/maindb/pg.py +206 -111
nucliadb/common/maindb/utils.py +11 -42
nucliadb/common/models_utils/from_proto.py +479 -0
nucliadb/common/models_utils/to_proto.py +60 -0
nucliadb/common/nidx.py +260 -0
nucliadb/export_import/datamanager.py +25 -19
nucliadb/export_import/exporter.py +5 -11
nucliadb/export_import/importer.py +5 -7
nucliadb/export_import/models.py +3 -3
nucliadb/export_import/tasks.py +4 -4
nucliadb/export_import/utils.py +25 -37
nucliadb/health.py +1 -3
nucliadb/ingest/app.py +15 -11
nucliadb/ingest/consumer/auditing.py +21 -19
nucliadb/ingest/consumer/consumer.py +82 -47
nucliadb/ingest/consumer/materializer.py +5 -12
nucliadb/ingest/consumer/pull.py +12 -27
nucliadb/ingest/consumer/service.py +19 -17
nucliadb/ingest/consumer/shard_creator.py +2 -4
nucliadb/ingest/consumer/utils.py +1 -3
nucliadb/ingest/fields/base.py +137 -105
nucliadb/ingest/fields/conversation.py +18 -5
nucliadb/ingest/fields/exceptions.py +1 -4
nucliadb/ingest/fields/file.py +7 -16
nucliadb/ingest/fields/link.py +5 -10
nucliadb/ingest/fields/text.py +9 -4
nucliadb/ingest/orm/brain.py +200 -213
nucliadb/ingest/orm/broker_message.py +181 -0
nucliadb/ingest/orm/entities.py +36 -51
nucliadb/ingest/orm/exceptions.py +12 -0
nucliadb/ingest/orm/knowledgebox.py +322 -197
nucliadb/ingest/orm/processor/__init__.py +2 -700
nucliadb/ingest/orm/processor/auditing.py +4 -23
nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
nucliadb/ingest/orm/processor/processor.py +752 -0
nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
nucliadb/ingest/orm/resource.py +249 -402
nucliadb/ingest/orm/utils.py +4 -4
nucliadb/ingest/partitions.py +3 -9
nucliadb/ingest/processing.py +64 -73
nucliadb/ingest/py.typed +0 -0
nucliadb/ingest/serialize.py +37 -167
nucliadb/ingest/service/__init__.py +1 -3
nucliadb/ingest/service/writer.py +185 -412
nucliadb/ingest/settings.py +10 -20
nucliadb/ingest/utils.py +3 -6
nucliadb/learning_proxy.py +242 -55
nucliadb/metrics_exporter.py +30 -19
nucliadb/middleware/__init__.py +1 -3
nucliadb/migrator/command.py +1 -3
nucliadb/migrator/datamanager.py +13 -13
nucliadb/migrator/migrator.py +47 -30
nucliadb/migrator/utils.py +18 -10
nucliadb/purge/__init__.py +139 -33
nucliadb/purge/orphan_shards.py +7 -13
nucliadb/reader/__init__.py +1 -3
nucliadb/reader/api/models.py +1 -12
nucliadb/reader/api/v1/__init__.py +0 -1
nucliadb/reader/api/v1/download.py +21 -88
nucliadb/reader/api/v1/export_import.py +1 -1
nucliadb/reader/api/v1/knowledgebox.py +10 -10
nucliadb/reader/api/v1/learning_config.py +2 -6
nucliadb/reader/api/v1/resource.py +62 -88
nucliadb/reader/api/v1/services.py +64 -83
nucliadb/reader/app.py +12 -29
nucliadb/reader/lifecycle.py +18 -4
nucliadb/reader/py.typed +0 -0
nucliadb/reader/reader/notifications.py +10 -28
nucliadb/search/__init__.py +1 -3
nucliadb/search/api/v1/__init__.py +1 -2
nucliadb/search/api/v1/ask.py +17 -10
nucliadb/search/api/v1/catalog.py +184 -0
nucliadb/search/api/v1/feedback.py +16 -24
nucliadb/search/api/v1/find.py +36 -36
nucliadb/search/api/v1/knowledgebox.py +89 -60
nucliadb/search/api/v1/resource/ask.py +2 -8
nucliadb/search/api/v1/resource/search.py +49 -70
nucliadb/search/api/v1/search.py +44 -210
nucliadb/search/api/v1/suggest.py +39 -54
nucliadb/search/app.py +12 -32
nucliadb/search/lifecycle.py +10 -3
nucliadb/search/predict.py +136 -187
nucliadb/search/py.typed +0 -0
nucliadb/search/requesters/utils.py +25 -58
nucliadb/search/search/cache.py +149 -20
nucliadb/search/search/chat/ask.py +571 -123
nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
nucliadb/search/search/chat/images.py +41 -17
nucliadb/search/search/chat/prompt.py +817 -266
nucliadb/search/search/chat/query.py +213 -309
nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
nucliadb/search/search/fetch.py +43 -36
nucliadb/search/search/filters.py +9 -15
nucliadb/search/search/find.py +214 -53
nucliadb/search/search/find_merge.py +408 -391
nucliadb/search/search/hydrator.py +191 -0
nucliadb/search/search/merge.py +187 -223
nucliadb/search/search/metrics.py +73 -2
nucliadb/search/search/paragraphs.py +64 -106
nucliadb/search/search/pgcatalog.py +233 -0
nucliadb/search/search/predict_proxy.py +1 -1
nucliadb/search/search/query.py +305 -150
nucliadb/search/search/query_parser/exceptions.py +22 -0
nucliadb/search/search/query_parser/models.py +101 -0
nucliadb/search/search/query_parser/parser.py +183 -0
nucliadb/search/search/rank_fusion.py +204 -0
nucliadb/search/search/rerankers.py +270 -0
nucliadb/search/search/shards.py +3 -32
nucliadb/search/search/summarize.py +7 -18
nucliadb/search/search/utils.py +27 -4
nucliadb/search/settings.py +15 -1
nucliadb/standalone/api_router.py +4 -10
nucliadb/standalone/app.py +8 -14
nucliadb/standalone/auth.py +7 -21
nucliadb/standalone/config.py +7 -10
nucliadb/standalone/lifecycle.py +26 -25
nucliadb/standalone/migrations.py +1 -3
nucliadb/standalone/purge.py +1 -1
nucliadb/standalone/py.typed +0 -0
nucliadb/standalone/run.py +3 -6
nucliadb/standalone/settings.py +9 -16
nucliadb/standalone/versions.py +15 -5
nucliadb/tasks/consumer.py +8 -12
nucliadb/tasks/producer.py +7 -6
nucliadb/tests/config.py +53 -0
nucliadb/train/__init__.py +1 -3
nucliadb/train/api/utils.py +1 -2
nucliadb/train/api/v1/shards.py +1 -1
nucliadb/train/api/v1/trainset.py +2 -4
nucliadb/train/app.py +10 -31
nucliadb/train/generator.py +10 -19
nucliadb/train/generators/field_classifier.py +7 -19
nucliadb/train/generators/field_streaming.py +156 -0
nucliadb/train/generators/image_classifier.py +12 -18
nucliadb/train/generators/paragraph_classifier.py +5 -9
nucliadb/train/generators/paragraph_streaming.py +6 -9
nucliadb/train/generators/question_answer_streaming.py +19 -20
nucliadb/train/generators/sentence_classifier.py +9 -15
nucliadb/train/generators/token_classifier.py +48 -39
nucliadb/train/generators/utils.py +14 -18
nucliadb/train/lifecycle.py +7 -3
nucliadb/train/nodes.py +23 -32
nucliadb/train/py.typed +0 -0
nucliadb/train/servicer.py +13 -21
nucliadb/train/settings.py +2 -6
nucliadb/train/types.py +13 -10
nucliadb/train/upload.py +3 -6
nucliadb/train/uploader.py +19 -23
nucliadb/train/utils.py +1 -1
nucliadb/writer/__init__.py +1 -3
nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
nucliadb/writer/api/v1/export_import.py +67 -14
nucliadb/writer/api/v1/field.py +16 -269
nucliadb/writer/api/v1/knowledgebox.py +218 -68
nucliadb/writer/api/v1/resource.py +68 -88
nucliadb/writer/api/v1/services.py +51 -70
nucliadb/writer/api/v1/slug.py +61 -0
nucliadb/writer/api/v1/transaction.py +67 -0
nucliadb/writer/api/v1/upload.py +114 -113
nucliadb/writer/app.py +6 -43
nucliadb/writer/back_pressure.py +16 -38
nucliadb/writer/exceptions.py +0 -4
nucliadb/writer/lifecycle.py +21 -15
nucliadb/writer/py.typed +0 -0
nucliadb/writer/resource/audit.py +2 -1
nucliadb/writer/resource/basic.py +48 -46
nucliadb/writer/resource/field.py +25 -127
nucliadb/writer/resource/origin.py +1 -2
nucliadb/writer/settings.py +6 -2
nucliadb/writer/tus/__init__.py +17 -15
nucliadb/writer/tus/azure.py +111 -0
nucliadb/writer/tus/dm.py +17 -5
nucliadb/writer/tus/exceptions.py +1 -3
nucliadb/writer/tus/gcs.py +49 -84
nucliadb/writer/tus/local.py +21 -37
nucliadb/writer/tus/s3.py +28 -68
nucliadb/writer/tus/storage.py +5 -56
nucliadb/writer/vectorsets.py +125 -0
nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
{nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
nucliadb/common/maindb/redis.py +0 -194
nucliadb/common/maindb/tikv.py +0 -433
nucliadb/ingest/fields/layout.py +0 -58
nucliadb/ingest/tests/conftest.py +0 -30
nucliadb/ingest/tests/fixtures.py +0 -764
nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
nucliadb/ingest/tests/unit/test_cache.py +0 -31
nucliadb/ingest/tests/unit/test_partitions.py +0 -40
nucliadb/ingest/tests/unit/test_processing.py +0 -171
nucliadb/middleware/transaction.py +0 -117
nucliadb/reader/api/v1/learning_collector.py +0 -63
nucliadb/reader/tests/__init__.py +0 -19
nucliadb/reader/tests/conftest.py +0 -31
nucliadb/reader/tests/fixtures.py +0 -136
nucliadb/reader/tests/test_list_resources.py +0 -75
nucliadb/reader/tests/test_reader_file_download.py +0 -273
nucliadb/reader/tests/test_reader_resource.py +0 -353
nucliadb/reader/tests/test_reader_resource_field.py +0 -219
nucliadb/search/api/v1/chat.py +0 -263
nucliadb/search/api/v1/resource/chat.py +0 -174
nucliadb/search/tests/__init__.py +0 -19
nucliadb/search/tests/conftest.py +0 -33
nucliadb/search/tests/fixtures.py +0 -199
nucliadb/search/tests/node.py +0 -466
nucliadb/search/tests/unit/__init__.py +0 -18
nucliadb/search/tests/unit/api/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
nucliadb/search/tests/unit/search/__init__.py +0 -18
nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
nucliadb/search/tests/unit/search/search/__init__.py +0 -19
nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
nucliadb/search/tests/unit/search/test_fetch.py +0 -108
nucliadb/search/tests/unit/search/test_filters.py +0 -125
nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
nucliadb/search/tests/unit/search/test_query.py +0 -153
nucliadb/search/tests/unit/test_app.py +0 -79
nucliadb/search/tests/unit/test_find_merge.py +0 -112
nucliadb/search/tests/unit/test_merge.py +0 -34
nucliadb/search/tests/unit/test_predict.py +0 -525
nucliadb/standalone/tests/__init__.py +0 -19
nucliadb/standalone/tests/conftest.py +0 -33
nucliadb/standalone/tests/fixtures.py +0 -38
nucliadb/standalone/tests/unit/__init__.py +0 -18
nucliadb/standalone/tests/unit/test_api_router.py +0 -61
nucliadb/standalone/tests/unit/test_auth.py +0 -169
nucliadb/standalone/tests/unit/test_introspect.py +0 -35
nucliadb/standalone/tests/unit/test_migrations.py +0 -63
nucliadb/standalone/tests/unit/test_versions.py +0 -68
nucliadb/tests/benchmarks/__init__.py +0 -19
nucliadb/tests/benchmarks/test_search.py +0 -99
nucliadb/tests/conftest.py +0 -32
nucliadb/tests/fixtures.py +0 -735
nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
nucliadb/tests/migrations/test_migration_0017.py +0 -76
nucliadb/tests/migrations/test_migration_0018.py +0 -95
nucliadb/tests/tikv.py +0 -240
nucliadb/tests/unit/__init__.py +0 -19
nucliadb/tests/unit/common/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
nucliadb/tests/unit/common/maindb/__init__.py +0 -18
nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
nucliadb/tests/unit/common/test_context.py +0 -36
nucliadb/tests/unit/export_import/__init__.py +0 -19
nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
nucliadb/tests/unit/export_import/test_utils.py +0 -301
nucliadb/tests/unit/migrator/__init__.py +0 -19
nucliadb/tests/unit/migrator/test_migrator.py +0 -87
nucliadb/tests/unit/tasks/__init__.py +0 -19
nucliadb/tests/unit/tasks/conftest.py +0 -42
nucliadb/tests/unit/tasks/test_consumer.py +0 -92
nucliadb/tests/unit/tasks/test_producer.py +0 -95
nucliadb/tests/unit/tasks/test_tasks.py +0 -58
nucliadb/tests/unit/test_field_ids.py +0 -49
nucliadb/tests/unit/test_health.py +0 -86
nucliadb/tests/unit/test_kb_slugs.py +0 -54
nucliadb/tests/unit/test_learning_proxy.py +0 -252
nucliadb/tests/unit/test_metrics_exporter.py +0 -77
nucliadb/tests/unit/test_purge.py +0 -136
nucliadb/tests/utils/__init__.py +0 -74
nucliadb/tests/utils/aiohttp_session.py +0 -44
nucliadb/tests/utils/broker_messages/__init__.py +0 -171
nucliadb/tests/utils/broker_messages/fields.py +0 -197
nucliadb/tests/utils/broker_messages/helpers.py +0 -33
nucliadb/tests/utils/entities.py +0 -78
nucliadb/train/api/v1/check.py +0 -60
nucliadb/train/tests/__init__.py +0 -19
nucliadb/train/tests/conftest.py +0 -29
nucliadb/train/tests/fixtures.py +0 -342
nucliadb/train/tests/test_field_classification.py +0 -122
nucliadb/train/tests/test_get_entities.py +0 -80
nucliadb/train/tests/test_get_info.py +0 -51
nucliadb/train/tests/test_get_ontology.py +0 -34
nucliadb/train/tests/test_get_ontology_count.py +0 -63
nucliadb/train/tests/test_image_classification.py +0 -221
nucliadb/train/tests/test_list_fields.py +0 -39
nucliadb/train/tests/test_list_paragraphs.py +0 -73
nucliadb/train/tests/test_list_resources.py +0 -39
nucliadb/train/tests/test_list_sentences.py +0 -71
nucliadb/train/tests/test_paragraph_classification.py +0 -123
nucliadb/train/tests/test_paragraph_streaming.py +0 -118
nucliadb/train/tests/test_question_answer_streaming.py +0 -239
nucliadb/train/tests/test_sentence_classification.py +0 -143
nucliadb/train/tests/test_token_classification.py +0 -136
nucliadb/train/tests/utils.py +0 -101
nucliadb/writer/layouts/__init__.py +0 -51
nucliadb/writer/layouts/v1.py +0 -59
nucliadb/writer/tests/__init__.py +0 -19
nucliadb/writer/tests/conftest.py +0 -31
nucliadb/writer/tests/fixtures.py +0 -191
nucliadb/writer/tests/test_fields.py +0 -475
nucliadb/writer/tests/test_files.py +0 -740
nucliadb/writer/tests/test_knowledgebox.py +0 -49
nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
nucliadb/writer/tests/test_resources.py +0 -476
nucliadb/writer/tests/test_service.py +0 -137
nucliadb/writer/tests/test_tus.py +0 -203
nucliadb/writer/tests/utils.py +0 -35
nucliadb/writer/tus/pg.py +0 -125
nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
{nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
/nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
/nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
/nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
/nucliadb/{ingest/tests → tests}/vectors.py +0 -0
{nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
{nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
{nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0

nucliadb/common/external_index_providers/settings.py ADDED Viewed

@@ -0,0 +1,52 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+from pydantic import Field
+from pydantic_settings import BaseSettings
+class ExternalIndexProvidersSettings(BaseSettings):
+    pinecone_upsert_parallelism: int = Field(
+        default=3,
+        title="Pinecone upsert parallelism",
+        description="Number of parallel upserts to Pinecone on each set resource operation",
+    )
+    pinecone_delete_parallelism: int = Field(
+        default=2,
+        title="Pinecone delete parallelism",
+        description="Number of parallel deletes to Pinecone on each delete resource operation",
+    )
+    pinecone_upsert_timeout: float = Field(
+        default=10.0,
+        title="Pinecone upsert timeout",
+        description="Timeout in seconds for each upsert operation to Pinecone",
+    )
+    pinecone_delete_timeout: float = Field(
+        default=10.0,
+        title="Pinecone delete timeout",
+        description="Timeout in seconds for each delete operation to Pinecone",
+    )
+    pinecone_query_timeout: float = Field(
+        default=10.0,
+        title="Pinecone query timeout",
+        description="Timeout in seconds for each query operation to Pinecone",
+    )
+settings = ExternalIndexProvidersSettings()

nucliadb/common/http_clients/auth.py CHANGED Viewed

@@ -44,14 +44,11 @@ class NucliaAuthHTTPClient:
     def __init__(self):
         self.session = aiohttp.ClientSession()
         self.base_url = (
-            nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone)
-            + "/api"
+            nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone) + "/api"
         )
         self.headers = {}
         if nuclia_settings.nuclia_service_account is not None:
-            self.headers["X-NUCLIA-NUAKEY"] = (
-                f"Bearer {nuclia_settings.nuclia_service_account}"
-            )
+            self.headers["X-NUCLIA-NUAKEY"] = f"Bearer {nuclia_settings.nuclia_service_account}"
     async def __aenter__(self):
         return self
@@ -67,4 +64,4 @@ class NucliaAuthHTTPClient:
         async with self.session.get(url, headers=self.headers) as resp:
             resp_text = await resp.text()
             check_status(resp, resp_text)
-            return AuthInfoResponse.parse_raw(resp_text)
+            return AuthInfoResponse.model_validate_json(resp_text)

nucliadb/common/http_clients/processing.py CHANGED Viewed

@@ -48,10 +48,7 @@ def get_processing_api_url() -> str:
             + "/api/v1/processing"
         )
     else:
-        return (
-            nuclia_settings.nuclia_processing_cluster_url
-            + "/api/v1/internal/processing"
-        )
+        return nuclia_settings.nuclia_processing_cluster_url + "/api/v1/internal/processing"
 class PullResponse(pydantic.BaseModel):
@@ -159,9 +156,7 @@ class ProcessingHTTPClient:
         self.base_url = get_processing_api_url()
         self.headers = {}
         if nuclia_settings.nuclia_service_account is not None:
-            self.headers["X-STF-NUAKEY"] = (
-                f"Bearer {nuclia_settings.nuclia_service_account}"
-            )
+            self.headers["X-STF-NUAKEY"] = f"Bearer {nuclia_settings.nuclia_service_account}"
     async def __aenter__(self):
         return self
@@ -187,7 +182,7 @@ class ProcessingHTTPClient:
         async with self.session.get(url, headers=self.headers, params=params) as resp:
             resp_text = await resp.text()
             check_status(resp, resp_text)
-            return PullResponse.parse_raw(resp_text)
+            return PullResponse.model_validate_json(resp_text)
     async def pull_position(self, partition: str) -> int:
         url = self.base_url + "/pull/position"
@@ -195,7 +190,7 @@ class ProcessingHTTPClient:
         async with self.session.get(url, headers=self.headers, params=params) as resp:
             resp_text = await resp.text()
             check_status(resp, resp_text)
-            data = PullPosition.parse_raw(resp_text)
+            data = PullPosition.model_validate_json(resp_text)
             return data.cursor
     async def requests(
@@ -217,7 +212,7 @@ class ProcessingHTTPClient:
         async with self.session.get(url, headers=self.headers, params=params) as resp:
             resp_text = await resp.text()
             check_status(resp, resp_text)
-            return RequestsResults.parse_raw(resp_text)
+            return RequestsResults.model_validate_json(resp_text)
     async def stats(self, kbid: str, timeout: Optional[float] = 1.0) -> StatsResponse:
         url = self.base_url + "/stats"
@@ -229,4 +224,4 @@ class ProcessingHTTPClient:
         ) as resp:
             resp_text = await resp.text()
             check_status(resp, resp_text)
-            return StatsResponse.parse_raw(resp_text)
+            return StatsResponse.model_validate_json(resp_text)

nucliadb/common/http_clients/utils.py CHANGED Viewed

@@ -30,9 +30,7 @@ def check_status(resp: aiohttp.ClientResponse, resp_text: str) -> None:
     elif resp.status == 404:
         raise exceptions.NotFoundException(f"Resource not found: {resp_text}")
     elif resp.status in (401, 403):
-        raise exceptions.AuthorizationException(
-            f"Unauthorized to access: {resp.status}"
-        )
+        raise exceptions.AuthorizationException(f"Unauthorized to access: {resp.status}")
     elif resp.status == 429:
         raise exceptions.RateLimitException("Rate limited")
     else:

nucliadb/common/ids.py ADDED Viewed

@@ -0,0 +1,240 @@
+# Copyright (C) 2021 Bosutech XXI S.L.
+#
+# nucliadb is offered under the AGPL v3.0 and as commercial software.
+# For commercial licensing, contact us at info@nuclia.com.
+#
+# AGPL:
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+"""
+This module centralizes how we build ids for resources, fields, paragraphs,
+etc., so that id construction and parsing are not scattered across the codebase.
+"""
+from dataclasses import dataclass
+from typing import Optional
+from nucliadb_protos.resources_pb2 import FieldType
+FIELD_TYPE_STR_TO_PB: dict[str, FieldType.ValueType] = {
+    "t": FieldType.TEXT,
+    "f": FieldType.FILE,
+    "u": FieldType.LINK,
+    "a": FieldType.GENERIC,
+    "c": FieldType.CONVERSATION,
+}
+FIELD_TYPE_PB_TO_STR = {v: k for k, v in FIELD_TYPE_STR_TO_PB.items()}
+@dataclass
+class FieldId:
+    """
+    Field ids are used to identify fields in resources. They usually have the following format:
+        `rid/field_type/field_key`
+    where field type is one of: `t`, `f`, `u`, `a`, `c` (text, file, link, generic, conversation)
+    and field_key is an identifier for that field type on the resource, usually chosen by the user.
+    In some cases, fields can have subfields, for example, in conversations, where each part of the
+    conversation is a subfield. In those cases, the id has the following format:
+        `rid/field_type/field_key/subfield_id`
+    Examples:
+    >>> FieldId(rid="rid", type="u", key="my-link")
+    FieldId(rid/u/my-link)
+    >>> FieldId.from_string("rid/u/my-link")
+    FieldId(rid/u/my-link)
+    """
+    rid: str
+    type: str
+    key: str
+    # also known as `split`, this indicates a part of a field in, for example, conversations
+    subfield_id: Optional[str] = None
+    def __repr__(self) -> str:
+        return f"FieldId({self.full()})"
+    def short_without_subfield(self) -> str:
+        return f"/{self.type}/{self.key}"
+    def full(self) -> str:
+        if self.subfield_id is None:
+            return f"{self.rid}/{self.type}/{self.key}"
+        else:
+            return f"{self.rid}/{self.type}/{self.key}/{self.subfield_id}"
+    def __hash__(self) -> int:
+        return hash(self.full())
+    @property
+    def pb_type(self) -> FieldType.ValueType:
+        return FIELD_TYPE_STR_TO_PB[self.type]
+    @classmethod
+    def from_pb(
+        cls, rid: str, field_type: FieldType.ValueType, key: str, subfield_id: Optional[str] = None
+    ) -> "FieldId":
+        return cls(rid=rid, type=FIELD_TYPE_PB_TO_STR[field_type], key=key, subfield_id=subfield_id)
+    @classmethod
+    def from_string(cls, value: str) -> "FieldId":
+        """
+        Parse a FieldId from a string
+        Example:
+        >>> fid = FieldId.from_string("rid/u/foo")
+        >>> fid
+        FieldId(rid/u/foo)
+        >>> fid.type
+        'u'
+        >>> fid.key
+        'foo'
+        >>> FieldId.from_string("rid/u/foo/subfield_id").subfield_id
+        'subfield_id'
+        """
+        parts = value.split("/")
+        if len(parts) == 3:
+            rid, _type, key = parts
+            if _type not in FIELD_TYPE_STR_TO_PB:
+                raise ValueError(f"Invalid FieldId: {value}")
+            return cls(rid=rid, type=_type, key=key)
+        elif len(parts) == 4:
+            rid, _type, key, subfield_id = parts
+            if _type not in FIELD_TYPE_STR_TO_PB:
+                raise ValueError(f"Invalid FieldId: {value}")
+            return cls(
+                rid=rid,
+                type=_type,
+                key=key,
+                subfield_id=subfield_id,
+            )
+        else:
+            raise ValueError(f"Invalid FieldId: {value}")
+@dataclass
+class ParagraphId:
+    field_id: FieldId
+    paragraph_start: int
+    paragraph_end: int
+    def __repr__(self) -> str:
+        return f"ParagraphId({self.full()})"
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"
+    def __hash__(self) -> int:
+        return hash(self.full())
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+    @classmethod
+    def from_string(cls, value: str) -> "ParagraphId":
+        parts = value.split("/")
+        paragraph_range = parts[-1]
+        start, end = map(int, paragraph_range.split("-"))
+        field_id = FieldId.from_string("/".join(parts[:-1]))
+        return cls(field_id=field_id, paragraph_start=start, paragraph_end=end)
+    @classmethod
+    def from_vector_id(cls, vid: "VectorId") -> "ParagraphId":
+        """
+        Returns a ParagraphId from a VectorId (the index part of the vector id is ignored).
+        >>> vid = VectorId.from_string("rid/u/field_id/0/0-1")
+        >>> ParagraphId.from_vector_id(vid)
+        ParagraphId(rid/u/field_id/0-1)
+        """
+        return cls(
+            field_id=vid.field_id,
+            paragraph_start=vid.vector_start,
+            paragraph_end=vid.vector_end,
+        )
+@dataclass
+class VectorId:
+    """
+    Ids of vectors are very similar to ParagraphIds, but for legacy reasons, they have an index
+    indicating the position of the corresponding text block in the list of text blocks for the field.
+    Examples:
+    >>> VectorId.from_string("rid/u/field_id/0/0-10")
+    VectorId(rid/u/field_id/0/0-10)
+    >>> VectorId(
+    ...    field_id=FieldId.from_string("rid/u/field_id"),
+    ...    index=0,
+    ...    vector_start=0,
+    ...    vector_end=10,
+    ... )
+    VectorId(rid/u/field_id/0/0-10)
+    """
+    field_id: FieldId
+    index: int
+    vector_start: int
+    vector_end: int
+    def __repr__(self) -> str:
+        return f"VectorId({self.full()})"
+    def full(self) -> str:
+        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"
+    def __hash__(self) -> int:
+        return hash(self.full())
+    @property
+    def rid(self) -> str:
+        return self.field_id.rid
+    @classmethod
+    def from_string(cls, value: str) -> "VectorId":
+        parts = value.split("/")
+        vector_range = parts[-1]
+        start, end = map(int, vector_range.split("-"))
+        index = int(parts[-2])
+        field_id = FieldId.from_string("/".join(parts[:-2]))
+        return cls(field_id=field_id, index=index, vector_start=start, vector_end=end)
+def extract_data_augmentation_id(generated_field_id: str) -> Optional[str]:
+    """Data augmentation generated fields have a strict id with the following
+    format:
+    `da-{task_id}-{original:field_type}-{original:field_id}[-{original:split}]`
+    @return the `task_id`
+    ATTENTION: we are assuming ids have been properly generated and `-` is not a
+    valid character, otherwise, this extraction would be wrong and a partial id
+    would be returned.
+    """
+    parts = generated_field_id.split("-")
+    if len(parts) < 4:
+        return None
+    if parts[0] != "da":
+        return None
+    return parts[1] or None

nucliadb/common/locking.py CHANGED Viewed

@@ -35,6 +35,7 @@ logger = logging.getLogger(__name__)
 NEW_SHARD_LOCK = "new-shard-{kbid}"
 RESOURCE_INDEX_LOCK = "resource-index-{kbid}-{resource_id}"
+RESOURCE_CREATION_SLUG_LOCK = "resource-creation-{kbid}-{resource_slug}"
 KB_SHARDS_LOCK = "shards-kb-{kbid}"
 MIGRATIONS_LOCK = "migration"
@@ -83,7 +84,7 @@ class _Lock:
                     else:
                         if time.time() > lock_data.expires_at:
                             # if current time is greater than when it expires, take it over
-                            await self._set_lock_value(txn)
+                            await self._update_lock_value(txn)
                             await txn.commit()
                             break
@@ -99,24 +100,36 @@ class _Lock:
         return self
     async def get_lock_data(self, txn: Transaction) -> Optional[LockValue]:
-        existing_data = await txn.get(self.key)
+        existing_data = await txn.get(self.key, for_update=True)
         if existing_data is None:
             return None
         else:
             return LockValue(**orjson.loads(existing_data))
-    async def _set_lock_value(self, txn: Transaction) -> None:
+    async def _update_lock_value(self, txn: Transaction) -> None:
+        """
+        Update the value for the lock.
+        """
         await txn.set(
             self.key,
             orjson.dumps(LockValue(self.value, time.time() + self.expire_timeout)),
         )
+    async def _set_lock_value(self, txn: Transaction) -> None:
+        """
+        Set the value for the lock. If the lock already exists, it is not updated and a ConflictError is raised.
+        """
+        await txn.insert(
+            self.key,
+            orjson.dumps(LockValue(self.value, time.time() + self.expire_timeout)),
+        )
     async def _refresh_task(self) -> None:
         while True:
             try:
                 await asyncio.sleep(self.refresh_timeout)
                 async with self.driver.transaction() as txn:
-                    await self._set_lock_value(txn)
+                    await self._update_lock_value(txn)
                     await txn.commit()
             except (asyncio.CancelledError, RuntimeError):
                 return
@@ -137,10 +150,19 @@ class _Lock:
 def distributed_lock(
     key: str,
-    lock_timeout: float = 60.0,  # max time to wait for lock
-    expire_timeout: float = 30.0,  # how long by default the lock will be held without a refresh
-    refresh_timeout: float = 10.0,  # how often to refresh
+    lock_timeout: float = 60.0,
+    expire_timeout: float = 30.0,
+    refresh_timeout: float = 10.0,
 ) -> _Lock:
+    """
+    Context manager to get a distributed lock on a key.
+    Params:
+    - key: the key to lock with
+    - lock_timeout: maximum time to wait for the lock before ResourceLocked is raised.
+    - expire_timeout: how long by default the lock will be held without a refresh
+    - refresh_timeout: how often to refresh the lock
+    """
     return _Lock(
         key,
         lock_timeout=lock_timeout,

nucliadb/common/maindb/driver.py CHANGED Viewed

@@ -23,7 +23,7 @@ import asyncio
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator, Optional
-DEFAULT_SCAN_LIMIT = 10
+DEFAULT_SCAN_LIMIT = -1
 DEFAULT_BATCH_SCAN_LIMIT = 500
@@ -37,18 +37,24 @@ class Transaction:
     async def commit(self):
         raise NotImplementedError()
-    async def batch_get(self, keys: list[str]) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
         raise NotImplementedError()
-    async def get(self, key: str) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
         raise NotImplementedError()
     async def set(self, key: str, value: bytes):
         raise NotImplementedError()
+    async def insert(self, key: str, value: bytes):
+        return await self.set(key, value)
     async def delete(self, key: str):
         raise NotImplementedError()
+    async def delete_by_prefix(self, prefix: str) -> None:
+        raise NotImplementedError()
     def keys(
         self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
     ) -> AsyncGenerator[str, None]:
@@ -74,36 +80,6 @@ class Driver:
                 except Exception:
                     pass
-    async def begin(self, read_only: bool = False) -> Transaction:
-        raise NotImplementedError()
     @asynccontextmanager
-    async def transaction(
-        self, wait_for_abort: bool = True, read_only: bool = False
-    ) -> AsyncGenerator[Transaction, None]:
-        """
-        Use to make sure transaction is always aborted.
-        :param wait_for_abort: If True, wait for abort to finish before returning.
-                               If False, abort is done in background (unless there
-                               is an error)
-        """
-        txn: Optional[Transaction] = None
-        error: bool = False
-        try:
-            txn = await self.begin(read_only=read_only)
-            yield txn
-        except Exception:
-            error = True
-            raise
-        finally:
-            if txn is not None and txn.open:
-                if error or wait_for_abort:
-                    await txn.abort()
-                else:
-                    self._async_abort(txn)
-    def _async_abort(self, txn: Transaction):
-        task = asyncio.create_task(txn.abort())
-        task.add_done_callback(lambda task: self._abort_tasks.remove(task))
-        self._abort_tasks.append(task)
+    async def transaction(self, read_only: bool = False) -> AsyncGenerator[Transaction, None]:
+        yield Transaction()

nucliadb/common/maindb/exceptions.py CHANGED Viewed

@@ -24,3 +24,6 @@ class NotFoundError(Exception): ...
 class UnsetUtility(Exception): ...
+class MaindbServerError(Exception): ...

nucliadb/common/maindb/local.py CHANGED Viewed

@@ -19,7 +19,8 @@
 #
 import glob
 import os
-from typing import Optional
+from contextlib import asynccontextmanager
+from typing import AsyncGenerator, Optional
 from nucliadb.common.maindb.driver import (
     DEFAULT_BATCH_SCAN_LIMIT,
@@ -105,7 +106,7 @@ class LocalTransaction(Transaction):
         self.clean()
         self.open = False
-    async def batch_get(self, keys: list[str]) -> list[Optional[bytes]]:
+    async def batch_get(self, keys: list[str], for_update: bool = False) -> list[Optional[bytes]]:
         results: list[Optional[bytes]] = []
         for key in keys:
             obj = await self.get(key)
@@ -124,7 +125,7 @@ class LocalTransaction(Transaction):
         return results
-    async def get(self, key: str) -> Optional[bytes]:
+    async def get(self, key: str, for_update: bool = False) -> Optional[bytes]:
         if key in self.deleted_keys:
             raise KeyError(f"Not found {key}")
@@ -159,9 +160,15 @@ class LocalTransaction(Transaction):
         if key in self.modified_keys:
             del self.modified_keys[key]
-    async def keys(
-        self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
-    ):
+    async def delete_by_prefix(self, prefix: str) -> None:
+        keys = []
+        for key in self.modified_keys.keys():
+            if key.startswith(prefix):
+                keys.append(key)
+        for key in keys:
+            await self.delete(key)
+    async def keys(self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True):
         prev_key = None
         get_all_keys = count == -1
@@ -195,7 +202,7 @@ class LocalTransaction(Transaction):
     async def count(self, match: str) -> int:
         value = 0
-        async for _ in self.keys(match, count=-1):
+        async for _ in self.keys(match):
             value += 1
         return value
@@ -214,7 +221,13 @@ class LocalDriver(Driver):
     async def finalize(self):
         pass
-    async def begin(self, read_only: bool = False) -> LocalTransaction:
+    @asynccontextmanager
+    async def transaction(self, read_only: bool = False) -> AsyncGenerator[Transaction, None]:
         if self.url is None:
             raise AttributeError("Invalid url")
-        return LocalTransaction(self.url, self)
+        txn = LocalTransaction(self.url, self)
+        try:
+            yield txn
+        finally:
+            if txn.open:
+                await txn.abort()

nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

nucliadb 4.0.0.post542py3-none-any.whl → 6.2.1.post2777py3-none-any.whl