nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +8 -4
- migrations/0028_extracted_vectors_reference.py +1 -1
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +8 -4
- migrations/0039_backfill_converation_splits_metadata.py +106 -0
- migrations/0040_migrate_search_configurations.py +79 -0
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/__init__.py +79 -0
- nucliadb/common/catalog/dummy.py +36 -0
- nucliadb/common/catalog/interface.py +85 -0
- nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
- nucliadb/common/catalog/utils.py +56 -0
- nucliadb/common/cluster/manager.py +8 -23
- nucliadb/common/cluster/rebalance.py +484 -112
- nucliadb/common/cluster/rollover.py +36 -9
- nucliadb/common/cluster/settings.py +4 -9
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +9 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +5 -34
- nucliadb/common/external_index_providers/settings.py +1 -27
- nucliadb/common/filter_expression.py +129 -41
- nucliadb/common/http_clients/exceptions.py +8 -0
- nucliadb/common/http_clients/processing.py +16 -23
- nucliadb/common/http_clients/utils.py +3 -0
- nucliadb/common/ids.py +82 -58
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +22 -5
- nucliadb/common/vector_index_config.py +1 -1
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +10 -8
- nucliadb/ingest/consumer/service.py +5 -30
- nucliadb/ingest/consumer/shard_creator.py +16 -5
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +37 -49
- nucliadb/ingest/fields/conversation.py +55 -9
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +89 -57
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +128 -113
- nucliadb/ingest/orm/knowledgebox.py +91 -59
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +98 -153
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +82 -71
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/partitions.py +12 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +15 -114
- nucliadb/ingest/settings.py +36 -15
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +23 -26
- nucliadb/metrics_exporter.py +20 -6
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +4 -11
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/knowledgebox.py +37 -9
- nucliadb/reader/api/v1/learning_config.py +33 -14
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +3 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +15 -19
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +328 -0
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +28 -8
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +33 -19
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -42
- nucliadb/search/search/chat/ask.py +131 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +453 -32
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +49 -0
- nucliadb/search/search/hydrator/fields.py +217 -0
- nucliadb/search/search/hydrator/images.py +130 -0
- nucliadb/search/search/hydrator/paragraphs.py +323 -0
- nucliadb/search/search/hydrator/resources.py +60 -0
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +24 -7
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +44 -18
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -48
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +5 -6
- nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
- nucliadb/search/search/query_parser/parsers/common.py +21 -13
- nucliadb/search/search/query_parser/parsers/find.py +6 -29
- nucliadb/search/search/query_parser/parsers/graph.py +18 -28
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -56
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +6 -7
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +5 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +4 -10
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +15 -14
- nucliadb/writer/api/v1/knowledgebox.py +18 -56
- nucliadb/writer/api/v1/learning_config.py +5 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +43 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +5 -7
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +15 -22
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb/common/external_index_providers/pinecone.py +0 -894
- nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
- nucliadb/search/search/hydrator.py +0 -197
- nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
|
@@ -19,15 +19,15 @@
|
|
|
19
19
|
#
|
|
20
20
|
import dataclasses
|
|
21
21
|
from datetime import datetime
|
|
22
|
-
from typing import Optional, Union
|
|
23
22
|
|
|
23
|
+
from fastapi import HTTPException
|
|
24
24
|
from google.protobuf.json_format import MessageToDict
|
|
25
25
|
|
|
26
26
|
import nucliadb_models as models
|
|
27
27
|
from nucliadb.common import datamanagers
|
|
28
28
|
from nucliadb.common.maindb.driver import Transaction
|
|
29
29
|
from nucliadb.common.models_utils import from_proto, to_proto
|
|
30
|
-
from nucliadb.ingest.fields.conversation import Conversation
|
|
30
|
+
from nucliadb.ingest.fields.conversation import MAX_CONVERSATION_MESSAGES, Conversation
|
|
31
31
|
from nucliadb.ingest.orm.resource import Resource as ORMResource
|
|
32
32
|
from nucliadb.models.internal import processing as processing_models
|
|
33
33
|
from nucliadb.models.internal.processing import ClassificationLabel, PushConversation, PushPayload
|
|
@@ -87,7 +87,7 @@ async def extract_file_field(
|
|
|
87
87
|
resource: ORMResource,
|
|
88
88
|
toprocess: PushPayload,
|
|
89
89
|
resource_classifications: ResourceClassifications,
|
|
90
|
-
password:
|
|
90
|
+
password: str | None = None,
|
|
91
91
|
):
|
|
92
92
|
field_type = resources_pb2.FieldType.FILE
|
|
93
93
|
field = await resource.get_field(field_id, field_type)
|
|
@@ -182,7 +182,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
|
|
|
182
182
|
async def parse_fields(
|
|
183
183
|
writer: BrokerMessage,
|
|
184
184
|
toprocess: PushPayload,
|
|
185
|
-
item:
|
|
185
|
+
item: CreateResourcePayload | UpdateResourcePayload,
|
|
186
186
|
kbid: str,
|
|
187
187
|
uuid: str,
|
|
188
188
|
x_skip_store: bool,
|
|
@@ -227,6 +227,7 @@ async def parse_fields(
|
|
|
227
227
|
kbid,
|
|
228
228
|
uuid,
|
|
229
229
|
resource_classifications,
|
|
230
|
+
replace_field=True,
|
|
230
231
|
)
|
|
231
232
|
|
|
232
233
|
|
|
@@ -430,11 +431,15 @@ async def parse_conversation_field(
|
|
|
430
431
|
kbid: str,
|
|
431
432
|
uuid: str,
|
|
432
433
|
resource_classifications: ResourceClassifications,
|
|
434
|
+
replace_field: bool,
|
|
433
435
|
) -> None:
|
|
436
|
+
if not replace_field:
|
|
437
|
+
# Appending messages to conversation
|
|
438
|
+
await _conversation_append_checks(kbid, uuid, key, conversation_field)
|
|
434
439
|
classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.CONVERSATION)
|
|
435
440
|
storage = await get_storage(service_name=SERVICE_NAME)
|
|
436
441
|
processing = get_processing()
|
|
437
|
-
field_value = resources_pb2.Conversation()
|
|
442
|
+
field_value = resources_pb2.Conversation(replace_field=replace_field)
|
|
438
443
|
convs = processing_models.PushConversation()
|
|
439
444
|
for message in conversation_field.messages:
|
|
440
445
|
cm = resources_pb2.Message()
|
|
@@ -543,3 +548,36 @@ async def get_stored_resource_classifications(
|
|
|
543
548
|
classif = ClassificationLabel(labelset=f_classif.labelset, label=f_classif.label)
|
|
544
549
|
rc.field_level.setdefault(fid, set()).add(classif)
|
|
545
550
|
return rc
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
async def _conversation_append_checks(
|
|
554
|
+
kbid: str, rid: str, field_id: str, input: models.InputConversationField
|
|
555
|
+
):
|
|
556
|
+
async with datamanagers.with_ro_transaction() as txn:
|
|
557
|
+
resource_obj = await ORMResource.get(txn, kbid=kbid, rid=rid)
|
|
558
|
+
if resource_obj is None:
|
|
559
|
+
return
|
|
560
|
+
conv: Conversation = await resource_obj.get_field(
|
|
561
|
+
field_id, resources_pb2.FieldType.CONVERSATION, load=False
|
|
562
|
+
)
|
|
563
|
+
|
|
564
|
+
# Make sure that the max number of messages is not exceeded
|
|
565
|
+
current_message_count = (await conv.get_metadata()).total
|
|
566
|
+
if (
|
|
567
|
+
MAX_CONVERSATION_MESSAGES is not None
|
|
568
|
+
and (len(input.messages) + current_message_count) > MAX_CONVERSATION_MESSAGES
|
|
569
|
+
):
|
|
570
|
+
raise HTTPException(
|
|
571
|
+
status_code=422,
|
|
572
|
+
detail=f"Conversation fields cannot have more than {MAX_CONVERSATION_MESSAGES} messages.",
|
|
573
|
+
)
|
|
574
|
+
|
|
575
|
+
# Make sure input messages use unique idents
|
|
576
|
+
existing_message_ids = set((await conv.get_splits_metadata()).metadata.keys())
|
|
577
|
+
input_message_ids = {message.ident for message in input.messages}
|
|
578
|
+
intersection = input_message_ids.intersection(existing_message_ids)
|
|
579
|
+
if intersection != set():
|
|
580
|
+
raise HTTPException(
|
|
581
|
+
status_code=422,
|
|
582
|
+
detail=f"Message identifiers must be unique field={field_id}: {list(intersection)[:50]}",
|
|
583
|
+
)
|
|
@@ -42,6 +42,13 @@ def parse_origin(origin: Origin, origin_payload: InputOrigin):
|
|
|
42
42
|
origin.metadata.update(origin_payload.metadata)
|
|
43
43
|
if origin_payload.path:
|
|
44
44
|
origin.path = origin_payload.path
|
|
45
|
+
if origin_payload.sync_metadata is not None:
|
|
46
|
+
origin.sync_metadata.CopyFrom(
|
|
47
|
+
resources_pb2.SyncMetadata(
|
|
48
|
+
file_id=origin_payload.sync_metadata.file_id,
|
|
49
|
+
auth_provider=origin_payload.sync_metadata.auth_provider,
|
|
50
|
+
)
|
|
51
|
+
)
|
|
45
52
|
origin.source = Origin.Source.API
|
|
46
53
|
|
|
47
54
|
|
nucliadb/writer/settings.py
CHANGED
|
@@ -17,15 +17,14 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Optional
|
|
21
20
|
|
|
22
21
|
from pydantic_settings import BaseSettings
|
|
23
22
|
|
|
24
23
|
|
|
25
24
|
class Settings(BaseSettings):
|
|
26
25
|
dm_enabled: bool = True
|
|
27
|
-
dm_redis_host:
|
|
28
|
-
dm_redis_port:
|
|
26
|
+
dm_redis_host: str | None = None
|
|
27
|
+
dm_redis_port: int | None = None
|
|
29
28
|
|
|
30
29
|
|
|
31
30
|
settings = Settings()
|
nucliadb/writer/tus/__init__.py
CHANGED
|
@@ -18,7 +18,6 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
from dataclasses import dataclass
|
|
21
|
-
from typing import Optional
|
|
22
21
|
|
|
23
22
|
from nucliadb.writer.settings import settings as writer_settings
|
|
24
23
|
from nucliadb.writer.tus.dm import FileDataManager, RedisFileDataManagerFactory
|
|
@@ -37,8 +36,8 @@ class TusStorageDriver:
|
|
|
37
36
|
manager: FileStorageManager
|
|
38
37
|
|
|
39
38
|
|
|
40
|
-
DRIVER:
|
|
41
|
-
REDIS_FILE_DATA_MANAGER_FACTORY:
|
|
39
|
+
DRIVER: TusStorageDriver | None = None
|
|
40
|
+
REDIS_FILE_DATA_MANAGER_FACTORY: RedisFileDataManagerFactory | None = None
|
|
42
41
|
|
|
43
42
|
|
|
44
43
|
async def initialize():
|
nucliadb/writer/tus/azure.py
CHANGED
|
@@ -19,15 +19,12 @@
|
|
|
19
19
|
#
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
|
-
from typing import Optional
|
|
23
|
-
|
|
24
22
|
from nucliadb.writer import logger
|
|
25
23
|
from nucliadb.writer.tus.dm import FileDataManager
|
|
26
24
|
from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
|
|
27
25
|
from nucliadb_protos.resources_pb2 import CloudFile
|
|
28
26
|
from nucliadb_utils.storages import CHUNK_SIZE
|
|
29
27
|
from nucliadb_utils.storages.azure import AzureObjectStore
|
|
30
|
-
from nucliadb_utils.storages.exceptions import ObjectNotFoundError
|
|
31
28
|
from nucliadb_utils.storages.utils import ObjectMetadata
|
|
32
29
|
|
|
33
30
|
|
|
@@ -41,7 +38,7 @@ class AzureBlobStore(BlobStore):
|
|
|
41
38
|
logger.exception("Error closing AzureBlobStore")
|
|
42
39
|
self._object_store = None
|
|
43
40
|
|
|
44
|
-
async def initialize(self, account_url: str, connection_string:
|
|
41
|
+
async def initialize(self, account_url: str, connection_string: str | None = None):
|
|
45
42
|
self.bucket = "nucliadb-{kbid}"
|
|
46
43
|
self.source = CloudFile.Source.AZURE
|
|
47
44
|
self._object_store = AzureObjectStore(account_url, connection_string=connection_string)
|
|
@@ -63,7 +60,7 @@ class AzureBlobStore(BlobStore):
|
|
|
63
60
|
class AzureFileStorageManager(FileStorageManager):
|
|
64
61
|
storage: AzureBlobStore
|
|
65
62
|
chunk_size = CHUNK_SIZE
|
|
66
|
-
min_upload_size =
|
|
63
|
+
min_upload_size = CHUNK_SIZE
|
|
67
64
|
|
|
68
65
|
@property
|
|
69
66
|
def object_store(self) -> AzureObjectStore:
|
|
@@ -87,7 +84,7 @@ class AzureFileStorageManager(FileStorageManager):
|
|
|
87
84
|
bucket = self.storage.get_bucket_name(kbid)
|
|
88
85
|
try:
|
|
89
86
|
await self.object_store.delete(bucket, uri)
|
|
90
|
-
except
|
|
87
|
+
except KeyError:
|
|
91
88
|
logger.warning(
|
|
92
89
|
"Attempt to delete an upload but not found",
|
|
93
90
|
extra={"uri": uri, "kbid": kbid, "bucket": bucket},
|
|
@@ -108,4 +105,5 @@ class AzureFileStorageManager(FileStorageManager):
|
|
|
108
105
|
return path
|
|
109
106
|
|
|
110
107
|
def validate_intermediate_chunk(self, uploaded_bytes: int):
|
|
111
|
-
|
|
108
|
+
if uploaded_bytes < self.min_upload_size:
|
|
109
|
+
raise ValueError(f"Intermediate chunks cannot be smaller than {self.min_upload_size} bytes")
|
nucliadb/writer/tus/dm.py
CHANGED
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import time
|
|
21
|
-
from typing import Any
|
|
21
|
+
from typing import Any
|
|
22
22
|
|
|
23
23
|
import backoff
|
|
24
24
|
import orjson
|
|
@@ -43,7 +43,7 @@ DATA: dict[str, Any] = {}
|
|
|
43
43
|
|
|
44
44
|
|
|
45
45
|
class FileDataManager:
|
|
46
|
-
_data:
|
|
46
|
+
_data: dict[str, Any] | None = None
|
|
47
47
|
_loaded = False
|
|
48
48
|
key = None
|
|
49
49
|
_ttl = 60 * 50 * 5 # 5 minutes should be plenty of time between activity
|
|
@@ -63,7 +63,7 @@ class FileDataManager:
|
|
|
63
63
|
if self._data and "last_activity" in self._data:
|
|
64
64
|
# check for another active upload, fail if we're screwing with
|
|
65
65
|
# someone else
|
|
66
|
-
last_activity:
|
|
66
|
+
last_activity: int | None = self._data.get("last_activity")
|
|
67
67
|
if last_activity and (time.time() - last_activity) < self._ttl:
|
|
68
68
|
if request.headers and request.headers.get("tus-override-upload", "0") != "1":
|
|
69
69
|
raise HTTPPreconditionFailed(
|
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Optional
|
|
21
20
|
|
|
22
21
|
from starlette.exceptions import HTTPException as StarletteHTTPException
|
|
23
22
|
|
|
@@ -27,11 +26,11 @@ class InvalidTUSMetadata(Exception):
|
|
|
27
26
|
|
|
28
27
|
|
|
29
28
|
class HTTPException(StarletteHTTPException):
|
|
30
|
-
_status_code:
|
|
29
|
+
_status_code: int | None = None
|
|
31
30
|
|
|
32
|
-
def __init__(self, detail:
|
|
31
|
+
def __init__(self, detail: str | None = None):
|
|
33
32
|
if self._status_code:
|
|
34
|
-
super(
|
|
33
|
+
super().__init__(status_code=self._status_code, detail=detail)
|
|
35
34
|
else:
|
|
36
35
|
raise AttributeError("Status code not defined")
|
|
37
36
|
|
nucliadb/writer/tus/gcs.py
CHANGED
|
@@ -28,7 +28,6 @@ import tempfile
|
|
|
28
28
|
import uuid
|
|
29
29
|
from concurrent.futures import ThreadPoolExecutor
|
|
30
30
|
from copy import deepcopy
|
|
31
|
-
from typing import Optional
|
|
32
31
|
from urllib.parse import quote_plus
|
|
33
32
|
|
|
34
33
|
import aiohttp
|
|
@@ -74,16 +73,22 @@ RETRIABLE_EXCEPTIONS = (
|
|
|
74
73
|
|
|
75
74
|
|
|
76
75
|
class GCloudBlobStore(BlobStore):
|
|
77
|
-
|
|
76
|
+
_session: aiohttp.ClientSession | None = None
|
|
78
77
|
loop = None
|
|
79
78
|
upload_url: str
|
|
80
79
|
object_base_url: str
|
|
81
|
-
json_credentials:
|
|
80
|
+
json_credentials: str | None
|
|
82
81
|
bucket: str
|
|
83
82
|
location: str
|
|
84
83
|
project: str
|
|
85
84
|
executor = ThreadPoolExecutor(max_workers=5)
|
|
86
85
|
|
|
86
|
+
@property
|
|
87
|
+
def session(self) -> aiohttp.ClientSession:
|
|
88
|
+
if self._session is None: # pragma: no cover
|
|
89
|
+
raise AttributeError("Session not initialized")
|
|
90
|
+
return self._session
|
|
91
|
+
|
|
87
92
|
async def get_access_headers(self):
|
|
88
93
|
if self._credentials is None:
|
|
89
94
|
return {}
|
|
@@ -106,8 +111,9 @@ class GCloudBlobStore(BlobStore):
|
|
|
106
111
|
return access_token.access_token
|
|
107
112
|
|
|
108
113
|
async def finalize(self):
|
|
109
|
-
if self.
|
|
110
|
-
await self.
|
|
114
|
+
if self._session is not None:
|
|
115
|
+
await self._session.close()
|
|
116
|
+
self._session = None
|
|
111
117
|
|
|
112
118
|
async def initialize(
|
|
113
119
|
self,
|
|
@@ -116,7 +122,7 @@ class GCloudBlobStore(BlobStore):
|
|
|
116
122
|
project: str,
|
|
117
123
|
bucket_labels,
|
|
118
124
|
object_base_url: str,
|
|
119
|
-
json_credentials:
|
|
125
|
+
json_credentials: str | None,
|
|
120
126
|
):
|
|
121
127
|
self.bucket = bucket
|
|
122
128
|
self.source = CloudFile.Source.GCS
|
|
@@ -124,7 +130,7 @@ class GCloudBlobStore(BlobStore):
|
|
|
124
130
|
self.project = project
|
|
125
131
|
self.bucket_labels = bucket_labels
|
|
126
132
|
self.object_base_url = object_base_url + "/storage/v1/b"
|
|
127
|
-
self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable"
|
|
133
|
+
self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable"
|
|
128
134
|
self.json_credentials = json_credentials
|
|
129
135
|
self._credentials = None
|
|
130
136
|
|
|
@@ -143,12 +149,9 @@ class GCloudBlobStore(BlobStore):
|
|
|
143
149
|
self._credentials = None
|
|
144
150
|
|
|
145
151
|
loop = asyncio.get_event_loop()
|
|
146
|
-
self.
|
|
152
|
+
self._session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
|
|
147
153
|
|
|
148
154
|
async def check_exists(self, bucket_name: str):
|
|
149
|
-
if self.session is None:
|
|
150
|
-
raise AttributeError()
|
|
151
|
-
|
|
152
155
|
headers = await self.get_access_headers()
|
|
153
156
|
# Using object access url instead of bucket access to avoid
|
|
154
157
|
# giving admin permission to the SA, needed to GET a bucket
|
|
@@ -163,8 +166,6 @@ class GCloudBlobStore(BlobStore):
|
|
|
163
166
|
return False
|
|
164
167
|
|
|
165
168
|
async def create_bucket(self, bucket_name: str):
|
|
166
|
-
if self.session is None:
|
|
167
|
-
raise AttributeError()
|
|
168
169
|
headers = await self.get_access_headers()
|
|
169
170
|
url = f"{self.object_base_url}?project={self.project}"
|
|
170
171
|
|
|
@@ -199,10 +200,6 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
|
199
200
|
_resumable_uri : uri to resumable upload
|
|
200
201
|
_uri : finished uploaded image
|
|
201
202
|
"""
|
|
202
|
-
|
|
203
|
-
if self.storage.session is None:
|
|
204
|
-
raise AttributeError()
|
|
205
|
-
|
|
206
203
|
upload_file_id = dm.get("upload_file_id")
|
|
207
204
|
if upload_file_id is not None:
|
|
208
205
|
await self.delete_upload(upload_file_id, kbid)
|
|
@@ -287,8 +284,6 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
|
287
284
|
|
|
288
285
|
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
|
289
286
|
async def _append(self, dm: FileDataManager, data, offset):
|
|
290
|
-
if self.storage.session is None:
|
|
291
|
-
raise AttributeError()
|
|
292
287
|
if dm.size:
|
|
293
288
|
size = str(dm.size)
|
|
294
289
|
else:
|
|
@@ -315,7 +310,7 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
|
315
310
|
},
|
|
316
311
|
data=data,
|
|
317
312
|
) as call:
|
|
318
|
-
text = await call.text()
|
|
313
|
+
text = await call.text()
|
|
319
314
|
if call.status not in [200, 201, 308]:
|
|
320
315
|
raise GoogleCloudException(f"{call.status}: {text}")
|
|
321
316
|
return call
|
|
@@ -353,8 +348,6 @@ class GCloudFileStorageManager(FileStorageManager):
|
|
|
353
348
|
@backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
|
|
354
349
|
async def finish(self, dm: FileDataManager):
|
|
355
350
|
if dm.size == 0:
|
|
356
|
-
if self.storage.session is None:
|
|
357
|
-
raise AttributeError()
|
|
358
351
|
# In case of empty file, we need to send a PUT request with empty body
|
|
359
352
|
# and Content-Range header set to "bytes */0"
|
|
360
353
|
headers = {
|
nucliadb/writer/tus/s3.py
CHANGED
|
@@ -22,7 +22,6 @@ from __future__ import annotations
|
|
|
22
22
|
import base64
|
|
23
23
|
import uuid
|
|
24
24
|
from contextlib import AsyncExitStack
|
|
25
|
-
from typing import Optional
|
|
26
25
|
|
|
27
26
|
import aiobotocore # type: ignore
|
|
28
27
|
import aiohttp
|
|
@@ -195,8 +194,8 @@ class S3BlobStore(BlobStore):
|
|
|
195
194
|
endpoint_url,
|
|
196
195
|
region_name,
|
|
197
196
|
bucket,
|
|
198
|
-
bucket_tags:
|
|
199
|
-
kms_key_id:
|
|
197
|
+
bucket_tags: dict[str, str] | None = None,
|
|
198
|
+
kms_key_id: str | None = None,
|
|
200
199
|
):
|
|
201
200
|
self.bucket = bucket
|
|
202
201
|
self.bucket_tags = bucket_tags
|
nucliadb/writer/tus/storage.py
CHANGED
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
#
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
|
-
from
|
|
22
|
+
from collections.abc import AsyncIterator
|
|
23
23
|
|
|
24
24
|
from nucliadb.writer.tus.dm import FileDataManager
|
|
25
25
|
from nucliadb_protos.resources_pb2 import CloudFile
|
|
@@ -47,13 +47,13 @@ class BlobStore:
|
|
|
47
47
|
|
|
48
48
|
class FileStorageManager:
|
|
49
49
|
chunk_size: int
|
|
50
|
-
min_upload_size:
|
|
50
|
+
min_upload_size: int | None = None
|
|
51
51
|
|
|
52
52
|
def __init__(self, storage: BlobStore):
|
|
53
53
|
self.storage = storage
|
|
54
54
|
|
|
55
55
|
def iter_data(
|
|
56
|
-
self, uri: str, kbid: str, headers:
|
|
56
|
+
self, uri: str, kbid: str, headers: dict[str, str] | None = None
|
|
57
57
|
) -> AsyncIterator[bytes]:
|
|
58
58
|
raise NotImplementedError()
|
|
59
59
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nucliadb
|
|
3
|
-
Version: 6.
|
|
3
|
+
Version: 6.10.0.post5705
|
|
4
4
|
Summary: NucliaDB
|
|
5
5
|
Author-email: Nuclia <nucliadb@nuclia.com>
|
|
6
6
|
License-Expression: AGPL-3.0-or-later
|
|
@@ -12,20 +12,19 @@ Classifier: Development Status :: 4 - Beta
|
|
|
12
12
|
Classifier: Intended Audience :: Developers
|
|
13
13
|
Classifier: Intended Audience :: Information Technology
|
|
14
14
|
Classifier: Programming Language :: Python
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.12
|
|
19
18
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
|
-
Requires-Python: <4,>=3.
|
|
19
|
+
Requires-Python: <4,>=3.10
|
|
21
20
|
Description-Content-Type: text/markdown
|
|
22
|
-
Requires-Dist: nucliadb-telemetry[all]>=6.
|
|
23
|
-
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.
|
|
24
|
-
Requires-Dist: nucliadb-protos>=6.
|
|
25
|
-
Requires-Dist: nucliadb-models>=6.
|
|
26
|
-
Requires-Dist: nidx-protos>=6.
|
|
21
|
+
Requires-Dist: nucliadb-telemetry[all]>=6.10.0.post5705
|
|
22
|
+
Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.10.0.post5705
|
|
23
|
+
Requires-Dist: nucliadb-protos[grpc]>=6.10.0.post5705
|
|
24
|
+
Requires-Dist: nucliadb-models>=6.10.0.post5705
|
|
25
|
+
Requires-Dist: nidx-protos[grpc]>=6.10.0.post5705
|
|
27
26
|
Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
|
|
28
|
-
Requires-Dist: nuclia-models>=0.
|
|
27
|
+
Requires-Dist: nuclia-models>=0.50.0
|
|
29
28
|
Requires-Dist: uvicorn[standard]
|
|
30
29
|
Requires-Dist: argdantic
|
|
31
30
|
Requires-Dist: aiohttp>=3.11.11
|
|
@@ -35,7 +34,7 @@ Requires-Dist: aiofiles>=0.8.0
|
|
|
35
34
|
Requires-Dist: psutil>=5.9.7
|
|
36
35
|
Requires-Dist: types-psutil>=5.9.5.17
|
|
37
36
|
Requires-Dist: types-aiofiles>=0.8.3
|
|
38
|
-
Requires-Dist: protobuf
|
|
37
|
+
Requires-Dist: protobuf>=5
|
|
39
38
|
Requires-Dist: types-protobuf<6,>=5
|
|
40
39
|
Requires-Dist: grpcio>=1.71.0
|
|
41
40
|
Requires-Dist: grpcio-health-checking>=1.71.0
|
|
@@ -57,7 +56,7 @@ Requires-Dist: jwcrypto>=1.5.6
|
|
|
57
56
|
Requires-Dist: pyyaml>=5.1
|
|
58
57
|
Requires-Dist: fastapi-versioning>=0.10.0
|
|
59
58
|
Requires-Dist: fastapi>=0.95.2
|
|
60
|
-
Requires-Dist: sentry-sdk>=2.8.0
|
|
59
|
+
Requires-Dist: sentry-sdk[fastapi]>=2.8.0
|
|
61
60
|
Requires-Dist: pyjwt>=2.4.0
|
|
62
61
|
Requires-Dist: mmh3>=3.0.0
|
|
63
62
|
Requires-Dist: httpx>=0.23.0
|