nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- migrations/0023_backfill_pg_catalog.py +2 -2
- migrations/0029_backfill_field_status.py +3 -4
- migrations/0032_remove_old_relations.py +2 -3
- migrations/0038_backfill_catalog_field_labels.py +2 -2
- migrations/0039_backfill_converation_splits_metadata.py +2 -2
- migrations/0041_reindex_conversations.py +137 -0
- migrations/pg/0010_shards_index.py +34 -0
- nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
- migrations/pg/0012_catalog_statistics_undo.py +26 -0
- nucliadb/backups/create.py +2 -15
- nucliadb/backups/restore.py +4 -15
- nucliadb/backups/tasks.py +4 -1
- nucliadb/common/back_pressure/cache.py +2 -3
- nucliadb/common/back_pressure/materializer.py +7 -13
- nucliadb/common/back_pressure/settings.py +6 -6
- nucliadb/common/back_pressure/utils.py +1 -0
- nucliadb/common/cache.py +9 -9
- nucliadb/common/catalog/interface.py +12 -12
- nucliadb/common/catalog/pg.py +41 -29
- nucliadb/common/catalog/utils.py +3 -3
- nucliadb/common/cluster/manager.py +5 -4
- nucliadb/common/cluster/rebalance.py +483 -114
- nucliadb/common/cluster/rollover.py +25 -9
- nucliadb/common/cluster/settings.py +3 -8
- nucliadb/common/cluster/utils.py +34 -8
- nucliadb/common/context/__init__.py +7 -8
- nucliadb/common/context/fastapi.py +1 -2
- nucliadb/common/datamanagers/__init__.py +2 -4
- nucliadb/common/datamanagers/atomic.py +4 -2
- nucliadb/common/datamanagers/cluster.py +1 -2
- nucliadb/common/datamanagers/fields.py +3 -4
- nucliadb/common/datamanagers/kb.py +6 -6
- nucliadb/common/datamanagers/labels.py +2 -3
- nucliadb/common/datamanagers/resources.py +10 -33
- nucliadb/common/datamanagers/rollover.py +5 -7
- nucliadb/common/datamanagers/search_configurations.py +1 -2
- nucliadb/common/datamanagers/synonyms.py +1 -2
- nucliadb/common/datamanagers/utils.py +4 -4
- nucliadb/common/datamanagers/vectorsets.py +4 -4
- nucliadb/common/external_index_providers/base.py +32 -5
- nucliadb/common/external_index_providers/manager.py +4 -5
- nucliadb/common/filter_expression.py +128 -40
- nucliadb/common/http_clients/processing.py +12 -23
- nucliadb/common/ids.py +6 -4
- nucliadb/common/locking.py +1 -2
- nucliadb/common/maindb/driver.py +9 -8
- nucliadb/common/maindb/local.py +5 -5
- nucliadb/common/maindb/pg.py +9 -8
- nucliadb/common/nidx.py +3 -4
- nucliadb/export_import/datamanager.py +4 -3
- nucliadb/export_import/exporter.py +11 -19
- nucliadb/export_import/importer.py +13 -6
- nucliadb/export_import/tasks.py +2 -0
- nucliadb/export_import/utils.py +6 -18
- nucliadb/health.py +2 -2
- nucliadb/ingest/app.py +8 -8
- nucliadb/ingest/consumer/consumer.py +8 -10
- nucliadb/ingest/consumer/pull.py +3 -8
- nucliadb/ingest/consumer/service.py +3 -3
- nucliadb/ingest/consumer/utils.py +1 -1
- nucliadb/ingest/fields/base.py +28 -49
- nucliadb/ingest/fields/conversation.py +12 -12
- nucliadb/ingest/fields/exceptions.py +1 -2
- nucliadb/ingest/fields/file.py +22 -8
- nucliadb/ingest/fields/link.py +7 -7
- nucliadb/ingest/fields/text.py +2 -3
- nucliadb/ingest/orm/brain_v2.py +78 -64
- nucliadb/ingest/orm/broker_message.py +2 -4
- nucliadb/ingest/orm/entities.py +10 -209
- nucliadb/ingest/orm/index_message.py +4 -4
- nucliadb/ingest/orm/knowledgebox.py +18 -27
- nucliadb/ingest/orm/processor/auditing.py +1 -3
- nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
- nucliadb/ingest/orm/processor/processor.py +27 -27
- nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
- nucliadb/ingest/orm/resource.py +72 -70
- nucliadb/ingest/orm/utils.py +1 -1
- nucliadb/ingest/processing.py +17 -17
- nucliadb/ingest/serialize.py +202 -145
- nucliadb/ingest/service/writer.py +3 -109
- nucliadb/ingest/settings.py +3 -4
- nucliadb/ingest/utils.py +1 -2
- nucliadb/learning_proxy.py +11 -11
- nucliadb/metrics_exporter.py +5 -4
- nucliadb/middleware/__init__.py +82 -1
- nucliadb/migrator/datamanager.py +3 -4
- nucliadb/migrator/migrator.py +1 -2
- nucliadb/migrator/models.py +1 -2
- nucliadb/migrator/settings.py +1 -2
- nucliadb/models/internal/augment.py +614 -0
- nucliadb/models/internal/processing.py +19 -19
- nucliadb/openapi.py +2 -2
- nucliadb/purge/__init__.py +3 -8
- nucliadb/purge/orphan_shards.py +1 -2
- nucliadb/reader/__init__.py +5 -0
- nucliadb/reader/api/models.py +6 -13
- nucliadb/reader/api/v1/download.py +59 -38
- nucliadb/reader/api/v1/export_import.py +4 -4
- nucliadb/reader/api/v1/learning_config.py +24 -4
- nucliadb/reader/api/v1/resource.py +61 -9
- nucliadb/reader/api/v1/services.py +18 -14
- nucliadb/reader/app.py +3 -1
- nucliadb/reader/reader/notifications.py +1 -2
- nucliadb/search/api/v1/__init__.py +2 -0
- nucliadb/search/api/v1/ask.py +3 -4
- nucliadb/search/api/v1/augment.py +585 -0
- nucliadb/search/api/v1/catalog.py +11 -15
- nucliadb/search/api/v1/find.py +16 -22
- nucliadb/search/api/v1/hydrate.py +25 -25
- nucliadb/search/api/v1/knowledgebox.py +1 -2
- nucliadb/search/api/v1/predict_proxy.py +1 -2
- nucliadb/search/api/v1/resource/ask.py +7 -7
- nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
- nucliadb/search/api/v1/resource/search.py +9 -11
- nucliadb/search/api/v1/retrieve.py +130 -0
- nucliadb/search/api/v1/search.py +28 -32
- nucliadb/search/api/v1/suggest.py +11 -14
- nucliadb/search/api/v1/summarize.py +1 -2
- nucliadb/search/api/v1/utils.py +2 -2
- nucliadb/search/app.py +3 -2
- nucliadb/search/augmentor/__init__.py +21 -0
- nucliadb/search/augmentor/augmentor.py +232 -0
- nucliadb/search/augmentor/fields.py +704 -0
- nucliadb/search/augmentor/metrics.py +24 -0
- nucliadb/search/augmentor/paragraphs.py +334 -0
- nucliadb/search/augmentor/resources.py +238 -0
- nucliadb/search/augmentor/utils.py +33 -0
- nucliadb/search/lifecycle.py +3 -1
- nucliadb/search/predict.py +24 -17
- nucliadb/search/predict_models.py +8 -9
- nucliadb/search/requesters/utils.py +11 -10
- nucliadb/search/search/cache.py +19 -23
- nucliadb/search/search/chat/ask.py +88 -59
- nucliadb/search/search/chat/exceptions.py +3 -5
- nucliadb/search/search/chat/fetcher.py +201 -0
- nucliadb/search/search/chat/images.py +6 -4
- nucliadb/search/search/chat/old_prompt.py +1375 -0
- nucliadb/search/search/chat/parser.py +510 -0
- nucliadb/search/search/chat/prompt.py +563 -615
- nucliadb/search/search/chat/query.py +449 -36
- nucliadb/search/search/chat/rpc.py +85 -0
- nucliadb/search/search/fetch.py +3 -4
- nucliadb/search/search/filters.py +8 -11
- nucliadb/search/search/find.py +33 -31
- nucliadb/search/search/find_merge.py +124 -331
- nucliadb/search/search/graph_strategy.py +14 -12
- nucliadb/search/search/hydrator/__init__.py +3 -152
- nucliadb/search/search/hydrator/fields.py +92 -50
- nucliadb/search/search/hydrator/images.py +7 -7
- nucliadb/search/search/hydrator/paragraphs.py +42 -26
- nucliadb/search/search/hydrator/resources.py +20 -16
- nucliadb/search/search/ingestion_agents.py +5 -5
- nucliadb/search/search/merge.py +90 -94
- nucliadb/search/search/metrics.py +10 -9
- nucliadb/search/search/paragraphs.py +7 -9
- nucliadb/search/search/predict_proxy.py +13 -9
- nucliadb/search/search/query.py +14 -86
- nucliadb/search/search/query_parser/fetcher.py +51 -82
- nucliadb/search/search/query_parser/models.py +19 -20
- nucliadb/search/search/query_parser/old_filters.py +20 -19
- nucliadb/search/search/query_parser/parsers/ask.py +4 -5
- nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
- nucliadb/search/search/query_parser/parsers/common.py +5 -6
- nucliadb/search/search/query_parser/parsers/find.py +6 -26
- nucliadb/search/search/query_parser/parsers/graph.py +13 -23
- nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
- nucliadb/search/search/query_parser/parsers/search.py +15 -53
- nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
- nucliadb/search/search/rank_fusion.py +18 -13
- nucliadb/search/search/rerankers.py +5 -6
- nucliadb/search/search/retrieval.py +300 -0
- nucliadb/search/search/summarize.py +5 -6
- nucliadb/search/search/utils.py +3 -4
- nucliadb/search/settings.py +1 -2
- nucliadb/standalone/api_router.py +1 -1
- nucliadb/standalone/app.py +4 -3
- nucliadb/standalone/auth.py +5 -6
- nucliadb/standalone/lifecycle.py +2 -2
- nucliadb/standalone/run.py +2 -4
- nucliadb/standalone/settings.py +5 -6
- nucliadb/standalone/versions.py +3 -4
- nucliadb/tasks/consumer.py +13 -8
- nucliadb/tasks/models.py +2 -1
- nucliadb/tasks/producer.py +3 -3
- nucliadb/tasks/retries.py +8 -7
- nucliadb/train/api/utils.py +1 -3
- nucliadb/train/api/v1/shards.py +1 -2
- nucliadb/train/api/v1/trainset.py +1 -2
- nucliadb/train/app.py +1 -1
- nucliadb/train/generator.py +4 -4
- nucliadb/train/generators/field_classifier.py +2 -2
- nucliadb/train/generators/field_streaming.py +6 -6
- nucliadb/train/generators/image_classifier.py +2 -2
- nucliadb/train/generators/paragraph_classifier.py +2 -2
- nucliadb/train/generators/paragraph_streaming.py +2 -2
- nucliadb/train/generators/question_answer_streaming.py +2 -2
- nucliadb/train/generators/sentence_classifier.py +2 -2
- nucliadb/train/generators/token_classifier.py +3 -2
- nucliadb/train/generators/utils.py +6 -5
- nucliadb/train/nodes.py +3 -3
- nucliadb/train/resource.py +6 -8
- nucliadb/train/settings.py +3 -4
- nucliadb/train/types.py +11 -11
- nucliadb/train/upload.py +3 -2
- nucliadb/train/uploader.py +1 -2
- nucliadb/train/utils.py +1 -2
- nucliadb/writer/api/v1/export_import.py +4 -1
- nucliadb/writer/api/v1/field.py +7 -11
- nucliadb/writer/api/v1/knowledgebox.py +3 -4
- nucliadb/writer/api/v1/resource.py +9 -20
- nucliadb/writer/api/v1/services.py +10 -132
- nucliadb/writer/api/v1/upload.py +73 -72
- nucliadb/writer/app.py +8 -2
- nucliadb/writer/resource/basic.py +12 -15
- nucliadb/writer/resource/field.py +7 -5
- nucliadb/writer/resource/origin.py +7 -0
- nucliadb/writer/settings.py +2 -3
- nucliadb/writer/tus/__init__.py +2 -3
- nucliadb/writer/tus/azure.py +1 -3
- nucliadb/writer/tus/dm.py +3 -3
- nucliadb/writer/tus/exceptions.py +3 -4
- nucliadb/writer/tus/gcs.py +5 -6
- nucliadb/writer/tus/s3.py +2 -3
- nucliadb/writer/tus/storage.py +3 -3
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
- nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
- nucliadb/common/datamanagers/entities.py +0 -139
- nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
- {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/fields/base.py
CHANGED
|
@@ -24,7 +24,7 @@ import enum
|
|
|
24
24
|
import logging
|
|
25
25
|
from collections import defaultdict
|
|
26
26
|
from datetime import datetime
|
|
27
|
-
from typing import TYPE_CHECKING, Any, Generic,
|
|
27
|
+
from typing import TYPE_CHECKING, Any, Generic, TypeVar
|
|
28
28
|
|
|
29
29
|
from google.protobuf.message import DecodeError, Message
|
|
30
30
|
|
|
@@ -47,10 +47,8 @@ from nucliadb_protos.resources_pb2 import (
|
|
|
47
47
|
)
|
|
48
48
|
from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
|
|
49
49
|
from nucliadb_protos.writer_pb2 import Error, FieldStatus
|
|
50
|
-
from nucliadb_utils import const
|
|
51
50
|
from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
|
|
52
51
|
from nucliadb_utils.storages.storage import Storage, StorageField
|
|
53
|
-
from nucliadb_utils.utilities import has_feature
|
|
54
52
|
|
|
55
53
|
logger = logging.getLogger(__name__)
|
|
56
54
|
|
|
@@ -77,27 +75,27 @@ PbType = TypeVar("PbType", bound=Message)
|
|
|
77
75
|
|
|
78
76
|
|
|
79
77
|
class Field(Generic[PbType]):
|
|
80
|
-
pbklass:
|
|
78
|
+
pbklass: type[PbType]
|
|
81
79
|
type: str = "x"
|
|
82
|
-
value:
|
|
83
|
-
extracted_text:
|
|
84
|
-
extracted_vectors: dict[
|
|
85
|
-
computed_metadata:
|
|
86
|
-
large_computed_metadata:
|
|
87
|
-
question_answers:
|
|
80
|
+
value: Any | None
|
|
81
|
+
extracted_text: ExtractedText | None
|
|
82
|
+
extracted_vectors: dict[str | None, VectorObject]
|
|
83
|
+
computed_metadata: FieldComputedMetadata | None
|
|
84
|
+
large_computed_metadata: LargeComputedMetadata | None
|
|
85
|
+
question_answers: FieldQuestionAnswers | None
|
|
88
86
|
|
|
89
87
|
def __init__(
|
|
90
88
|
self,
|
|
91
89
|
id: str,
|
|
92
90
|
resource: Resource,
|
|
93
|
-
pb:
|
|
94
|
-
value:
|
|
91
|
+
pb: Any | None = None,
|
|
92
|
+
value: Any | None = None,
|
|
95
93
|
):
|
|
96
94
|
if self.pbklass is None:
|
|
97
95
|
raise InvalidFieldClass()
|
|
98
96
|
|
|
99
97
|
self.value = None
|
|
100
|
-
self.extracted_text:
|
|
98
|
+
self.extracted_text: ExtractedText | None = None
|
|
101
99
|
self.extracted_vectors = {}
|
|
102
100
|
self.computed_metadata = None
|
|
103
101
|
self.large_computed_metadata = None
|
|
@@ -120,7 +118,7 @@ class Field(Generic[PbType]):
|
|
|
120
118
|
|
|
121
119
|
@property
|
|
122
120
|
def kbid(self) -> str:
|
|
123
|
-
return self.resource.
|
|
121
|
+
return self.resource.kbid
|
|
124
122
|
|
|
125
123
|
@property
|
|
126
124
|
def uuid(self) -> str:
|
|
@@ -161,7 +159,7 @@ class Field(Generic[PbType]):
|
|
|
161
159
|
|
|
162
160
|
return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
|
|
163
161
|
|
|
164
|
-
async def db_get_value(self) ->
|
|
162
|
+
async def db_get_value(self) -> PbType | None:
|
|
165
163
|
if self.value is None:
|
|
166
164
|
payload = await datamanagers.fields.get_raw(
|
|
167
165
|
self.resource.txn,
|
|
@@ -224,21 +222,6 @@ class Field(Generic[PbType]):
|
|
|
224
222
|
) -> None:
|
|
225
223
|
# Try delete vectors
|
|
226
224
|
sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
|
|
227
|
-
|
|
228
|
-
if has_feature(const.Features.DEBUG_MISSING_VECTORS):
|
|
229
|
-
# This is a very chatty log. It is just a temporary hint while debugging an issue.
|
|
230
|
-
logger.info(
|
|
231
|
-
"Deleting vectors from storage",
|
|
232
|
-
extra={
|
|
233
|
-
"kbid": self.kbid,
|
|
234
|
-
"rid": self.resource.uuid,
|
|
235
|
-
"field": f"{self.type}/{self.id}",
|
|
236
|
-
"vectorset": vectorset,
|
|
237
|
-
"storage_key_kind": storage_key_kind,
|
|
238
|
-
"key": sf.key,
|
|
239
|
-
"bucket": sf.bucket,
|
|
240
|
-
},
|
|
241
|
-
)
|
|
242
225
|
try:
|
|
243
226
|
await self.storage.delete_upload(sf.key, sf.bucket)
|
|
244
227
|
except KeyError:
|
|
@@ -251,7 +234,7 @@ class Field(Generic[PbType]):
|
|
|
251
234
|
except KeyError:
|
|
252
235
|
pass
|
|
253
236
|
|
|
254
|
-
async def get_error(self) ->
|
|
237
|
+
async def get_error(self) -> Error | None:
|
|
255
238
|
return await datamanagers.fields.get_error(
|
|
256
239
|
self.resource.txn,
|
|
257
240
|
kbid=self.kbid,
|
|
@@ -270,7 +253,7 @@ class Field(Generic[PbType]):
|
|
|
270
253
|
error=error,
|
|
271
254
|
)
|
|
272
255
|
|
|
273
|
-
async def get_status(self) ->
|
|
256
|
+
async def get_status(self) -> FieldStatus | None:
|
|
274
257
|
return await datamanagers.fields.get_status(
|
|
275
258
|
self.resource.txn,
|
|
276
259
|
kbid=self.kbid,
|
|
@@ -289,7 +272,7 @@ class Field(Generic[PbType]):
|
|
|
289
272
|
status=status,
|
|
290
273
|
)
|
|
291
274
|
|
|
292
|
-
async def get_question_answers(self, force=False) ->
|
|
275
|
+
async def get_question_answers(self, force=False) -> FieldQuestionAnswers | None:
|
|
293
276
|
if self.question_answers is None or force:
|
|
294
277
|
sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
|
|
295
278
|
try:
|
|
@@ -306,9 +289,7 @@ class Field(Generic[PbType]):
|
|
|
306
289
|
async def set_question_answers(self, payload: FieldQuestionAnswerWrapper) -> None:
|
|
307
290
|
if self.type in SUBFIELDFIELDS:
|
|
308
291
|
try:
|
|
309
|
-
actual_payload:
|
|
310
|
-
force=True
|
|
311
|
-
)
|
|
292
|
+
actual_payload: FieldQuestionAnswers | None = await self.get_question_answers(force=True)
|
|
312
293
|
except KeyError:
|
|
313
294
|
actual_payload = None
|
|
314
295
|
else:
|
|
@@ -341,7 +322,7 @@ class Field(Generic[PbType]):
|
|
|
341
322
|
self.question_answers = actual_payload
|
|
342
323
|
|
|
343
324
|
async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
|
|
344
|
-
actual_payload:
|
|
325
|
+
actual_payload: ExtractedText | None = None
|
|
345
326
|
if self.type in SUBFIELDFIELDS:
|
|
346
327
|
# Try to get the previously extracted text protobuf if it exists so we can merge it with the new splits
|
|
347
328
|
# coming from the processing payload.
|
|
@@ -392,7 +373,7 @@ class Field(Generic[PbType]):
|
|
|
392
373
|
await self.storage.upload_pb(sf, actual_payload)
|
|
393
374
|
self.extracted_text = actual_payload
|
|
394
375
|
|
|
395
|
-
async def get_extracted_text(self, force=False) ->
|
|
376
|
+
async def get_extracted_text(self, force=False) -> ExtractedText | None:
|
|
396
377
|
if self.extracted_text is None or force:
|
|
397
378
|
async with self.locks["extracted_text"]:
|
|
398
379
|
# Value could have been fetched while waiting for the lock
|
|
@@ -408,10 +389,10 @@ class Field(Generic[PbType]):
|
|
|
408
389
|
payload: ExtractedVectorsWrapper,
|
|
409
390
|
vectorset: str,
|
|
410
391
|
storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
|
|
411
|
-
) ->
|
|
392
|
+
) -> VectorObject | None:
|
|
412
393
|
if self.type in SUBFIELDFIELDS:
|
|
413
394
|
try:
|
|
414
|
-
actual_payload:
|
|
395
|
+
actual_payload: VectorObject | None = await self.get_vectors(
|
|
415
396
|
vectorset=vectorset,
|
|
416
397
|
storage_key_kind=storage_key_kind,
|
|
417
398
|
force=True,
|
|
@@ -422,7 +403,7 @@ class Field(Generic[PbType]):
|
|
|
422
403
|
actual_payload = None
|
|
423
404
|
|
|
424
405
|
sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
|
|
425
|
-
vo:
|
|
406
|
+
vo: VectorObject | None = None
|
|
426
407
|
if actual_payload is None:
|
|
427
408
|
# Its first extracted vectors
|
|
428
409
|
if payload.HasField("file"):
|
|
@@ -474,7 +455,7 @@ class Field(Generic[PbType]):
|
|
|
474
455
|
vectorset: str,
|
|
475
456
|
storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
|
|
476
457
|
force: bool = False,
|
|
477
|
-
) ->
|
|
458
|
+
) -> VectorObject | None:
|
|
478
459
|
if self.extracted_vectors.get(vectorset, None) is None or force:
|
|
479
460
|
sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
|
|
480
461
|
payload = await self.storage.download_pb(sf, VectorObject)
|
|
@@ -485,9 +466,7 @@ class Field(Generic[PbType]):
|
|
|
485
466
|
async def set_field_metadata(self, payload: FieldComputedMetadataWrapper) -> FieldComputedMetadata:
|
|
486
467
|
if self.type in SUBFIELDFIELDS:
|
|
487
468
|
try:
|
|
488
|
-
actual_payload:
|
|
489
|
-
force=True
|
|
490
|
-
)
|
|
469
|
+
actual_payload: FieldComputedMetadata | None = await self.get_field_metadata(force=True)
|
|
491
470
|
except KeyError:
|
|
492
471
|
actual_payload = None
|
|
493
472
|
else:
|
|
@@ -530,7 +509,7 @@ class Field(Generic[PbType]):
|
|
|
530
509
|
|
|
531
510
|
return self.computed_metadata
|
|
532
511
|
|
|
533
|
-
async def get_field_metadata(self, force: bool = False) ->
|
|
512
|
+
async def get_field_metadata(self, force: bool = False) -> FieldComputedMetadata | None:
|
|
534
513
|
if self.computed_metadata is None or force:
|
|
535
514
|
async with self.locks["field_metadata"]:
|
|
536
515
|
# Value could have been fetched while waiting for the lock
|
|
@@ -544,7 +523,7 @@ class Field(Generic[PbType]):
|
|
|
544
523
|
async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
|
|
545
524
|
if self.type in SUBFIELDFIELDS:
|
|
546
525
|
try:
|
|
547
|
-
actual_payload:
|
|
526
|
+
actual_payload: LargeComputedMetadata | None = await self.get_large_field_metadata(
|
|
548
527
|
force=True
|
|
549
528
|
)
|
|
550
529
|
except KeyError:
|
|
@@ -554,7 +533,7 @@ class Field(Generic[PbType]):
|
|
|
554
533
|
|
|
555
534
|
sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
|
|
556
535
|
|
|
557
|
-
new_payload:
|
|
536
|
+
new_payload: LargeComputedMetadata | None = None
|
|
558
537
|
if payload.HasField("file"):
|
|
559
538
|
new_payload = LargeComputedMetadata()
|
|
560
539
|
data = await self.storage.downloadbytescf(payload.file)
|
|
@@ -581,7 +560,7 @@ class Field(Generic[PbType]):
|
|
|
581
560
|
|
|
582
561
|
return self.large_computed_metadata
|
|
583
562
|
|
|
584
|
-
async def get_large_field_metadata(self, force: bool = False) ->
|
|
563
|
+
async def get_large_field_metadata(self, force: bool = False) -> LargeComputedMetadata | None:
|
|
585
564
|
if self.large_computed_metadata is None or force:
|
|
586
565
|
sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
|
|
587
566
|
payload = await self.storage.download_pb(
|
|
nucliadb/ingest/fields/conversation.py
CHANGED
|
@@ -18,14 +18,14 @@
|
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
20
|
import uuid
|
|
21
|
-
from typing import Any
|
|
21
|
+
from typing import Any
|
|
22
22
|
|
|
23
23
|
from nucliadb.ingest.fields.base import Field
|
|
24
|
-
from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation,
|
|
24
|
+
from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitsMetadata
|
|
25
25
|
from nucliadb_protos.resources_pb2 import Conversation as PBConversation
|
|
26
26
|
from nucliadb_utils.storages.storage import StorageField
|
|
27
27
|
|
|
28
|
-
MAX_CONVERSATION_MESSAGES =
|
|
28
|
+
MAX_CONVERSATION_MESSAGES = None # No limit
|
|
29
29
|
|
|
30
30
|
PAGE_SIZE = 200
|
|
31
31
|
|
|
@@ -42,7 +42,7 @@ class Conversation(Field[PBConversation]):
|
|
|
42
42
|
pbklass = PBConversation
|
|
43
43
|
type: str = "c"
|
|
44
44
|
value: dict[int, PBConversation]
|
|
45
|
-
metadata:
|
|
45
|
+
metadata: FieldConversation | None
|
|
46
46
|
|
|
47
47
|
_created: bool = False
|
|
48
48
|
|
|
@@ -50,12 +50,12 @@ class Conversation(Field[PBConversation]):
|
|
|
50
50
|
self,
|
|
51
51
|
id: str,
|
|
52
52
|
resource: Any,
|
|
53
|
-
pb:
|
|
54
|
-
value:
|
|
53
|
+
pb: Any | None = None,
|
|
54
|
+
value: dict[int, PBConversation] | None = None,
|
|
55
55
|
):
|
|
56
|
-
super(
|
|
56
|
+
super().__init__(id, resource, pb, value)
|
|
57
57
|
self.value = {}
|
|
58
|
-
self._splits_metadata:
|
|
58
|
+
self._splits_metadata: SplitsMetadata | None = None
|
|
59
59
|
self.metadata = None
|
|
60
60
|
|
|
61
61
|
async def delete_value(self):
|
|
@@ -76,7 +76,7 @@ class Conversation(Field[PBConversation]):
|
|
|
76
76
|
metadata.split_strategy = payload.split_strategy
|
|
77
77
|
|
|
78
78
|
# Get the last page if it exists
|
|
79
|
-
last_page:
|
|
79
|
+
last_page: PBConversation | None = None
|
|
80
80
|
if self._created is False and metadata.pages > 0:
|
|
81
81
|
try:
|
|
82
82
|
last_page = await self.db_get_value(page=metadata.pages)
|
|
@@ -92,7 +92,7 @@ class Conversation(Field[PBConversation]):
|
|
|
92
92
|
# to support the hybrid-onprem deployment as the attachments must be stored
|
|
93
93
|
# at the storage services of the client's premises.
|
|
94
94
|
for message in payload.messages:
|
|
95
|
-
self._splits_metadata.metadata.
|
|
95
|
+
self._splits_metadata.metadata.get_or_create(message.ident)
|
|
96
96
|
new_message_files = []
|
|
97
97
|
for idx, file in enumerate(message.content.attachments):
|
|
98
98
|
if self.storage.needs_move(file, self.kbid):
|
|
@@ -138,7 +138,7 @@ class Conversation(Field[PBConversation]):
|
|
|
138
138
|
await self.db_set_metadata(metadata)
|
|
139
139
|
await self.set_splits_metadata(self._splits_metadata)
|
|
140
140
|
|
|
141
|
-
async def get_value(self, page:
|
|
141
|
+
async def get_value(self, page: int | None = None) -> PBConversation | None:
|
|
142
142
|
# If no page was requested, force fetch of metadata
|
|
143
143
|
# and set the page to the last page
|
|
144
144
|
if page is None and self.metadata is None:
|
|
@@ -153,7 +153,7 @@ class Conversation(Field[PBConversation]):
|
|
|
153
153
|
except PageNotFound:
|
|
154
154
|
return None
|
|
155
155
|
|
|
156
|
-
async def get_full_conversation(self) ->
|
|
156
|
+
async def get_full_conversation(self) -> PBConversation | None:
|
|
157
157
|
"""
|
|
158
158
|
Messages of a conversations may be stored across several pages.
|
|
159
159
|
This method fetches them all and returns a single complete conversation.
|
|
nucliadb/ingest/fields/exceptions.py
CHANGED
|
@@ -17,7 +17,6 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Type
|
|
21
20
|
|
|
22
21
|
|
|
23
22
|
class InvalidFieldClass(Exception):
|
|
@@ -25,7 +24,7 @@ class InvalidFieldClass(Exception):
|
|
|
25
24
|
|
|
26
25
|
|
|
27
26
|
class InvalidPBClass(Exception):
|
|
28
|
-
def __init__(self, source:
|
|
27
|
+
def __init__(self, source: type, destination: type):
|
|
29
28
|
self.source = source
|
|
30
29
|
self.destination = destination
|
|
31
30
|
super().__init__(f"Source and destination does not match {self.source} - {self.destination}")
|
nucliadb/ingest/fields/file.py
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Any
|
|
20
|
+
from typing import Any
|
|
21
21
|
|
|
22
22
|
from nucliadb.ingest.fields.base import Field
|
|
23
23
|
from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
|
|
@@ -30,22 +30,22 @@ class File(Field[FieldFile]):
|
|
|
30
30
|
pbklass = FieldFile
|
|
31
31
|
value: FieldFile
|
|
32
32
|
type: str = "f"
|
|
33
|
-
file_extracted_data:
|
|
33
|
+
file_extracted_data: FileExtractedData | None
|
|
34
34
|
|
|
35
35
|
def __init__(
|
|
36
36
|
self,
|
|
37
37
|
id: str,
|
|
38
38
|
resource: Any,
|
|
39
|
-
pb:
|
|
40
|
-
value:
|
|
39
|
+
pb: Any | None = None,
|
|
40
|
+
value: str | None = None,
|
|
41
41
|
):
|
|
42
|
-
super(
|
|
42
|
+
super().__init__(id, resource, pb, value)
|
|
43
43
|
self.file_extracted_data = None
|
|
44
44
|
|
|
45
45
|
async def set_value(self, payload: FieldFile):
|
|
46
46
|
old_file = await self.get_value()
|
|
47
47
|
if old_file is None:
|
|
48
|
-
old_cf:
|
|
48
|
+
old_cf: CloudFile | None = None
|
|
49
49
|
else:
|
|
50
50
|
old_cf = old_file.file
|
|
51
51
|
|
|
@@ -57,7 +57,7 @@ class File(Field[FieldFile]):
|
|
|
57
57
|
|
|
58
58
|
await self.db_set_value(payload)
|
|
59
59
|
|
|
60
|
-
async def get_value(self) ->
|
|
60
|
+
async def get_value(self) -> FieldFile | None:
|
|
61
61
|
return await self.db_get_value()
|
|
62
62
|
|
|
63
63
|
async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
|
|
@@ -101,10 +101,24 @@ class File(Field[FieldFile]):
|
|
|
101
101
|
await self.storage.upload_pb(sf, file_extracted_data)
|
|
102
102
|
self.file_extracted_data = file_extracted_data
|
|
103
103
|
|
|
104
|
-
async def get_file_extracted_data(self) ->
|
|
104
|
+
async def get_file_extracted_data(self) -> FileExtractedData | None:
|
|
105
105
|
if self.file_extracted_data is None:
|
|
106
106
|
sf: StorageField = self.storage.file_extracted(
|
|
107
107
|
self.kbid, self.uuid, self.type, self.id, FILE_METADATA
|
|
108
108
|
)
|
|
109
109
|
self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
|
|
110
110
|
return self.file_extracted_data
|
|
111
|
+
|
|
112
|
+
async def thumbnail(self) -> StorageField | None:
|
|
113
|
+
"""Access the file field thumbnail."""
|
|
114
|
+
fed = await self.get_file_extracted_data()
|
|
115
|
+
if fed is None:
|
|
116
|
+
return None
|
|
117
|
+
if not fed.HasField("file_thumbnail"):
|
|
118
|
+
return None
|
|
119
|
+
|
|
120
|
+
sf: StorageField = self.storage.file_extracted(
|
|
121
|
+
self.kbid, self.uuid, self.type, self.id, "file_thumbnail"
|
|
122
|
+
)
|
|
123
|
+
sf.field = fed.file_thumbnail
|
|
124
|
+
return sf
|
nucliadb/ingest/fields/link.py
CHANGED
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
# You should have received a copy of the GNU Affero General Public License
|
|
18
18
|
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
19
19
|
#
|
|
20
|
-
from typing import Any
|
|
20
|
+
from typing import Any
|
|
21
21
|
|
|
22
22
|
from nucliadb.ingest.fields.base import Field
|
|
23
23
|
from nucliadb_protos.resources_pb2 import CloudFile, FieldLink, LinkExtractedData
|
|
@@ -30,22 +30,22 @@ class Link(Field[FieldLink]):
|
|
|
30
30
|
pbklass = FieldLink
|
|
31
31
|
value: FieldLink
|
|
32
32
|
type: str = "u"
|
|
33
|
-
link_extracted_data:
|
|
33
|
+
link_extracted_data: LinkExtractedData | None
|
|
34
34
|
|
|
35
35
|
def __init__(
|
|
36
36
|
self,
|
|
37
37
|
id: str,
|
|
38
38
|
resource: Any,
|
|
39
|
-
pb:
|
|
40
|
-
value:
|
|
39
|
+
pb: Any | None = None,
|
|
40
|
+
value: str | None = None,
|
|
41
41
|
):
|
|
42
|
-
super(
|
|
42
|
+
super().__init__(id, resource, pb, value)
|
|
43
43
|
self.link_extracted_data = None
|
|
44
44
|
|
|
45
45
|
async def set_value(self, payload: FieldLink):
|
|
46
46
|
await self.db_set_value(payload)
|
|
47
47
|
|
|
48
|
-
async def get_value(self) ->
|
|
48
|
+
async def get_value(self) -> FieldLink | None:
|
|
49
49
|
return await self.db_get_value()
|
|
50
50
|
|
|
51
51
|
async def set_link_extracted_data(self, link_extracted_data: LinkExtractedData):
|
|
@@ -88,7 +88,7 @@ class Link(Field[FieldLink]):
|
|
|
88
88
|
await self.storage.upload_pb(sf, link_extracted_data)
|
|
89
89
|
self.link_extracted_data = link_extracted_data
|
|
90
90
|
|
|
91
|
-
async def get_link_extracted_data(self) ->
|
|
91
|
+
async def get_link_extracted_data(self) -> LinkExtractedData | None:
|
|
92
92
|
if self.link_extracted_data is None:
|
|
93
93
|
sf: StorageField = self.storage.file_extracted(
|
|
94
94
|
self.kbid, self.uuid, self.type, self.id, LINK_METADATA
|
nucliadb/ingest/fields/text.py
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
#
|
|
20
20
|
|
|
21
21
|
import hashlib
|
|
22
|
-
from typing import Optional
|
|
23
22
|
|
|
24
23
|
from nucliadb.ingest.fields.base import Field
|
|
25
24
|
from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
|
|
@@ -39,8 +38,8 @@ class Text(Field[FieldText]):
|
|
|
39
38
|
|
|
40
39
|
async def set_value(self, payload: FieldText):
|
|
41
40
|
if payload.md5 == "":
|
|
42
|
-
payload.md5 = hashlib.md5(payload.body.encode()).hexdigest()
|
|
41
|
+
payload.md5 = hashlib.md5(payload.body.encode(), usedforsecurity=False).hexdigest()
|
|
43
42
|
await self.db_set_value(payload)
|
|
44
43
|
|
|
45
|
-
async def get_value(self) ->
|
|
44
|
+
async def get_value(self) -> FieldText | None:
|
|
46
45
|
return await self.db_get_value()
|