nucliadb 6.9.1.post5192__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. migrations/0023_backfill_pg_catalog.py +2 -2
  2. migrations/0029_backfill_field_status.py +3 -4
  3. migrations/0032_remove_old_relations.py +2 -3
  4. migrations/0038_backfill_catalog_field_labels.py +2 -2
  5. migrations/0039_backfill_converation_splits_metadata.py +2 -2
  6. migrations/0041_reindex_conversations.py +137 -0
  7. migrations/pg/0010_shards_index.py +34 -0
  8. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  9. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  10. nucliadb/backups/create.py +2 -15
  11. nucliadb/backups/restore.py +4 -15
  12. nucliadb/backups/tasks.py +4 -1
  13. nucliadb/common/back_pressure/cache.py +2 -3
  14. nucliadb/common/back_pressure/materializer.py +7 -13
  15. nucliadb/common/back_pressure/settings.py +6 -6
  16. nucliadb/common/back_pressure/utils.py +1 -0
  17. nucliadb/common/cache.py +9 -9
  18. nucliadb/common/catalog/interface.py +12 -12
  19. nucliadb/common/catalog/pg.py +41 -29
  20. nucliadb/common/catalog/utils.py +3 -3
  21. nucliadb/common/cluster/manager.py +5 -4
  22. nucliadb/common/cluster/rebalance.py +483 -114
  23. nucliadb/common/cluster/rollover.py +25 -9
  24. nucliadb/common/cluster/settings.py +3 -8
  25. nucliadb/common/cluster/utils.py +34 -8
  26. nucliadb/common/context/__init__.py +7 -8
  27. nucliadb/common/context/fastapi.py +1 -2
  28. nucliadb/common/datamanagers/__init__.py +2 -4
  29. nucliadb/common/datamanagers/atomic.py +4 -2
  30. nucliadb/common/datamanagers/cluster.py +1 -2
  31. nucliadb/common/datamanagers/fields.py +3 -4
  32. nucliadb/common/datamanagers/kb.py +6 -6
  33. nucliadb/common/datamanagers/labels.py +2 -3
  34. nucliadb/common/datamanagers/resources.py +10 -33
  35. nucliadb/common/datamanagers/rollover.py +5 -7
  36. nucliadb/common/datamanagers/search_configurations.py +1 -2
  37. nucliadb/common/datamanagers/synonyms.py +1 -2
  38. nucliadb/common/datamanagers/utils.py +4 -4
  39. nucliadb/common/datamanagers/vectorsets.py +4 -4
  40. nucliadb/common/external_index_providers/base.py +32 -5
  41. nucliadb/common/external_index_providers/manager.py +4 -5
  42. nucliadb/common/filter_expression.py +128 -40
  43. nucliadb/common/http_clients/processing.py +12 -23
  44. nucliadb/common/ids.py +6 -4
  45. nucliadb/common/locking.py +1 -2
  46. nucliadb/common/maindb/driver.py +9 -8
  47. nucliadb/common/maindb/local.py +5 -5
  48. nucliadb/common/maindb/pg.py +9 -8
  49. nucliadb/common/nidx.py +3 -4
  50. nucliadb/export_import/datamanager.py +4 -3
  51. nucliadb/export_import/exporter.py +11 -19
  52. nucliadb/export_import/importer.py +13 -6
  53. nucliadb/export_import/tasks.py +2 -0
  54. nucliadb/export_import/utils.py +6 -18
  55. nucliadb/health.py +2 -2
  56. nucliadb/ingest/app.py +8 -8
  57. nucliadb/ingest/consumer/consumer.py +8 -10
  58. nucliadb/ingest/consumer/pull.py +3 -8
  59. nucliadb/ingest/consumer/service.py +3 -3
  60. nucliadb/ingest/consumer/utils.py +1 -1
  61. nucliadb/ingest/fields/base.py +28 -49
  62. nucliadb/ingest/fields/conversation.py +12 -12
  63. nucliadb/ingest/fields/exceptions.py +1 -2
  64. nucliadb/ingest/fields/file.py +22 -8
  65. nucliadb/ingest/fields/link.py +7 -7
  66. nucliadb/ingest/fields/text.py +2 -3
  67. nucliadb/ingest/orm/brain_v2.py +78 -64
  68. nucliadb/ingest/orm/broker_message.py +2 -4
  69. nucliadb/ingest/orm/entities.py +10 -209
  70. nucliadb/ingest/orm/index_message.py +4 -4
  71. nucliadb/ingest/orm/knowledgebox.py +18 -27
  72. nucliadb/ingest/orm/processor/auditing.py +1 -3
  73. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  74. nucliadb/ingest/orm/processor/processor.py +27 -27
  75. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  76. nucliadb/ingest/orm/resource.py +72 -70
  77. nucliadb/ingest/orm/utils.py +1 -1
  78. nucliadb/ingest/processing.py +17 -17
  79. nucliadb/ingest/serialize.py +202 -145
  80. nucliadb/ingest/service/writer.py +3 -109
  81. nucliadb/ingest/settings.py +3 -4
  82. nucliadb/ingest/utils.py +1 -2
  83. nucliadb/learning_proxy.py +11 -11
  84. nucliadb/metrics_exporter.py +5 -4
  85. nucliadb/middleware/__init__.py +82 -1
  86. nucliadb/migrator/datamanager.py +3 -4
  87. nucliadb/migrator/migrator.py +1 -2
  88. nucliadb/migrator/models.py +1 -2
  89. nucliadb/migrator/settings.py +1 -2
  90. nucliadb/models/internal/augment.py +614 -0
  91. nucliadb/models/internal/processing.py +19 -19
  92. nucliadb/openapi.py +2 -2
  93. nucliadb/purge/__init__.py +3 -8
  94. nucliadb/purge/orphan_shards.py +1 -2
  95. nucliadb/reader/__init__.py +5 -0
  96. nucliadb/reader/api/models.py +6 -13
  97. nucliadb/reader/api/v1/download.py +59 -38
  98. nucliadb/reader/api/v1/export_import.py +4 -4
  99. nucliadb/reader/api/v1/learning_config.py +24 -4
  100. nucliadb/reader/api/v1/resource.py +61 -9
  101. nucliadb/reader/api/v1/services.py +18 -14
  102. nucliadb/reader/app.py +3 -1
  103. nucliadb/reader/reader/notifications.py +1 -2
  104. nucliadb/search/api/v1/__init__.py +2 -0
  105. nucliadb/search/api/v1/ask.py +3 -4
  106. nucliadb/search/api/v1/augment.py +585 -0
  107. nucliadb/search/api/v1/catalog.py +11 -15
  108. nucliadb/search/api/v1/find.py +16 -22
  109. nucliadb/search/api/v1/hydrate.py +25 -25
  110. nucliadb/search/api/v1/knowledgebox.py +1 -2
  111. nucliadb/search/api/v1/predict_proxy.py +1 -2
  112. nucliadb/search/api/v1/resource/ask.py +7 -7
  113. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  114. nucliadb/search/api/v1/resource/search.py +9 -11
  115. nucliadb/search/api/v1/retrieve.py +130 -0
  116. nucliadb/search/api/v1/search.py +28 -32
  117. nucliadb/search/api/v1/suggest.py +11 -14
  118. nucliadb/search/api/v1/summarize.py +1 -2
  119. nucliadb/search/api/v1/utils.py +2 -2
  120. nucliadb/search/app.py +3 -2
  121. nucliadb/search/augmentor/__init__.py +21 -0
  122. nucliadb/search/augmentor/augmentor.py +232 -0
  123. nucliadb/search/augmentor/fields.py +704 -0
  124. nucliadb/search/augmentor/metrics.py +24 -0
  125. nucliadb/search/augmentor/paragraphs.py +334 -0
  126. nucliadb/search/augmentor/resources.py +238 -0
  127. nucliadb/search/augmentor/utils.py +33 -0
  128. nucliadb/search/lifecycle.py +3 -1
  129. nucliadb/search/predict.py +24 -17
  130. nucliadb/search/predict_models.py +8 -9
  131. nucliadb/search/requesters/utils.py +11 -10
  132. nucliadb/search/search/cache.py +19 -23
  133. nucliadb/search/search/chat/ask.py +88 -59
  134. nucliadb/search/search/chat/exceptions.py +3 -5
  135. nucliadb/search/search/chat/fetcher.py +201 -0
  136. nucliadb/search/search/chat/images.py +6 -4
  137. nucliadb/search/search/chat/old_prompt.py +1375 -0
  138. nucliadb/search/search/chat/parser.py +510 -0
  139. nucliadb/search/search/chat/prompt.py +563 -615
  140. nucliadb/search/search/chat/query.py +449 -36
  141. nucliadb/search/search/chat/rpc.py +85 -0
  142. nucliadb/search/search/fetch.py +3 -4
  143. nucliadb/search/search/filters.py +8 -11
  144. nucliadb/search/search/find.py +33 -31
  145. nucliadb/search/search/find_merge.py +124 -331
  146. nucliadb/search/search/graph_strategy.py +14 -12
  147. nucliadb/search/search/hydrator/__init__.py +3 -152
  148. nucliadb/search/search/hydrator/fields.py +92 -50
  149. nucliadb/search/search/hydrator/images.py +7 -7
  150. nucliadb/search/search/hydrator/paragraphs.py +42 -26
  151. nucliadb/search/search/hydrator/resources.py +20 -16
  152. nucliadb/search/search/ingestion_agents.py +5 -5
  153. nucliadb/search/search/merge.py +90 -94
  154. nucliadb/search/search/metrics.py +10 -9
  155. nucliadb/search/search/paragraphs.py +7 -9
  156. nucliadb/search/search/predict_proxy.py +13 -9
  157. nucliadb/search/search/query.py +14 -86
  158. nucliadb/search/search/query_parser/fetcher.py +51 -82
  159. nucliadb/search/search/query_parser/models.py +19 -20
  160. nucliadb/search/search/query_parser/old_filters.py +20 -19
  161. nucliadb/search/search/query_parser/parsers/ask.py +4 -5
  162. nucliadb/search/search/query_parser/parsers/catalog.py +5 -6
  163. nucliadb/search/search/query_parser/parsers/common.py +5 -6
  164. nucliadb/search/search/query_parser/parsers/find.py +6 -26
  165. nucliadb/search/search/query_parser/parsers/graph.py +13 -23
  166. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  167. nucliadb/search/search/query_parser/parsers/search.py +15 -53
  168. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  169. nucliadb/search/search/rank_fusion.py +18 -13
  170. nucliadb/search/search/rerankers.py +5 -6
  171. nucliadb/search/search/retrieval.py +300 -0
  172. nucliadb/search/search/summarize.py +5 -6
  173. nucliadb/search/search/utils.py +3 -4
  174. nucliadb/search/settings.py +1 -2
  175. nucliadb/standalone/api_router.py +1 -1
  176. nucliadb/standalone/app.py +4 -3
  177. nucliadb/standalone/auth.py +5 -6
  178. nucliadb/standalone/lifecycle.py +2 -2
  179. nucliadb/standalone/run.py +2 -4
  180. nucliadb/standalone/settings.py +5 -6
  181. nucliadb/standalone/versions.py +3 -4
  182. nucliadb/tasks/consumer.py +13 -8
  183. nucliadb/tasks/models.py +2 -1
  184. nucliadb/tasks/producer.py +3 -3
  185. nucliadb/tasks/retries.py +8 -7
  186. nucliadb/train/api/utils.py +1 -3
  187. nucliadb/train/api/v1/shards.py +1 -2
  188. nucliadb/train/api/v1/trainset.py +1 -2
  189. nucliadb/train/app.py +1 -1
  190. nucliadb/train/generator.py +4 -4
  191. nucliadb/train/generators/field_classifier.py +2 -2
  192. nucliadb/train/generators/field_streaming.py +6 -6
  193. nucliadb/train/generators/image_classifier.py +2 -2
  194. nucliadb/train/generators/paragraph_classifier.py +2 -2
  195. nucliadb/train/generators/paragraph_streaming.py +2 -2
  196. nucliadb/train/generators/question_answer_streaming.py +2 -2
  197. nucliadb/train/generators/sentence_classifier.py +2 -2
  198. nucliadb/train/generators/token_classifier.py +3 -2
  199. nucliadb/train/generators/utils.py +6 -5
  200. nucliadb/train/nodes.py +3 -3
  201. nucliadb/train/resource.py +6 -8
  202. nucliadb/train/settings.py +3 -4
  203. nucliadb/train/types.py +11 -11
  204. nucliadb/train/upload.py +3 -2
  205. nucliadb/train/uploader.py +1 -2
  206. nucliadb/train/utils.py +1 -2
  207. nucliadb/writer/api/v1/export_import.py +4 -1
  208. nucliadb/writer/api/v1/field.py +7 -11
  209. nucliadb/writer/api/v1/knowledgebox.py +3 -4
  210. nucliadb/writer/api/v1/resource.py +9 -20
  211. nucliadb/writer/api/v1/services.py +10 -132
  212. nucliadb/writer/api/v1/upload.py +73 -72
  213. nucliadb/writer/app.py +8 -2
  214. nucliadb/writer/resource/basic.py +12 -15
  215. nucliadb/writer/resource/field.py +7 -5
  216. nucliadb/writer/resource/origin.py +7 -0
  217. nucliadb/writer/settings.py +2 -3
  218. nucliadb/writer/tus/__init__.py +2 -3
  219. nucliadb/writer/tus/azure.py +1 -3
  220. nucliadb/writer/tus/dm.py +3 -3
  221. nucliadb/writer/tus/exceptions.py +3 -4
  222. nucliadb/writer/tus/gcs.py +5 -6
  223. nucliadb/writer/tus/s3.py +2 -3
  224. nucliadb/writer/tus/storage.py +3 -3
  225. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +9 -10
  226. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  227. nucliadb/common/datamanagers/entities.py +0 -139
  228. nucliadb-6.9.1.post5192.dist-info/RECORD +0 -392
  229. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  230. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  231. {nucliadb-6.9.1.post5192.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
nucliadb/ingest/fields/base.py
@@ -24,7 +24,7 @@ import enum
  import logging
  from collections import defaultdict
  from datetime import datetime
- from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar
+ from typing import TYPE_CHECKING, Any, Generic, TypeVar

  from google.protobuf.message import DecodeError, Message

@@ -47,10 +47,8 @@ from nucliadb_protos.resources_pb2 import (
  )
  from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
  from nucliadb_protos.writer_pb2 import Error, FieldStatus
- from nucliadb_utils import const
  from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
  from nucliadb_utils.storages.storage import Storage, StorageField
- from nucliadb_utils.utilities import has_feature

  logger = logging.getLogger(__name__)

@@ -77,27 +75,27 @@ PbType = TypeVar("PbType", bound=Message)


  class Field(Generic[PbType]):
- pbklass: Type[PbType]
+ pbklass: type[PbType]
  type: str = "x"
- value: Optional[Any]
- extracted_text: Optional[ExtractedText]
- extracted_vectors: dict[Optional[str], VectorObject]
- computed_metadata: Optional[FieldComputedMetadata]
- large_computed_metadata: Optional[LargeComputedMetadata]
- question_answers: Optional[FieldQuestionAnswers]
+ value: Any | None
+ extracted_text: ExtractedText | None
+ extracted_vectors: dict[str | None, VectorObject]
+ computed_metadata: FieldComputedMetadata | None
+ large_computed_metadata: LargeComputedMetadata | None
+ question_answers: FieldQuestionAnswers | None

  def __init__(
  self,
  id: str,
  resource: Resource,
- pb: Optional[Any] = None,
- value: Optional[Any] = None,
+ pb: Any | None = None,
+ value: Any | None = None,
  ):
  if self.pbklass is None:
  raise InvalidFieldClass()

  self.value = None
- self.extracted_text: Optional[ExtractedText] = None
+ self.extracted_text: ExtractedText | None = None
  self.extracted_vectors = {}
  self.computed_metadata = None
  self.large_computed_metadata = None
@@ -120,7 +118,7 @@ class Field(Generic[PbType]):

  @property
  def kbid(self) -> str:
- return self.resource.kb.kbid
+ return self.resource.kbid

  @property
  def uuid(self) -> str:
@@ -161,7 +159,7 @@ class Field(Generic[PbType]):

  return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)

- async def db_get_value(self) -> Optional[PbType]:
+ async def db_get_value(self) -> PbType | None:
  if self.value is None:
  payload = await datamanagers.fields.get_raw(
  self.resource.txn,
@@ -224,21 +222,6 @@ class Field(Generic[PbType]):
  ) -> None:
  # Try delete vectors
  sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
-
- if has_feature(const.Features.DEBUG_MISSING_VECTORS):
- # This is a very chatty log. It is just a temporary hint while debugging an issue.
- logger.info(
- "Deleting vectors from storage",
- extra={
- "kbid": self.kbid,
- "rid": self.resource.uuid,
- "field": f"{self.type}/{self.id}",
- "vectorset": vectorset,
- "storage_key_kind": storage_key_kind,
- "key": sf.key,
- "bucket": sf.bucket,
- },
- )
  try:
  await self.storage.delete_upload(sf.key, sf.bucket)
  except KeyError:
@@ -251,7 +234,7 @@ class Field(Generic[PbType]):
  except KeyError:
  pass

- async def get_error(self) -> Optional[Error]:
+ async def get_error(self) -> Error | None:
  return await datamanagers.fields.get_error(
  self.resource.txn,
  kbid=self.kbid,
@@ -270,7 +253,7 @@ class Field(Generic[PbType]):
  error=error,
  )

- async def get_status(self) -> Optional[FieldStatus]:
+ async def get_status(self) -> FieldStatus | None:
  return await datamanagers.fields.get_status(
  self.resource.txn,
  kbid=self.kbid,
@@ -289,7 +272,7 @@ class Field(Generic[PbType]):
  status=status,
  )

- async def get_question_answers(self, force=False) -> Optional[FieldQuestionAnswers]:
+ async def get_question_answers(self, force=False) -> FieldQuestionAnswers | None:
  if self.question_answers is None or force:
  sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
  try:
@@ -306,9 +289,7 @@ class Field(Generic[PbType]):
  async def set_question_answers(self, payload: FieldQuestionAnswerWrapper) -> None:
  if self.type in SUBFIELDFIELDS:
  try:
- actual_payload: Optional[FieldQuestionAnswers] = await self.get_question_answers(
- force=True
- )
+ actual_payload: FieldQuestionAnswers | None = await self.get_question_answers(force=True)
  except KeyError:
  actual_payload = None
  else:
@@ -341,7 +322,7 @@ class Field(Generic[PbType]):
  self.question_answers = actual_payload

  async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
- actual_payload: Optional[ExtractedText] = None
+ actual_payload: ExtractedText | None = None
  if self.type in SUBFIELDFIELDS:
  # Try to get the previously extracted text protobuf if it exists so we can merge it with the new splits
  # coming from the processing payload.
@@ -392,7 +373,7 @@ class Field(Generic[PbType]):
  await self.storage.upload_pb(sf, actual_payload)
  self.extracted_text = actual_payload

- async def get_extracted_text(self, force=False) -> Optional[ExtractedText]:
+ async def get_extracted_text(self, force=False) -> ExtractedText | None:
  if self.extracted_text is None or force:
  async with self.locks["extracted_text"]:
  # Value could have been fetched while waiting for the lock
@@ -408,10 +389,10 @@ class Field(Generic[PbType]):
  payload: ExtractedVectorsWrapper,
  vectorset: str,
  storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
- ) -> Optional[VectorObject]:
+ ) -> VectorObject | None:
  if self.type in SUBFIELDFIELDS:
  try:
- actual_payload: Optional[VectorObject] = await self.get_vectors(
+ actual_payload: VectorObject | None = await self.get_vectors(
  vectorset=vectorset,
  storage_key_kind=storage_key_kind,
  force=True,
@@ -422,7 +403,7 @@ class Field(Generic[PbType]):
  actual_payload = None

  sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
- vo: Optional[VectorObject] = None
+ vo: VectorObject | None = None
  if actual_payload is None:
  # Its first extracted vectors
  if payload.HasField("file"):
@@ -474,7 +455,7 @@ class Field(Generic[PbType]):
  vectorset: str,
  storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
  force: bool = False,
- ) -> Optional[VectorObject]:
+ ) -> VectorObject | None:
  if self.extracted_vectors.get(vectorset, None) is None or force:
  sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
  payload = await self.storage.download_pb(sf, VectorObject)
@@ -485,9 +466,7 @@ class Field(Generic[PbType]):
  async def set_field_metadata(self, payload: FieldComputedMetadataWrapper) -> FieldComputedMetadata:
  if self.type in SUBFIELDFIELDS:
  try:
- actual_payload: Optional[FieldComputedMetadata] = await self.get_field_metadata(
- force=True
- )
+ actual_payload: FieldComputedMetadata | None = await self.get_field_metadata(force=True)
  except KeyError:
  actual_payload = None
  else:
@@ -530,7 +509,7 @@ class Field(Generic[PbType]):

  return self.computed_metadata

- async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
+ async def get_field_metadata(self, force: bool = False) -> FieldComputedMetadata | None:
  if self.computed_metadata is None or force:
  async with self.locks["field_metadata"]:
  # Value could have been fetched while waiting for the lock
@@ -544,7 +523,7 @@ class Field(Generic[PbType]):
  async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
  if self.type in SUBFIELDFIELDS:
  try:
- actual_payload: Optional[LargeComputedMetadata] = await self.get_large_field_metadata(
+ actual_payload: LargeComputedMetadata | None = await self.get_large_field_metadata(
  force=True
  )
  except KeyError:
@@ -554,7 +533,7 @@ class Field(Generic[PbType]):
  except KeyError:
  pass

  sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)

- new_payload: Optional[LargeComputedMetadata] = None
+ new_payload: LargeComputedMetadata | None = None
@@ -581,7 +560,7 @@ class Field(Generic[PbType]):

  return self.large_computed_metadata

- async def get_large_field_metadata(self, force: bool = False) -> Optional[LargeComputedMetadata]:
+ async def get_large_field_metadata(self, force: bool = False) -> LargeComputedMetadata | None:
  if self.large_computed_metadata is None or force:
  sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
  payload = await self.storage.download_pb(
nucliadb/ingest/fields/conversation.py
@@ -18,14 +18,14 @@
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
  import uuid
- from typing import Any, Optional
+ from typing import Any

  from nucliadb.ingest.fields.base import Field
- from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitMetadata, SplitsMetadata
+ from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation, SplitsMetadata
  from nucliadb_protos.resources_pb2 import Conversation as PBConversation
  from nucliadb_utils.storages.storage import StorageField

- MAX_CONVERSATION_MESSAGES = 50 * 1024
+ MAX_CONVERSATION_MESSAGES = None # No limit

  PAGE_SIZE = 200

@@ -42,7 +42,7 @@ class Conversation(Field[PBConversation]):
  pbklass = PBConversation
  type: str = "c"
  value: dict[int, PBConversation]
- metadata: Optional[FieldConversation]
+ metadata: FieldConversation | None

  _created: bool = False

@@ -50,12 +50,12 @@ class Conversation(Field[PBConversation]):
  self,
  id: str,
  resource: Any,
- pb: Optional[Any] = None,
- value: Optional[dict[int, PBConversation]] = None,
+ pb: Any | None = None,
+ value: dict[int, PBConversation] | None = None,
  ):
- super(Conversation, self).__init__(id, resource, pb, value)
+ super().__init__(id, resource, pb, value)
  self.value = {}
- self._splits_metadata: Optional[SplitsMetadata] = None
+ self._splits_metadata: SplitsMetadata | None = None
  self.metadata = None

  async def delete_value(self):
@@ -76,7 +76,7 @@ class Conversation(Field[PBConversation]):
  metadata.split_strategy = payload.split_strategy

  # Get the last page if it exists
- last_page: Optional[PBConversation] = None
+ last_page: PBConversation | None = None
  if self._created is False and metadata.pages > 0:
  try:
  last_page = await self.db_get_value(page=metadata.pages)
@@ -92,7 +92,7 @@ class Conversation(Field[PBConversation]):
  # to support the hybrid-onprem deployment as the attachments must be stored
  # at the storage services of the client's premises.
  for message in payload.messages:
- self._splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
+ self._splits_metadata.metadata.get_or_create(message.ident)
  new_message_files = []
  for idx, file in enumerate(message.content.attachments):
  if self.storage.needs_move(file, self.kbid):
@@ -138,7 +138,7 @@ class Conversation(Field[PBConversation]):
  await self.db_set_metadata(metadata)
  await self.set_splits_metadata(self._splits_metadata)

- async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
+ async def get_value(self, page: int | None = None) -> PBConversation | None:
  # If no page was requested, force fetch of metadata
  # and set the page to the last page
  if page is None and self.metadata is None:
@@ -153,7 +153,7 @@ class Conversation(Field[PBConversation]):
  except PageNotFound:
  return None

- async def get_full_conversation(self) -> Optional[PBConversation]:
+ async def get_full_conversation(self) -> PBConversation | None:
  """
  Messages of a conversations may be stored across several pages.
  This method fetches them all and returns a single complete conversation.
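
One behavioral detail in the conversation change above: SplitsMetadata.metadata is a protobuf map whose values are messages, and message-valued map fields in the protobuf Python runtime reject dict-style insertion, which is presumably why setdefault(message.ident, SplitMetadata()) gives way to get_or_create(message.ident). A minimal sketch of the map semantics, using Struct as a stand-in message with a message-valued map rather than the actual nucliadb protos:

from google.protobuf.struct_pb2 import Struct, Value

s = Struct()

# Direct submessage assignment into a message-valued map raises ValueError
# in the protobuf Python runtime, so dict-style setdefault() cannot insert.
try:
    s.fields["m1"] = Value()
except ValueError:
    pass

# get_or_create() returns the existing entry or creates an empty one in place,
# which is the idiom the new conversation code relies on.
entry = s.fields.get_or_create("m1")
entry.string_value = "first message"
assert s.fields["m1"].string_value == "first message"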
nucliadb/ingest/fields/exceptions.py
@@ -17,7 +17,6 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- from typing import Type


  class InvalidFieldClass(Exception):
@@ -25,7 +24,7 @@ class InvalidFieldClass(Exception):


  class InvalidPBClass(Exception):
- def __init__(self, source: Type, destination: Type):
+ def __init__(self, source: type, destination: type):
  self.source = source
  self.destination = destination
  super().__init__(f"Source and destination does not match {self.source} - {self.destination}")
nucliadb/ingest/fields/file.py
@@ -17,7 +17,7 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- from typing import Any, Optional
+ from typing import Any

  from nucliadb.ingest.fields.base import Field
  from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
@@ -30,22 +30,22 @@ class File(Field[FieldFile]):
  pbklass = FieldFile
  value: FieldFile
  type: str = "f"
- file_extracted_data: Optional[FileExtractedData]
+ file_extracted_data: FileExtractedData | None

  def __init__(
  self,
  id: str,
  resource: Any,
- pb: Optional[Any] = None,
- value: Optional[str] = None,
+ pb: Any | None = None,
+ value: str | None = None,
  ):
- super(File, self).__init__(id, resource, pb, value)
+ super().__init__(id, resource, pb, value)
  self.file_extracted_data = None

  async def set_value(self, payload: FieldFile):
  old_file = await self.get_value()
  if old_file is None:
- old_cf: Optional[CloudFile] = None
+ old_cf: CloudFile | None = None
  else:
  old_cf = old_file.file

@@ -57,7 +57,7 @@ class File(Field[FieldFile]):

  await self.db_set_value(payload)

- async def get_value(self) -> Optional[FieldFile]:
+ async def get_value(self) -> FieldFile | None:
  return await self.db_get_value()

  async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
@@ -101,10 +101,24 @@ class File(Field[FieldFile]):
  await self.storage.upload_pb(sf, file_extracted_data)
  self.file_extracted_data = file_extracted_data

- async def get_file_extracted_data(self) -> Optional[FileExtractedData]:
+ async def get_file_extracted_data(self) -> FileExtractedData | None:
  if self.file_extracted_data is None:
  sf: StorageField = self.storage.file_extracted(
  self.kbid, self.uuid, self.type, self.id, FILE_METADATA
  )
  self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
  return self.file_extracted_data
+
+ async def thumbnail(self) -> StorageField | None:
+ """Access the file field thumbnail."""
+ fed = await self.get_file_extracted_data()
+ if fed is None:
+ return None
+ if not fed.HasField("file_thumbnail"):
+ return None
+
+ sf: StorageField = self.storage.file_extracted(
+ self.kbid, self.uuid, self.type, self.id, "file_thumbnail"
+ )
+ sf.field = fed.file_thumbnail
+ return sf
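
The @@ -101 hunk above adds a public File.thumbnail() helper that resolves the extracted thumbnail to a StorageField, or None when extraction has not produced one. A hedged usage sketch; only thumbnail() itself comes from this diff, and the caller-side wiring is illustrative:

# Sketch: consuming the new helper. How `file_field` is obtained and how the
# bytes are streamed back to a client is outside this diff.
async def thumbnail_location(file_field) -> tuple[str, str] | None:
    sf = await file_field.thumbnail()
    if sf is None:
        # Either extraction has not run yet or no thumbnail was generated.
        return None
    # A StorageField carries the bucket/key of the stored object (the same
    # attributes the removed debug log in base.py read as sf.bucket / sf.key).
    return sf.bucket, sf.key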
nucliadb/ingest/fields/link.py
@@ -17,7 +17,7 @@
  # You should have received a copy of the GNU Affero General Public License
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
  #
- from typing import Any, Optional
+ from typing import Any

  from nucliadb.ingest.fields.base import Field
  from nucliadb_protos.resources_pb2 import CloudFile, FieldLink, LinkExtractedData
@@ -30,22 +30,22 @@ class Link(Field[FieldLink]):
  pbklass = FieldLink
  value: FieldLink
  type: str = "u"
- link_extracted_data: Optional[LinkExtractedData]
+ link_extracted_data: LinkExtractedData | None

  def __init__(
  self,
  id: str,
  resource: Any,
- pb: Optional[Any] = None,
- value: Optional[str] = None,
+ pb: Any | None = None,
+ value: str | None = None,
  ):
- super(Link, self).__init__(id, resource, pb, value)
+ super().__init__(id, resource, pb, value)
  self.link_extracted_data = None

  async def set_value(self, payload: FieldLink):
  await self.db_set_value(payload)

- async def get_value(self) -> Optional[FieldLink]:
+ async def get_value(self) -> FieldLink | None:
  return await self.db_get_value()

  async def set_link_extracted_data(self, link_extracted_data: LinkExtractedData):
@@ -88,7 +88,7 @@ class Link(Field[FieldLink]):
  await self.storage.upload_pb(sf, link_extracted_data)
  self.link_extracted_data = link_extracted_data

- async def get_link_extracted_data(self) -> Optional[LinkExtractedData]:
+ async def get_link_extracted_data(self) -> LinkExtractedData | None:
  if self.link_extracted_data is None:
  sf: StorageField = self.storage.file_extracted(
  self.kbid, self.uuid, self.type, self.id, LINK_METADATA
nucliadb/ingest/fields/text.py
@@ -19,7 +19,6 @@
  #

  import hashlib
- from typing import Optional

  from nucliadb.ingest.fields.base import Field
  from nucliadb.ingest.fields.exceptions import FieldAuthorNotFound
@@ -39,8 +38,8 @@ class Text(Field[FieldText]):

  async def set_value(self, payload: FieldText):
  if payload.md5 == "":
- payload.md5 = hashlib.md5(payload.body.encode()).hexdigest()
+ payload.md5 = hashlib.md5(payload.body.encode(), usedforsecurity=False).hexdigest()
  await self.db_set_value(payload)

- async def get_value(self) -> Optional[FieldText]:
+ async def get_value(self) -> FieldText | None:
  return await self.db_get_value()
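
The text.py change keeps md5 purely as a content fingerprint: passing usedforsecurity=False (available since Python 3.9) tells the hashlib/OpenSSL layer the digest is not used for security, so the call keeps working on FIPS-restricted builds where a bare hashlib.md5() can be rejected. For example:

import hashlib

# Non-cryptographic fingerprint of a text body, mirroring the new set_value()
# behavior; usedforsecurity=False avoids failures under FIPS-enforcing OpenSSL.
body = "some extracted text"
digest = hashlib.md5(body.encode(), usedforsecurity=False).hexdigest()
print(digest)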