nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -19,15 +19,15 @@
19
19
  #
20
20
  import dataclasses
21
21
  from datetime import datetime
22
- from typing import Optional, Union
23
22
 
23
+ from fastapi import HTTPException
24
24
  from google.protobuf.json_format import MessageToDict
25
25
 
26
26
  import nucliadb_models as models
27
27
  from nucliadb.common import datamanagers
28
28
  from nucliadb.common.maindb.driver import Transaction
29
29
  from nucliadb.common.models_utils import from_proto, to_proto
30
- from nucliadb.ingest.fields.conversation import Conversation
30
+ from nucliadb.ingest.fields.conversation import MAX_CONVERSATION_MESSAGES, Conversation
31
31
  from nucliadb.ingest.orm.resource import Resource as ORMResource
32
32
  from nucliadb.models.internal import processing as processing_models
33
33
  from nucliadb.models.internal.processing import ClassificationLabel, PushConversation, PushPayload
@@ -87,7 +87,7 @@ async def extract_file_field(
87
87
  resource: ORMResource,
88
88
  toprocess: PushPayload,
89
89
  resource_classifications: ResourceClassifications,
90
- password: Optional[str] = None,
90
+ password: str | None = None,
91
91
  ):
92
92
  field_type = resources_pb2.FieldType.FILE
93
93
  field = await resource.get_field(field_id, field_type)
@@ -182,7 +182,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
182
182
  async def parse_fields(
183
183
  writer: BrokerMessage,
184
184
  toprocess: PushPayload,
185
- item: Union[CreateResourcePayload, UpdateResourcePayload],
185
+ item: CreateResourcePayload | UpdateResourcePayload,
186
186
  kbid: str,
187
187
  uuid: str,
188
188
  x_skip_store: bool,
@@ -227,6 +227,7 @@ async def parse_fields(
227
227
  kbid,
228
228
  uuid,
229
229
  resource_classifications,
230
+ replace_field=True,
230
231
  )
231
232
 
232
233
 
@@ -430,11 +431,15 @@ async def parse_conversation_field(
430
431
  kbid: str,
431
432
  uuid: str,
432
433
  resource_classifications: ResourceClassifications,
434
+ replace_field: bool,
433
435
  ) -> None:
436
+ if not replace_field:
437
+ # Appending messages to conversation
438
+ await _conversation_append_checks(kbid, uuid, key, conversation_field)
434
439
  classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.CONVERSATION)
435
440
  storage = await get_storage(service_name=SERVICE_NAME)
436
441
  processing = get_processing()
437
- field_value = resources_pb2.Conversation()
442
+ field_value = resources_pb2.Conversation(replace_field=replace_field)
438
443
  convs = processing_models.PushConversation()
439
444
  for message in conversation_field.messages:
440
445
  cm = resources_pb2.Message()
@@ -543,3 +548,36 @@ async def get_stored_resource_classifications(
543
548
  classif = ClassificationLabel(labelset=f_classif.labelset, label=f_classif.label)
544
549
  rc.field_level.setdefault(fid, set()).add(classif)
545
550
  return rc
551
+
552
+
553
+ async def _conversation_append_checks(
554
+ kbid: str, rid: str, field_id: str, input: models.InputConversationField
555
+ ):
556
+ async with datamanagers.with_ro_transaction() as txn:
557
+ resource_obj = await ORMResource.get(txn, kbid=kbid, rid=rid)
558
+ if resource_obj is None:
559
+ return
560
+ conv: Conversation = await resource_obj.get_field(
561
+ field_id, resources_pb2.FieldType.CONVERSATION, load=False
562
+ )
563
+
564
+ # Make sure that the max number of messages is not exceeded
565
+ current_message_count = (await conv.get_metadata()).total
566
+ if (
567
+ MAX_CONVERSATION_MESSAGES is not None
568
+ and (len(input.messages) + current_message_count) > MAX_CONVERSATION_MESSAGES
569
+ ):
570
+ raise HTTPException(
571
+ status_code=422,
572
+ detail=f"Conversation fields cannot have more than {MAX_CONVERSATION_MESSAGES} messages.",
573
+ )
574
+
575
+ # Make sure input messages use unique idents
576
+ existing_message_ids = set((await conv.get_splits_metadata()).metadata.keys())
577
+ input_message_ids = {message.ident for message in input.messages}
578
+ intersection = input_message_ids.intersection(existing_message_ids)
579
+ if intersection != set():
580
+ raise HTTPException(
581
+ status_code=422,
582
+ detail=f"Message identifiers must be unique field={field_id}: {list(intersection)[:50]}",
583
+ )
@@ -42,6 +42,13 @@ def parse_origin(origin: Origin, origin_payload: InputOrigin):
42
42
  origin.metadata.update(origin_payload.metadata)
43
43
  if origin_payload.path:
44
44
  origin.path = origin_payload.path
45
+ if origin_payload.sync_metadata is not None:
46
+ origin.sync_metadata.CopyFrom(
47
+ resources_pb2.SyncMetadata(
48
+ file_id=origin_payload.sync_metadata.file_id,
49
+ auth_provider=origin_payload.sync_metadata.auth_provider,
50
+ )
51
+ )
45
52
  origin.source = Origin.Source.API
46
53
 
47
54
 
@@ -17,15 +17,14 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from pydantic_settings import BaseSettings
23
22
 
24
23
 
25
24
  class Settings(BaseSettings):
26
25
  dm_enabled: bool = True
27
- dm_redis_host: Optional[str] = None
28
- dm_redis_port: Optional[int] = None
26
+ dm_redis_host: str | None = None
27
+ dm_redis_port: int | None = None
29
28
 
30
29
 
31
30
  settings = Settings()
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  from dataclasses import dataclass
21
- from typing import Optional
22
21
 
23
22
  from nucliadb.writer.settings import settings as writer_settings
24
23
  from nucliadb.writer.tus.dm import FileDataManager, RedisFileDataManagerFactory
@@ -37,8 +36,8 @@ class TusStorageDriver:
37
36
  manager: FileStorageManager
38
37
 
39
38
 
40
- DRIVER: Optional[TusStorageDriver] = None
41
- REDIS_FILE_DATA_MANAGER_FACTORY: Optional[RedisFileDataManagerFactory] = None
39
+ DRIVER: TusStorageDriver | None = None
40
+ REDIS_FILE_DATA_MANAGER_FACTORY: RedisFileDataManagerFactory | None = None
42
41
 
43
42
 
44
43
  async def initialize():
@@ -19,15 +19,12 @@
19
19
  #
20
20
  from __future__ import annotations
21
21
 
22
- from typing import Optional
23
-
24
22
  from nucliadb.writer import logger
25
23
  from nucliadb.writer.tus.dm import FileDataManager
26
24
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
27
25
  from nucliadb_protos.resources_pb2 import CloudFile
28
26
  from nucliadb_utils.storages import CHUNK_SIZE
29
27
  from nucliadb_utils.storages.azure import AzureObjectStore
30
- from nucliadb_utils.storages.exceptions import ObjectNotFoundError
31
28
  from nucliadb_utils.storages.utils import ObjectMetadata
32
29
 
33
30
 
@@ -41,7 +38,7 @@ class AzureBlobStore(BlobStore):
41
38
  logger.exception("Error closing AzureBlobStore")
42
39
  self._object_store = None
43
40
 
44
- async def initialize(self, account_url: str, connection_string: Optional[str] = None):
41
+ async def initialize(self, account_url: str, connection_string: str | None = None):
45
42
  self.bucket = "nucliadb-{kbid}"
46
43
  self.source = CloudFile.Source.AZURE
47
44
  self._object_store = AzureObjectStore(account_url, connection_string=connection_string)
@@ -63,7 +60,7 @@ class AzureBlobStore(BlobStore):
63
60
  class AzureFileStorageManager(FileStorageManager):
64
61
  storage: AzureBlobStore
65
62
  chunk_size = CHUNK_SIZE
66
- min_upload_size = None
63
+ min_upload_size = CHUNK_SIZE
67
64
 
68
65
  @property
69
66
  def object_store(self) -> AzureObjectStore:
@@ -87,7 +84,7 @@ class AzureFileStorageManager(FileStorageManager):
87
84
  bucket = self.storage.get_bucket_name(kbid)
88
85
  try:
89
86
  await self.object_store.delete(bucket, uri)
90
- except ObjectNotFoundError:
87
+ except KeyError:
91
88
  logger.warning(
92
89
  "Attempt to delete an upload but not found",
93
90
  extra={"uri": uri, "kbid": kbid, "bucket": bucket},
@@ -108,4 +105,5 @@ class AzureFileStorageManager(FileStorageManager):
108
105
  return path
109
106
 
110
107
  def validate_intermediate_chunk(self, uploaded_bytes: int):
111
- pass
108
+ if uploaded_bytes < self.min_upload_size:
109
+ raise ValueError(f"Intermediate chunks cannot be smaller than {self.min_upload_size} bytes")
nucliadb/writer/tus/dm.py CHANGED
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import time
21
- from typing import Any, Optional
21
+ from typing import Any
22
22
 
23
23
  import backoff
24
24
  import orjson
@@ -43,7 +43,7 @@ DATA: dict[str, Any] = {}
43
43
 
44
44
 
45
45
  class FileDataManager:
46
- _data: Optional[dict[str, Any]] = None
46
+ _data: dict[str, Any] | None = None
47
47
  _loaded = False
48
48
  key = None
49
49
  _ttl = 60 * 50 * 5 # 5 minutes should be plenty of time between activity
@@ -63,7 +63,7 @@ class FileDataManager:
63
63
  if self._data and "last_activity" in self._data:
64
64
  # check for another active upload, fail if we're screwing with
65
65
  # someone else
66
- last_activity: Optional[int] = self._data.get("last_activity")
66
+ last_activity: int | None = self._data.get("last_activity")
67
67
  if last_activity and (time.time() - last_activity) < self._ttl:
68
68
  if request.headers and request.headers.get("tus-override-upload", "0") != "1":
69
69
  raise HTTPPreconditionFailed(
@@ -17,7 +17,6 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
21
  from starlette.exceptions import HTTPException as StarletteHTTPException
23
22
 
@@ -27,11 +26,11 @@ class InvalidTUSMetadata(Exception):
27
26
 
28
27
 
29
28
  class HTTPException(StarletteHTTPException):
30
- _status_code: Optional[int] = None
29
+ _status_code: int | None = None
31
30
 
32
- def __init__(self, detail: Optional[str] = None):
31
+ def __init__(self, detail: str | None = None):
33
32
  if self._status_code:
34
- super(HTTPException, self).__init__(status_code=self._status_code, detail=detail)
33
+ super().__init__(status_code=self._status_code, detail=detail)
35
34
  else:
36
35
  raise AttributeError("Status code not defined")
37
36
 
@@ -28,7 +28,6 @@ import tempfile
28
28
  import uuid
29
29
  from concurrent.futures import ThreadPoolExecutor
30
30
  from copy import deepcopy
31
- from typing import Optional
32
31
  from urllib.parse import quote_plus
33
32
 
34
33
  import aiohttp
@@ -74,16 +73,22 @@ RETRIABLE_EXCEPTIONS = (
74
73
 
75
74
 
76
75
  class GCloudBlobStore(BlobStore):
77
- session: Optional[aiohttp.ClientSession] = None
76
+ _session: aiohttp.ClientSession | None = None
78
77
  loop = None
79
78
  upload_url: str
80
79
  object_base_url: str
81
- json_credentials: Optional[str]
80
+ json_credentials: str | None
82
81
  bucket: str
83
82
  location: str
84
83
  project: str
85
84
  executor = ThreadPoolExecutor(max_workers=5)
86
85
 
86
+ @property
87
+ def session(self) -> aiohttp.ClientSession:
88
+ if self._session is None: # pragma: no cover
89
+ raise AttributeError("Session not initialized")
90
+ return self._session
91
+
87
92
  async def get_access_headers(self):
88
93
  if self._credentials is None:
89
94
  return {}
@@ -106,8 +111,9 @@ class GCloudBlobStore(BlobStore):
106
111
  return access_token.access_token
107
112
 
108
113
  async def finalize(self):
109
- if self.session is not None:
110
- await self.session.close()
114
+ if self._session is not None:
115
+ await self._session.close()
116
+ self._session = None
111
117
 
112
118
  async def initialize(
113
119
  self,
@@ -116,7 +122,7 @@ class GCloudBlobStore(BlobStore):
116
122
  project: str,
117
123
  bucket_labels,
118
124
  object_base_url: str,
119
- json_credentials: Optional[str],
125
+ json_credentials: str | None,
120
126
  ):
121
127
  self.bucket = bucket
122
128
  self.source = CloudFile.Source.GCS
@@ -124,7 +130,7 @@ class GCloudBlobStore(BlobStore):
124
130
  self.project = project
125
131
  self.bucket_labels = bucket_labels
126
132
  self.object_base_url = object_base_url + "/storage/v1/b"
127
- self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable" # noqa
133
+ self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable"
128
134
  self.json_credentials = json_credentials
129
135
  self._credentials = None
130
136
 
@@ -143,12 +149,9 @@ class GCloudBlobStore(BlobStore):
143
149
  self._credentials = None
144
150
 
145
151
  loop = asyncio.get_event_loop()
146
- self.session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
152
+ self._session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
147
153
 
148
154
  async def check_exists(self, bucket_name: str):
149
- if self.session is None:
150
- raise AttributeError()
151
-
152
155
  headers = await self.get_access_headers()
153
156
  # Using object access url instead of bucket access to avoid
154
157
  # giving admin permission to the SA, needed to GET a bucket
@@ -163,8 +166,6 @@ class GCloudBlobStore(BlobStore):
163
166
  return False
164
167
 
165
168
  async def create_bucket(self, bucket_name: str):
166
- if self.session is None:
167
- raise AttributeError()
168
169
  headers = await self.get_access_headers()
169
170
  url = f"{self.object_base_url}?project={self.project}"
170
171
 
@@ -199,10 +200,6 @@ class GCloudFileStorageManager(FileStorageManager):
199
200
  _resumable_uri : uri to resumable upload
200
201
  _uri : finished uploaded image
201
202
  """
202
-
203
- if self.storage.session is None:
204
- raise AttributeError()
205
-
206
203
  upload_file_id = dm.get("upload_file_id")
207
204
  if upload_file_id is not None:
208
205
  await self.delete_upload(upload_file_id, kbid)
@@ -287,8 +284,6 @@ class GCloudFileStorageManager(FileStorageManager):
287
284
 
288
285
  @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
289
286
  async def _append(self, dm: FileDataManager, data, offset):
290
- if self.storage.session is None:
291
- raise AttributeError()
292
287
  if dm.size:
293
288
  size = str(dm.size)
294
289
  else:
@@ -315,7 +310,7 @@ class GCloudFileStorageManager(FileStorageManager):
315
310
  },
316
311
  data=data,
317
312
  ) as call:
318
- text = await call.text() # noqa
313
+ text = await call.text()
319
314
  if call.status not in [200, 201, 308]:
320
315
  raise GoogleCloudException(f"{call.status}: {text}")
321
316
  return call
@@ -353,8 +348,6 @@ class GCloudFileStorageManager(FileStorageManager):
353
348
  @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
354
349
  async def finish(self, dm: FileDataManager):
355
350
  if dm.size == 0:
356
- if self.storage.session is None:
357
- raise AttributeError()
358
351
  # In case of empty file, we need to send a PUT request with empty body
359
352
  # and Content-Range header set to "bytes */0"
360
353
  headers = {
nucliadb/writer/tus/s3.py CHANGED
@@ -22,7 +22,6 @@ from __future__ import annotations
22
22
  import base64
23
23
  import uuid
24
24
  from contextlib import AsyncExitStack
25
- from typing import Optional
26
25
 
27
26
  import aiobotocore # type: ignore
28
27
  import aiohttp
@@ -195,8 +194,8 @@ class S3BlobStore(BlobStore):
195
194
  endpoint_url,
196
195
  region_name,
197
196
  bucket,
198
- bucket_tags: Optional[dict[str, str]] = None,
199
- kms_key_id: Optional[str] = None,
197
+ bucket_tags: dict[str, str] | None = None,
198
+ kms_key_id: str | None = None,
200
199
  ):
201
200
  self.bucket = bucket
202
201
  self.bucket_tags = bucket_tags
@@ -19,7 +19,7 @@
19
19
  #
20
20
  from __future__ import annotations
21
21
 
22
- from typing import AsyncIterator, Optional
22
+ from collections.abc import AsyncIterator
23
23
 
24
24
  from nucliadb.writer.tus.dm import FileDataManager
25
25
  from nucliadb_protos.resources_pb2 import CloudFile
@@ -47,13 +47,13 @@ class BlobStore:
47
47
 
48
48
  class FileStorageManager:
49
49
  chunk_size: int
50
- min_upload_size: Optional[int] = None
50
+ min_upload_size: int | None = None
51
51
 
52
52
  def __init__(self, storage: BlobStore):
53
53
  self.storage = storage
54
54
 
55
55
  def iter_data(
56
- self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
56
+ self, uri: str, kbid: str, headers: dict[str, str] | None = None
57
57
  ) -> AsyncIterator[bytes]:
58
58
  raise NotImplementedError()
59
59
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nucliadb
3
- Version: 6.7.2.post4874
3
+ Version: 6.10.0.post5705
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License-Expression: AGPL-3.0-or-later
@@ -12,20 +12,19 @@ Classifier: Development Status :: 4 - Beta
12
12
  Classifier: Intended Audience :: Developers
13
13
  Classifier: Intended Audience :: Information Technology
14
14
  Classifier: Programming Language :: Python
15
- Classifier: Programming Language :: Python :: 3.9
16
15
  Classifier: Programming Language :: Python :: 3.10
17
16
  Classifier: Programming Language :: Python :: 3.11
18
17
  Classifier: Programming Language :: Python :: 3.12
19
18
  Classifier: Programming Language :: Python :: 3 :: Only
20
- Requires-Python: <4,>=3.9
19
+ Requires-Python: <4,>=3.10
21
20
  Description-Content-Type: text/markdown
22
- Requires-Dist: nucliadb-telemetry[all]>=6.7.2.post4874
23
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.7.2.post4874
24
- Requires-Dist: nucliadb-protos>=6.7.2.post4874
25
- Requires-Dist: nucliadb-models>=6.7.2.post4874
26
- Requires-Dist: nidx-protos>=6.7.2.post4874
21
+ Requires-Dist: nucliadb-telemetry[all]>=6.10.0.post5705
22
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.10.0.post5705
23
+ Requires-Dist: nucliadb-protos[grpc]>=6.10.0.post5705
24
+ Requires-Dist: nucliadb-models>=6.10.0.post5705
25
+ Requires-Dist: nidx-protos[grpc]>=6.10.0.post5705
27
26
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
28
- Requires-Dist: nuclia-models>=0.46.0
27
+ Requires-Dist: nuclia-models>=0.50.0
29
28
  Requires-Dist: uvicorn[standard]
30
29
  Requires-Dist: argdantic
31
30
  Requires-Dist: aiohttp>=3.11.11
@@ -35,7 +34,7 @@ Requires-Dist: aiofiles>=0.8.0
35
34
  Requires-Dist: psutil>=5.9.7
36
35
  Requires-Dist: types-psutil>=5.9.5.17
37
36
  Requires-Dist: types-aiofiles>=0.8.3
38
- Requires-Dist: protobuf<6,>=5
37
+ Requires-Dist: protobuf>=5
39
38
  Requires-Dist: types-protobuf<6,>=5
40
39
  Requires-Dist: grpcio>=1.71.0
41
40
  Requires-Dist: grpcio-health-checking>=1.71.0
@@ -57,7 +56,7 @@ Requires-Dist: jwcrypto>=1.5.6
57
56
  Requires-Dist: pyyaml>=5.1
58
57
  Requires-Dist: fastapi-versioning>=0.10.0
59
58
  Requires-Dist: fastapi>=0.95.2
60
- Requires-Dist: sentry-sdk>=2.8.0
59
+ Requires-Dist: sentry-sdk[fastapi]>=2.8.0
61
60
  Requires-Dist: pyjwt>=2.4.0
62
61
  Requires-Dist: mmh3>=3.0.0
63
62
  Requires-Dist: httpx>=0.23.0