nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -23,31 +23,33 @@ import asyncio
23
23
  import base64
24
24
  import json
25
25
  import os
26
+ import socket
26
27
  import tempfile
27
28
  import uuid
28
29
  from concurrent.futures import ThreadPoolExecutor
29
30
  from copy import deepcopy
30
- from datetime import datetime
31
- from typing import AsyncIterator, Optional
31
+ from typing import Optional
32
32
  from urllib.parse import quote_plus
33
33
 
34
34
  import aiohttp
35
35
  import backoff
36
- from nucliadb_protos.resources_pb2 import CloudFile
36
+ import google.auth.compute_engine.credentials # type: ignore
37
+ import google.auth.transport.requests # type: ignore
38
+ import google.oauth2.credentials # type: ignore
39
+ from google.auth.exceptions import DefaultCredentialsError # type: ignore
37
40
  from oauth2client.service_account import ServiceAccountCredentials # type: ignore
38
41
 
39
42
  from nucliadb.writer import logger
40
43
  from nucliadb.writer.tus.dm import FileDataManager
41
44
  from nucliadb.writer.tus.exceptions import (
42
- CloudFileNotFound,
43
45
  HTTPBadRequest,
44
- HTTPNotFound,
45
46
  HTTPPreconditionFailed,
46
47
  ResumableURINotAvailable,
47
48
  )
48
49
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
49
50
  from nucliadb.writer.tus.utils import to_str
50
- from nucliadb_utils.storages.gcs import CHUNK_SIZE, MIN_UPLOAD_SIZE
51
+ from nucliadb_protos.resources_pb2 import CloudFile
52
+ from nucliadb_utils.storages.gcs import CHUNK_SIZE, MIN_UPLOAD_SIZE, TIMEOUT
51
53
 
52
54
 
53
55
  class GoogleCloudException(Exception):
@@ -61,6 +63,12 @@ MAX_RETRIES = 5
61
63
  RETRIABLE_EXCEPTIONS = (
62
64
  GoogleCloudException,
63
65
  aiohttp.client_exceptions.ClientPayloadError,
66
+ aiohttp.client_exceptions.ClientConnectorError,
67
+ aiohttp.client_exceptions.ClientConnectionError,
68
+ aiohttp.client_exceptions.ClientOSError,
69
+ aiohttp.client_exceptions.ServerConnectionError,
70
+ aiohttp.client_exceptions.ServerDisconnectedError,
71
+ socket.gaierror,
64
72
  )
65
73
 
66
74
 
@@ -69,7 +77,7 @@ class GCloudBlobStore(BlobStore):
69
77
  loop = None
70
78
  upload_url: str
71
79
  object_base_url: str
72
- json_credentials: str
80
+ json_credentials: Optional[str]
73
81
  bucket: str
74
82
  location: str
75
83
  project: str
@@ -83,9 +91,18 @@ class GCloudBlobStore(BlobStore):
83
91
  return {"AUTHORIZATION": f"Bearer {token}"}
84
92
 
85
93
  def _get_access_token(self):
86
- access_token = self._credentials.get_access_token()
87
- self._creation_access_token = datetime.now()
88
- return access_token.access_token
94
+ if isinstance(
95
+ self._credentials, google.auth.compute_engine.credentials.Credentials
96
+ ) or isinstance(self._credentials, google.oauth2.credentials.Credentials):
97
+ # google default auth object
98
+ if self._credentials.expired or self._credentials.valid is False:
99
+ request = google.auth.transport.requests.Request()
100
+ self._credentials.refresh(request)
101
+
102
+ return self._credentials.token
103
+ else:
104
+ access_token = self._credentials.get_access_token()
105
+ return access_token.access_token
89
106
 
90
107
  async def finalize(self):
91
108
  if self.session is not None:
@@ -106,32 +123,35 @@ class GCloudBlobStore(BlobStore):
106
123
  self.project = project
107
124
  self.bucket_labels = bucket_labels
108
125
  self.object_base_url = object_base_url + "/storage/v1/b"
109
- self.upload_url = (
110
- object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable"
111
- ) # noqa
112
-
126
+ self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable" # noqa
127
+ self.json_credentials = json_credentials
113
128
  self._credentials = None
114
129
 
115
- if json_credentials is not None:
116
- self.json_credentials_file = os.path.join(
117
- tempfile.mkdtemp(), "gcs_credentials.json"
118
- )
119
- open(self.json_credentials_file, "w").write(
120
- base64.b64decode(json_credentials).decode("utf-8")
121
- )
130
+ if self.json_credentials is not None and self.json_credentials.strip() != "":
131
+ self.json_credentials_file = os.path.join(tempfile.mkdtemp(), "gcs_credentials.json")
132
+ with open(self.json_credentials_file, "w") as file:
133
+ file.write(base64.b64decode(self.json_credentials).decode("utf-8"))
122
134
  self._credentials = ServiceAccountCredentials.from_json_keyfile_name(
123
135
  self.json_credentials_file, SCOPES
124
136
  )
137
+ else:
138
+ try:
139
+ self._credentials, self.project = google.auth.default()
140
+ except DefaultCredentialsError:
141
+ logger.warning("Setting up without credentials as couldn't find workload identity")
142
+ self._credentials = None
125
143
 
126
144
  loop = asyncio.get_event_loop()
127
- self.session = aiohttp.ClientSession(loop=loop)
145
+ self.session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
128
146
 
129
147
  async def check_exists(self, bucket_name: str):
130
148
  if self.session is None:
131
149
  raise AttributeError()
132
150
 
133
151
  headers = await self.get_access_headers()
134
- url = f"{self.object_base_url}/{bucket_name}?project={self.project}"
152
+ # Using object access url instead of bucket access to avoid
153
+ # giving admin permission to the SA, needed to GET a bucket
154
+ url = f"{self.object_base_url}/{bucket_name}/o"
135
155
  async with self.session.get(
136
156
  url,
137
157
  headers=headers,
@@ -170,9 +190,7 @@ class GCloudFileStorageManager(FileStorageManager):
170
190
  chunk_size = CHUNK_SIZE
171
191
  min_upload_size = MIN_UPLOAD_SIZE
172
192
 
173
- @backoff.on_exception(
174
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
175
- )
193
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
176
194
  async def start(self, dm: FileDataManager, path: str, kbid: str):
177
195
  """Init an upload.
178
196
 
@@ -180,12 +198,15 @@ class GCloudFileStorageManager(FileStorageManager):
180
198
  _resumable_uri : uri to resumable upload
181
199
  _uri : finished uploaded image
182
200
  """
201
+
183
202
  if self.storage.session is None:
184
203
  raise AttributeError()
185
204
 
186
- upload_file_id = dm.get("upload_file_id", str(uuid.uuid4()))
205
+ upload_file_id = dm.get("upload_file_id")
187
206
  if upload_file_id is not None:
188
207
  await self.delete_upload(upload_file_id, kbid)
208
+ else:
209
+ upload_file_id = str(uuid.uuid4())
189
210
 
190
211
  bucket = self.storage.get_bucket_name(kbid)
191
212
  init_url = "{}&name={}".format(
@@ -230,13 +251,9 @@ class GCloudFileStorageManager(FileStorageManager):
230
251
  raise GoogleCloudException(text)
231
252
  resumable_uri = call.headers["Location"]
232
253
 
233
- await dm.update(
234
- resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path
235
- )
254
+ await dm.update(resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path)
236
255
 
237
- @backoff.on_exception(
238
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
239
- )
256
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
240
257
  async def delete_upload(self, uri, kbid):
241
258
  bucket = self.storage.get_bucket_name(kbid)
242
259
 
@@ -259,8 +276,7 @@ class GCloudFileStorageManager(FileStorageManager):
259
276
  if resp.status not in (200, 204, 404):
260
277
  if resp.status == 404:
261
278
  logger.error(
262
- f"Attempt to delete not found gcloud: {data}, "
263
- f"status: {resp.status}",
279
+ f"Attempt to delete not found gcloud: {data}, " f"status: {resp.status}",
264
280
  exc_info=True,
265
281
  )
266
282
  else:
@@ -268,9 +284,7 @@ class GCloudFileStorageManager(FileStorageManager):
268
284
  else:
269
285
  raise AttributeError("No valid uri")
270
286
 
271
- @backoff.on_exception(
272
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
273
- )
287
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
274
288
  async def _append(self, dm: FileDataManager, data, offset):
275
289
  if self.storage.session is None:
276
290
  raise AttributeError()
@@ -335,9 +349,7 @@ class GCloudFileStorageManager(FileStorageManager):
335
349
  break
336
350
  return count
337
351
 
338
- @backoff.on_exception(
339
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
340
- )
352
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
341
353
  async def finish(self, dm: FileDataManager):
342
354
  if dm.size == 0:
343
355
  if self.storage.session is None:
@@ -363,46 +375,6 @@ class GCloudFileStorageManager(FileStorageManager):
363
375
  await dm.finish()
364
376
  return path
365
377
 
366
- async def iter_data(self, uri, kbid: str, headers: Optional[dict[str, str]] = None):
367
- if self.storage.session is None:
368
- raise AttributeError()
369
- if headers is None:
370
- headers = {}
371
-
372
- url = "{}/{}/o/{}".format(
373
- self.storage.object_base_url,
374
- self.storage.get_bucket_name(kbid),
375
- quote_plus(uri),
376
- )
377
- headers_auth = await self.storage.get_access_headers()
378
- headers.update(headers_auth)
379
- async with self.storage.session.get(
380
- url, headers=headers, params={"alt": "media"}, timeout=-1
381
- ) as api_resp:
382
- if api_resp.status not in (200, 206):
383
- text = await api_resp.text()
384
- if api_resp.status == 404:
385
- raise CloudFileNotFound("Google cloud file not found")
386
- elif api_resp.status == 401:
387
- logger.warning(f"Invalid google cloud credentials error: {text}")
388
- raise HTTPNotFound(
389
- detail=f"Google cloud invalid credentials: {text}"
390
- )
391
- raise GoogleCloudException(f"{api_resp.status}: {text}")
392
- while True:
393
- chunk = await api_resp.content.read(1024 * 1024)
394
- if len(chunk) > 0:
395
- yield chunk
396
- else:
397
- break
398
-
399
- async def read_range(
400
- self, uri: str, kbid: str, start: int, end: int
401
- ) -> AsyncIterator[bytes]:
402
- """
403
- Iterate through ranges of data
404
- """
405
- async for chunk in self.iter_data(
406
- uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
407
- ):
408
- yield chunk
378
+ def validate_intermediate_chunk(self, uploaded_bytes: int):
379
+ if uploaded_bytes < self.min_upload_size:
380
+ raise ValueError(f"Intermediate chunks cannot be smaller than {self.min_upload_size} bytes")
@@ -22,14 +22,13 @@ from __future__ import annotations
22
22
  import json
23
23
  import os
24
24
  import uuid
25
- from typing import AsyncIterator
25
+ from typing import Any
26
26
 
27
27
  import aiofiles
28
- from nucliadb_protos.resources_pb2 import CloudFile
29
28
 
30
29
  from nucliadb.writer.tus.dm import FileDataManager
31
- from nucliadb.writer.tus.exceptions import CloudFileNotFound
32
30
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
31
+ from nucliadb_protos.resources_pb2 import CloudFile
33
32
  from nucliadb_utils.storages import CHUNK_SIZE
34
33
 
35
34
 
@@ -50,51 +49,24 @@ class LocalFileStorageManager(FileStorageManager):
50
49
  bucket = self.storage.get_bucket_name(kbid)
51
50
  upload_file_id = dm.get("upload_file_id", str(uuid.uuid4()))
52
51
  init_url = self.get_file_path(bucket, upload_file_id)
53
- metadata_init_url = self.metadata_key(init_url)
54
52
  metadata = {
55
53
  "FILENAME": dm.filename,
56
54
  "CONTENT_TYPE": dm.content_type,
57
55
  "SIZE": dm.size,
58
56
  }
59
- async with aiofiles.open(metadata_init_url, "w+") as resp:
60
- await resp.write(json.dumps(metadata))
57
+ await self.set_metadata(kbid, upload_file_id, metadata)
61
58
 
62
59
  async with aiofiles.open(init_url, "wb+") as aio_fi:
63
60
  await aio_fi.write(b"")
64
61
 
65
- await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket)
62
+ await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket, kbid=kbid)
66
63
 
67
- async def iter_data(self, uri, kbid: str, headers=None):
68
- bucket = self.storage.get_bucket_name(kbid)
69
- file_path = self.get_file_path(bucket, uri)
70
- async with aiofiles.open(file_path) as resp:
71
- data = await resp.read(CHUNK_SIZE)
72
- while data is not None:
73
- yield data
74
- data = await resp.read(CHUNK_SIZE)
75
-
76
- async def read_range(
77
- self, uri: str, kbid: str, start: int, end: int
78
- ) -> AsyncIterator[bytes]:
79
- """
80
- Iterate through ranges of data
81
- """
64
+ async def set_metadata(self, kbid: str, upload_file_id: str, metadata: dict[str, Any]):
82
65
  bucket = self.storage.get_bucket_name(kbid)
83
- file_path = self.get_file_path(bucket, uri)
84
- try:
85
- async with aiofiles.open(file_path, "rb") as resp:
86
- await resp.seek(start)
87
- count = 0
88
- data = await resp.read(CHUNK_SIZE)
89
- while data and count < end:
90
- if count + len(data) > end:
91
- new_end = end - count
92
- data = data[:new_end]
93
- yield data
94
- count += len(data)
95
- data = await resp.read(CHUNK_SIZE)
96
- except FileNotFoundError:
97
- raise CloudFileNotFound()
66
+ init_url = self.get_file_path(bucket, upload_file_id)
67
+ metadata_init_url = self.metadata_key(init_url)
68
+ async with aiofiles.open(metadata_init_url, "w+") as resp:
69
+ await resp.write(json.dumps(metadata))
98
70
 
99
71
  async def append(self, dm: FileDataManager, iterable, offset) -> int:
100
72
  count = 0
@@ -118,6 +90,15 @@ class LocalFileStorageManager(FileStorageManager):
118
90
  upload_file_id = dm.get("upload_file_id")
119
91
  from_url = self.get_file_path(bucket, upload_file_id)
120
92
 
93
+ if dm.size > 0:
94
+ kbid = dm.get("kbid")
95
+ metadata = {
96
+ "FILENAME": dm.filename,
97
+ "CONTENT_TYPE": dm.content_type,
98
+ "SIZE": dm.size,
99
+ }
100
+ await self.set_metadata(kbid, upload_file_id, metadata)
101
+
121
102
  path = dm.get("path")
122
103
  to_url = self.get_file_path(bucket, path)
123
104
  to_url_dirs = os.path.dirname(to_url)
@@ -138,6 +119,9 @@ class LocalFileStorageManager(FileStorageManager):
138
119
  file_path = self.get_file_path(bucket, uri)
139
120
  os.remove(file_path)
140
121
 
122
+ def validate_intermediate_chunk(self, uploaded_bytes: int):
123
+ pass
124
+
141
125
 
142
126
  class LocalBlobStore(BlobStore):
143
127
  def __init__(self, local_testing_files: str):
nucliadb/writer/tus/s3.py CHANGED
@@ -19,21 +19,22 @@
19
19
  #
20
20
  from __future__ import annotations
21
21
 
22
+ import base64
22
23
  import uuid
23
24
  from contextlib import AsyncExitStack
24
- from typing import AsyncIterator, Optional
25
+ from typing import Optional
25
26
 
26
27
  import aiobotocore # type: ignore
27
28
  import aiohttp
28
- import backoff # type: ignore
29
+ import backoff
29
30
  import botocore # type: ignore
30
31
  from aiobotocore.session import AioSession # type: ignore
31
- from nucliadb_protos.resources_pb2 import CloudFile
32
32
 
33
33
  from nucliadb.writer import logger
34
34
  from nucliadb.writer.tus.dm import FileDataManager
35
- from nucliadb.writer.tus.exceptions import CloudFileNotFound, ResumableURINotAvailable
35
+ from nucliadb.writer.tus.exceptions import ResumableURINotAvailable
36
36
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
37
+ from nucliadb_protos.resources_pb2 import CloudFile
37
38
  from nucliadb_utils.storages.s3 import (
38
39
  CHUNK_SIZE,
39
40
  MIN_UPLOAD_SIZE,
@@ -53,9 +54,7 @@ class S3FileStorageManager(FileStorageManager):
53
54
  chunk_size = CHUNK_SIZE
54
55
  min_upload_size = MIN_UPLOAD_SIZE
55
56
 
56
- @backoff.on_exception(
57
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
58
- )
57
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
59
58
  async def _abort_multipart(self, dm: FileDataManager):
60
59
  try:
61
60
  mpu = dm.get("mpu")
@@ -72,21 +71,25 @@ class S3FileStorageManager(FileStorageManager):
72
71
  if dm.get("mpu") is not None:
73
72
  await self._abort_multipart(dm)
74
73
 
74
+ custom_metadata: dict[str, str] = {
75
+ "base64_filename": base64.b64encode((dm.filename or "").encode()).decode(),
76
+ "content_type": dm.content_type or "",
77
+ "size": str(dm.size),
78
+ }
79
+
75
80
  await dm.update(
76
81
  path=path,
77
82
  upload_file_id=upload_file_id,
78
83
  multipart={"Parts": []},
79
84
  block=1,
80
- mpu=await self._create_multipart(path, bucket),
85
+ mpu=await self._create_multipart(path, bucket, custom_metadata),
81
86
  bucket=bucket,
82
87
  )
83
88
 
84
- @backoff.on_exception(
85
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
86
- )
87
- async def _create_multipart(self, path, bucket):
89
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
90
+ async def _create_multipart(self, path, bucket, custom_metadata: dict[str, str]):
88
91
  return await self.storage._s3aioclient.create_multipart_upload(
89
- Bucket=bucket, Key=path
92
+ Bucket=bucket, Key=path, Metadata=custom_metadata
90
93
  )
91
94
 
92
95
  async def append(self, dm: FileDataManager, iterable, offset) -> int:
@@ -96,16 +99,12 @@ class S3FileStorageManager(FileStorageManager):
96
99
  size += len(chunk)
97
100
  part = await self._upload_part(dm, chunk)
98
101
  multipart = dm.get("multipart")
99
- multipart["Parts"].append(
100
- {"PartNumber": dm.get("block"), "ETag": part["ETag"]}
101
- )
102
+ multipart["Parts"].append({"PartNumber": dm.get("block"), "ETag": part["ETag"]})
102
103
  await dm.update(multipart=multipart, block=dm.get("block") + 1)
103
104
 
104
105
  return size
105
106
 
106
- @backoff.on_exception(
107
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
108
- )
107
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
109
108
  async def _upload_part(self, dm: FileDataManager, data):
110
109
  mpu = dm.get("mpu")
111
110
  if mpu is None:
@@ -128,18 +127,14 @@ class S3FileStorageManager(FileStorageManager):
128
127
  await dm.finish()
129
128
  return path
130
129
 
131
- @backoff.on_exception(
132
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
133
- )
130
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
134
131
  async def _complete_multipart_upload(self, dm: FileDataManager):
135
132
  # if blocks is 0, it means the file is of zero length so we need to
136
133
  # trick it to finish a multiple part with no data.
137
134
  if dm.get("block") == 1:
138
135
  part = await self._upload_part(dm, b"")
139
136
  multipart = dm.get("multipart")
140
- multipart["Parts"].append(
141
- {"PartNumber": dm.get("block"), "ETag": part["ETag"]}
142
- )
137
+ multipart["Parts"].append({"PartNumber": dm.get("block"), "ETag": part["ETag"]})
143
138
  await dm.update(multipart=multipart, block=dm.get("block") + 1)
144
139
  await self.storage._s3aioclient.complete_multipart_upload(
145
140
  Bucket=dm.get("bucket"),
@@ -148,45 +143,10 @@ class S3FileStorageManager(FileStorageManager):
148
143
  MultipartUpload=dm.get("multipart"),
149
144
  )
150
145
 
151
- @backoff.on_exception(
152
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3
153
- )
146
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=3)
154
147
  async def _download(self, uri: str, kbid: str, **kwargs):
155
148
  bucket = self.storage.get_bucket_name(kbid)
156
- return await self.storage._s3aioclient.get_object(
157
- Bucket=bucket, Key=uri, **kwargs
158
- )
159
-
160
- async def iter_data(
161
- self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
162
- ):
163
- if headers is None:
164
- headers = {}
165
- try:
166
- downloader = await self._download(uri, kbid, **headers)
167
- except self.storage._s3aioclient.exceptions.NoSuchKey:
168
- raise CloudFileNotFound()
169
-
170
- # we do not want to timeout ever from this...
171
- # downloader['Body'].set_socket_timeout(999999)
172
- stream = downloader["Body"]
173
- data = await stream.read(CHUNK_SIZE)
174
- while True:
175
- if not data:
176
- break
177
- yield data
178
- data = await stream.read(CHUNK_SIZE)
179
-
180
- async def read_range(
181
- self, uri, kbid: str, start: int, end: int
182
- ) -> AsyncIterator[bytes]:
183
- """
184
- Iterate through ranges of data
185
- """
186
- async for chunk in self.iter_data(
187
- uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
188
- ):
189
- yield chunk
149
+ return await self.storage._s3aioclient.get_object(Bucket=bucket, Key=uri, **kwargs)
190
150
 
191
151
  async def delete_upload(self, uri: str, kbid: str):
192
152
  bucket = self.storage.get_bucket_name(kbid)
@@ -198,6 +158,10 @@ class S3FileStorageManager(FileStorageManager):
198
158
  else:
199
159
  raise AttributeError("No valid uri")
200
160
 
161
+ def validate_intermediate_chunk(self, uploaded_bytes: int):
162
+ if uploaded_bytes % self.min_upload_size != 0:
163
+ raise ValueError(f"Intermediate chunks need to be multiples of {self.min_upload_size} bytes")
164
+
201
165
 
202
166
  class S3BlobStore(BlobStore):
203
167
  async def check_exists(self, bucket_name: str) -> bool:
@@ -213,9 +177,7 @@ class S3BlobStore(BlobStore):
213
177
  async def create_bucket(self, bucket):
214
178
  exists = await self.check_exists(bucket)
215
179
  if not exists:
216
- await create_bucket(
217
- self._s3aioclient, bucket, self.bucket_tags, self.region_name
218
- )
180
+ await create_bucket(self._s3aioclient, bucket, self.bucket_tags, self.region_name)
219
181
  return exists
220
182
 
221
183
  async def finalize(self):
@@ -247,9 +209,7 @@ class S3BlobStore(BlobStore):
247
209
  verify=verify_ssl,
248
210
  use_ssl=ssl,
249
211
  region_name=region_name,
250
- config=aiobotocore.config.AioConfig(
251
- None, max_pool_connections=max_pool_connections
252
- ),
212
+ config=aiobotocore.config.AioConfig(None, max_pool_connections=max_pool_connections),
253
213
  )
254
214
  session = AioSession()
255
215
  self._s3aioclient = await self._exit_stack.enter_async_context(
@@ -21,15 +21,8 @@ from __future__ import annotations
21
21
 
22
22
  from typing import AsyncIterator, Optional
23
23
 
24
- from lru import LRU # type: ignore
25
- from nucliadb_protos.resources_pb2 import CloudFile
26
- from starlette.responses import StreamingResponse
27
-
28
- from nucliadb.writer import logger
29
24
  from nucliadb.writer.tus.dm import FileDataManager
30
- from nucliadb.writer.tus.exceptions import HTTPRangeNotSatisfiable
31
-
32
- CACHED_BUCKETS = LRU(50) # type: ignore
25
+ from nucliadb_protos.resources_pb2 import CloudFile
33
26
 
34
27
 
35
28
  class BlobStore:
@@ -56,14 +49,9 @@ class FileStorageManager:
56
49
  chunk_size: int
57
50
  min_upload_size: Optional[int] = None
58
51
 
59
- def __init__(self, storage):
52
+ def __init__(self, storage: BlobStore):
60
53
  self.storage = storage
61
54
 
62
- def read_range(
63
- self, uri: str, kbid: str, start: int, end: int
64
- ) -> AsyncIterator[bytes]:
65
- raise NotImplementedError()
66
-
67
55
  def iter_data(
68
56
  self, uri: str, kbid: str, headers: Optional[dict[str, str]] = None
69
57
  ) -> AsyncIterator[bytes]:
@@ -81,48 +69,6 @@ class FileStorageManager:
81
69
  async def delete_upload(self, uri, kbid):
82
70
  raise NotImplementedError()
83
71
 
84
- async def full_download(self, content_length, content_type, upload_id):
85
- return StreamingResponse(
86
- self.iter_data(upload_id),
87
- media_type=content_type,
88
- headers={
89
- "Content-Length": str(content_length),
90
- "Content-Type": content_type,
91
- },
92
- )
93
-
94
- async def range_download(
95
- self, content_length, content_type, upload_id, range_header
96
- ):
97
- try:
98
- start, _, end = range_header.split("bytes=")[-1].partition("-")
99
- start = int(start)
100
- if len(end) == 0:
101
- # bytes=0- is valid
102
- end = content_length - 1
103
- end = int(end) + 1 # python is inclusive, http is exclusive
104
- except (IndexError, ValueError):
105
- # range errors fallback to full download
106
- raise HTTPRangeNotSatisfiable(detail=f"Range not parsable {range_header}")
107
- if start > end or start < 0:
108
- raise HTTPRangeNotSatisfiable(detail="Invalid range {start}-{end}")
109
- if end > content_length:
110
- raise HTTPRangeNotSatisfiable(
111
- detail="Invalid range {start}-{end}, too large end value"
112
- )
113
-
114
- logger.debug(f"Range request: {range_header}")
115
- headers = {
116
- "Content-Range": f"bytes {start}-{end - 1}/{content_length}",
117
- "Content-Type": content_type,
118
- }
119
-
120
- return StreamingResponse(
121
- self.read_range(upload_id, start, end),
122
- media_type=content_type,
123
- headers=headers,
124
- )
125
-
126
72
  async def iterate_body_chunks(self, request, chunk_size):
127
73
  partial = b""
128
74
  remaining = b""
@@ -146,3 +92,6 @@ class FileStorageManager:
146
92
 
147
93
  if partial or remaining:
148
94
  yield partial + remaining
95
+
96
+ def validate_intermediate_chunk(self, uploaded_bytes: int):
97
+ raise NotImplementedError()