nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,111 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from __future__ import annotations
21
+
22
+ from typing import Optional
23
+
24
+ from nucliadb.writer import logger
25
+ from nucliadb.writer.tus.dm import FileDataManager
26
+ from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
27
+ from nucliadb_protos.resources_pb2 import CloudFile
28
+ from nucliadb_utils.storages import CHUNK_SIZE
29
+ from nucliadb_utils.storages.azure import AzureObjectStore
30
+ from nucliadb_utils.storages.exceptions import ObjectNotFoundError
31
+ from nucliadb_utils.storages.utils import ObjectMetadata
32
+
33
+
34
class AzureBlobStore(BlobStore):
    """Blob store for TUS uploads that delegates to an ``AzureObjectStore``.

    The underlying object store is created by ``initialize()`` and released
    by ``finalize()``; every other method goes through ``object_store``.
    """

    # Declared at class level so ``finalize()`` and ``object_store`` observe
    # ``None`` (rather than raising AttributeError) on a store whose
    # ``initialize()`` was never called.
    _object_store: Optional[AzureObjectStore] = None

    async def finalize(self):
        """Close the underlying object store, if there is one.

        Any error raised while closing is logged and swallowed; the
        reference to the object store is dropped either way.
        """
        if self._object_store is None:
            return
        try:
            await self._object_store.finalize()
        except Exception:
            logger.exception("Error closing AzureBlobStore")
        self._object_store = None

    async def initialize(self, account_url: str, connection_string: Optional[str] = None):
        """Create and initialize the Azure object store.

        Args:
            account_url: passed positionally to ``AzureObjectStore``.
            connection_string: optional, forwarded as a keyword argument to
                ``AzureObjectStore``.
        """
        # NOTE(review): presumably a template that get_bucket_name() fills in
        # with the knowledge box id -- confirm against BlobStore.
        self.bucket = "nucliadb-{kbid}"
        self.source = CloudFile.Source.AZURE
        self._object_store = AzureObjectStore(account_url, connection_string=connection_string)
        await self._object_store.initialize()

    @property
    def object_store(self) -> AzureObjectStore:
        """Return the initialized object store.

        Raises:
            AssertionError: if the store has not been initialized (or has
                already been finalized).
        """
        assert self._object_store is not None
        return self._object_store

    async def check_exists(self, bucket_name: str) -> bool:
        """Return whether the object store reports that ``bucket_name`` exists."""
        return await self.object_store.bucket_exists(bucket_name)

    async def create_bucket(self, bucket_name: str) -> bool:
        """Create ``bucket_name`` and return the negation of the store's result.

        NOTE(review): presumably ``bucket_create`` returns True when the
        bucket was newly created, which would make this return True when it
        already existed -- confirm against the BlobStore contract.
        """
        created = await self.object_store.bucket_create(bucket_name)
        return not created
61
+
62
+
63
class AzureFileStorageManager(FileStorageManager):
    """TUS file storage manager that uploads through an ``AzureBlobStore``.

    Upload state (bucket, path, offset) is kept in the ``FileDataManager``
    passed to each call; the bytes themselves go to the Azure object store
    as a multipart upload.
    """

    storage: AzureBlobStore
    chunk_size = CHUNK_SIZE
    # No minimum size is set for intermediate chunks.
    min_upload_size = None

    @property
    def object_store(self) -> AzureObjectStore:
        """Shortcut to the object store owned by ``self.storage``."""
        return self.storage.object_store

    async def start(self, dm: FileDataManager, path: str, kbid: str):
        """Start a multipart upload at ``path`` in the bucket for ``kbid``.

        The resolved ``path`` and ``bucket`` are stored in ``dm`` so that
        later calls can find the upload.
        """
        bucket = self.storage.get_bucket_name(kbid)
        # Fall back to a generic name whenever the upload carries no usable
        # filename. The previous check (`dm.filename == 0`) only matched the
        # integer 0, so a missing or empty filename was passed on as-is.
        # NOTE(review): assumes dm.filename is None/"" when the client sent
        # no filename -- confirm against FileDataManager.
        filename = dm.filename or "file"
        metadata = ObjectMetadata(
            filename=filename,
            content_type=dm.content_type,
            size=dm.size,
        )
        await self.object_store.upload_multipart_start(bucket, path, metadata)
        await dm.update(path=path, bucket=bucket)

    async def delete_upload(self, uri: str, kbid: str) -> None:
        """Delete the object at ``uri``; a missing object is only logged."""
        bucket = self.storage.get_bucket_name(kbid)
        try:
            await self.object_store.delete(bucket, uri)
        except ObjectNotFoundError:
            logger.warning(
                "Attempt to delete an upload but not found",
                extra={"uri": uri, "kbid": kbid, "bucket": bucket},
            )

    async def append(self, dm: FileDataManager, iterable, offset: int) -> int:
        """Append the chunks from ``iterable`` to the upload tracked by ``dm``.

        Stores ``offset`` in ``dm`` once the append has completed.

        Returns:
            The number of bytes the object store reports as uploaded.

        Raises:
            AssertionError: if ``dm`` has no bucket or path recorded, i.e.
                ``start()`` has not stored them.
        """
        bucket = dm.get("bucket")
        assert bucket is not None
        path = dm.get("path")
        assert path is not None
        uploaded_bytes = await self.object_store.upload_multipart_append(bucket, path, iterable)
        await dm.update(offset=offset)
        return uploaded_bytes

    async def finish(self, dm: FileDataManager):
        """Finish the upload tracked by ``dm`` and return its stored path."""
        # Read the path before dm.finish(), as the original code does.
        path = dm.get("path")
        await dm.finish()
        return path

    def validate_intermediate_chunk(self, uploaded_bytes: int):
        """No-op: no validation is applied to intermediate chunks."""
        pass
nucliadb/writer/tus/dm.py CHANGED
@@ -20,6 +20,7 @@
20
20
  import time
21
21
  from typing import Any, Optional
22
22
 
23
+ import backoff
23
24
  import orjson
24
25
  from redis import asyncio as aioredis
25
26
  from starlette.requests import Request
@@ -33,6 +34,11 @@ class NoRedisConfigured(Exception):
33
34
  pass
34
35
 
35
36
 
37
+ RETRIABLE_REDIS_ERRORS = (
38
+ aioredis.ConnectionError,
39
+ aioredis.TimeoutError,
40
+ )
41
+
36
42
  DATA: dict[str, Any] = {}
37
43
 
38
44
 
@@ -59,10 +65,7 @@ class FileDataManager:
59
65
  # someone else
60
66
  last_activity: Optional[int] = self._data.get("last_activity")
61
67
  if last_activity and (time.time() - last_activity) < self._ttl:
62
- if (
63
- request.headers
64
- and request.headers.get("tus-override-upload", "0") != "1"
65
- ):
68
+ if request.headers and request.headers.get("tus-override-upload", "0") != "1":
66
69
  raise HTTPPreconditionFailed(
67
70
  detail="There is already an active tusupload that conflicts with this one."
68
71
  )
@@ -136,7 +139,7 @@ class RedisFileDataManagerFactory:
136
139
 
137
140
  async def finalize(self):
138
141
  try:
139
- await self.redis.close(close_connection_pool=True)
142
+ await self.redis.aclose(close_connection_pool=True)
140
143
  except Exception:
141
144
  logger.warning("Error closing redis connection", exc_info=True)
142
145
  pass
@@ -146,6 +149,9 @@ class RedisFileDataManager(FileDataManager):
146
149
  def __init__(self, redis: aioredis.Redis):
147
150
  self.redis = redis
148
151
 
152
+ @backoff.on_exception(
153
+ backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
154
+ )
149
155
  async def load(self, key):
150
156
  # preload data
151
157
  self.key = key
@@ -157,6 +163,9 @@ class RedisFileDataManager(FileDataManager):
157
163
  self._data = orjson.loads(data)
158
164
  self._loaded = True
159
165
 
166
+ @backoff.on_exception(
167
+ backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
168
+ )
160
169
  async def save(self):
161
170
  if self.key is None:
162
171
  raise Exception("Not initialized")
@@ -164,6 +173,9 @@ class RedisFileDataManager(FileDataManager):
164
173
  value = orjson.dumps(self._data)
165
174
  await self.redis.set(self.key, value, ex=self._ttl)
166
175
 
176
+ @backoff.on_exception(
177
+ backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
178
+ )
167
179
  async def _delete_key(self):
168
180
  if self.key is None:
169
181
  raise Exception("Not initialized")
@@ -31,9 +31,7 @@ class HTTPException(StarletteHTTPException):
31
31
 
32
32
  def __init__(self, detail: Optional[str] = None):
33
33
  if self._status_code:
34
- super(HTTPException, self).__init__(
35
- status_code=self._status_code, detail=detail
36
- )
34
+ super(HTTPException, self).__init__(status_code=self._status_code, detail=detail)
37
35
  else:
38
36
  raise AttributeError("Status code not defined")
39
37
 
@@ -28,27 +28,28 @@ import tempfile
28
28
  import uuid
29
29
  from concurrent.futures import ThreadPoolExecutor
30
30
  from copy import deepcopy
31
- from datetime import datetime
32
- from typing import AsyncIterator, Optional
31
+ from typing import Optional
33
32
  from urllib.parse import quote_plus
34
33
 
35
34
  import aiohttp
36
35
  import backoff
37
- from nucliadb_protos.resources_pb2 import CloudFile
36
+ import google.auth.compute_engine.credentials # type: ignore
37
+ import google.auth.transport.requests # type: ignore
38
+ import google.oauth2.credentials # type: ignore
39
+ from google.auth.exceptions import DefaultCredentialsError # type: ignore
38
40
  from oauth2client.service_account import ServiceAccountCredentials # type: ignore
39
41
 
40
42
  from nucliadb.writer import logger
41
43
  from nucliadb.writer.tus.dm import FileDataManager
42
44
  from nucliadb.writer.tus.exceptions import (
43
- CloudFileNotFound,
44
45
  HTTPBadRequest,
45
- HTTPNotFound,
46
46
  HTTPPreconditionFailed,
47
47
  ResumableURINotAvailable,
48
48
  )
49
49
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
50
50
  from nucliadb.writer.tus.utils import to_str
51
- from nucliadb_utils.storages.gcs import CHUNK_SIZE, MIN_UPLOAD_SIZE
51
+ from nucliadb_protos.resources_pb2 import CloudFile
52
+ from nucliadb_utils.storages.gcs import CHUNK_SIZE, MIN_UPLOAD_SIZE, TIMEOUT
52
53
 
53
54
 
54
55
  class GoogleCloudException(Exception):
@@ -76,7 +77,7 @@ class GCloudBlobStore(BlobStore):
76
77
  loop = None
77
78
  upload_url: str
78
79
  object_base_url: str
79
- json_credentials: str
80
+ json_credentials: Optional[str]
80
81
  bucket: str
81
82
  location: str
82
83
  project: str
@@ -90,9 +91,18 @@ class GCloudBlobStore(BlobStore):
90
91
  return {"AUTHORIZATION": f"Bearer {token}"}
91
92
 
92
93
  def _get_access_token(self):
93
- access_token = self._credentials.get_access_token()
94
- self._creation_access_token = datetime.now()
95
- return access_token.access_token
94
+ if isinstance(
95
+ self._credentials, google.auth.compute_engine.credentials.Credentials
96
+ ) or isinstance(self._credentials, google.oauth2.credentials.Credentials):
97
+ # google default auth object
98
+ if self._credentials.expired or self._credentials.valid is False:
99
+ request = google.auth.transport.requests.Request()
100
+ self._credentials.refresh(request)
101
+
102
+ return self._credentials.token
103
+ else:
104
+ access_token = self._credentials.get_access_token()
105
+ return access_token.access_token
96
106
 
97
107
  async def finalize(self):
98
108
  if self.session is not None:
@@ -113,32 +123,35 @@ class GCloudBlobStore(BlobStore):
113
123
  self.project = project
114
124
  self.bucket_labels = bucket_labels
115
125
  self.object_base_url = object_base_url + "/storage/v1/b"
116
- self.upload_url = (
117
- object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable"
118
- ) # noqa
119
-
126
+ self.upload_url = object_base_url + "/upload/storage/v1/b/{bucket}/o?uploadType=resumable" # noqa
127
+ self.json_credentials = json_credentials
120
128
  self._credentials = None
121
129
 
122
- if json_credentials is not None:
123
- self.json_credentials_file = os.path.join(
124
- tempfile.mkdtemp(), "gcs_credentials.json"
125
- )
126
- open(self.json_credentials_file, "w").write(
127
- base64.b64decode(json_credentials).decode("utf-8")
128
- )
130
+ if self.json_credentials is not None and self.json_credentials.strip() != "":
131
+ self.json_credentials_file = os.path.join(tempfile.mkdtemp(), "gcs_credentials.json")
132
+ with open(self.json_credentials_file, "w") as file:
133
+ file.write(base64.b64decode(self.json_credentials).decode("utf-8"))
129
134
  self._credentials = ServiceAccountCredentials.from_json_keyfile_name(
130
135
  self.json_credentials_file, SCOPES
131
136
  )
137
+ else:
138
+ try:
139
+ self._credentials, self.project = google.auth.default()
140
+ except DefaultCredentialsError:
141
+ logger.warning("Setting up without credentials as couldn't find workload identity")
142
+ self._credentials = None
132
143
 
133
144
  loop = asyncio.get_event_loop()
134
- self.session = aiohttp.ClientSession(loop=loop)
145
+ self.session = aiohttp.ClientSession(loop=loop, timeout=TIMEOUT)
135
146
 
136
147
  async def check_exists(self, bucket_name: str):
137
148
  if self.session is None:
138
149
  raise AttributeError()
139
150
 
140
151
  headers = await self.get_access_headers()
141
- url = f"{self.object_base_url}/{bucket_name}?project={self.project}"
152
+ # Using object access url instead of bucket access to avoid
153
+ # giving admin permission to the SA, needed to GET a bucket
154
+ url = f"{self.object_base_url}/{bucket_name}/o"
142
155
  async with self.session.get(
143
156
  url,
144
157
  headers=headers,
@@ -177,9 +190,7 @@ class GCloudFileStorageManager(FileStorageManager):
177
190
  chunk_size = CHUNK_SIZE
178
191
  min_upload_size = MIN_UPLOAD_SIZE
179
192
 
180
- @backoff.on_exception(
181
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
182
- )
193
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
183
194
  async def start(self, dm: FileDataManager, path: str, kbid: str):
184
195
  """Init an upload.
185
196
 
@@ -187,12 +198,15 @@ class GCloudFileStorageManager(FileStorageManager):
187
198
  _resumable_uri : uri to resumable upload
188
199
  _uri : finished uploaded image
189
200
  """
201
+
190
202
  if self.storage.session is None:
191
203
  raise AttributeError()
192
204
 
193
- upload_file_id = dm.get("upload_file_id", str(uuid.uuid4()))
205
+ upload_file_id = dm.get("upload_file_id")
194
206
  if upload_file_id is not None:
195
207
  await self.delete_upload(upload_file_id, kbid)
208
+ else:
209
+ upload_file_id = str(uuid.uuid4())
196
210
 
197
211
  bucket = self.storage.get_bucket_name(kbid)
198
212
  init_url = "{}&name={}".format(
@@ -237,13 +251,9 @@ class GCloudFileStorageManager(FileStorageManager):
237
251
  raise GoogleCloudException(text)
238
252
  resumable_uri = call.headers["Location"]
239
253
 
240
- await dm.update(
241
- resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path
242
- )
254
+ await dm.update(resumable_uri=resumable_uri, upload_file_id=upload_file_id, path=path)
243
255
 
244
- @backoff.on_exception(
245
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
246
- )
256
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
247
257
  async def delete_upload(self, uri, kbid):
248
258
  bucket = self.storage.get_bucket_name(kbid)
249
259
 
@@ -266,8 +276,7 @@ class GCloudFileStorageManager(FileStorageManager):
266
276
  if resp.status not in (200, 204, 404):
267
277
  if resp.status == 404:
268
278
  logger.error(
269
- f"Attempt to delete not found gcloud: {data}, "
270
- f"status: {resp.status}",
279
+ f"Attempt to delete not found gcloud: {data}, " f"status: {resp.status}",
271
280
  exc_info=True,
272
281
  )
273
282
  else:
@@ -275,9 +284,7 @@ class GCloudFileStorageManager(FileStorageManager):
275
284
  else:
276
285
  raise AttributeError("No valid uri")
277
286
 
278
- @backoff.on_exception(
279
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
280
- )
287
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
281
288
  async def _append(self, dm: FileDataManager, data, offset):
282
289
  if self.storage.session is None:
283
290
  raise AttributeError()
@@ -342,9 +349,7 @@ class GCloudFileStorageManager(FileStorageManager):
342
349
  break
343
350
  return count
344
351
 
345
- @backoff.on_exception(
346
- backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4
347
- )
352
+ @backoff.on_exception(backoff.expo, RETRIABLE_EXCEPTIONS, jitter=backoff.random_jitter, max_tries=4)
348
353
  async def finish(self, dm: FileDataManager):
349
354
  if dm.size == 0:
350
355
  if self.storage.session is None:
@@ -370,46 +375,6 @@ class GCloudFileStorageManager(FileStorageManager):
370
375
  await dm.finish()
371
376
  return path
372
377
 
373
- async def iter_data(self, uri, kbid: str, headers: Optional[dict[str, str]] = None):
374
- if self.storage.session is None:
375
- raise AttributeError()
376
- if headers is None:
377
- headers = {}
378
-
379
- url = "{}/{}/o/{}".format(
380
- self.storage.object_base_url,
381
- self.storage.get_bucket_name(kbid),
382
- quote_plus(uri),
383
- )
384
- headers_auth = await self.storage.get_access_headers()
385
- headers.update(headers_auth)
386
- async with self.storage.session.get(
387
- url, headers=headers, params={"alt": "media"}, timeout=-1
388
- ) as api_resp:
389
- if api_resp.status not in (200, 206):
390
- text = await api_resp.text()
391
- if api_resp.status == 404:
392
- raise CloudFileNotFound("Google cloud file not found")
393
- elif api_resp.status == 401:
394
- logger.warning(f"Invalid google cloud credentials error: {text}")
395
- raise HTTPNotFound(
396
- detail=f"Google cloud invalid credentials: {text}"
397
- )
398
- raise GoogleCloudException(f"{api_resp.status}: {text}")
399
- while True:
400
- chunk = await api_resp.content.read(1024 * 1024)
401
- if len(chunk) > 0:
402
- yield chunk
403
- else:
404
- break
405
-
406
- async def read_range(
407
- self, uri: str, kbid: str, start: int, end: int
408
- ) -> AsyncIterator[bytes]:
409
- """
410
- Iterate through ranges of data
411
- """
412
- async for chunk in self.iter_data(
413
- uri, kbid, headers={"Range": f"bytes={start}-{end - 1}"}
414
- ):
415
- yield chunk
378
+ def validate_intermediate_chunk(self, uploaded_bytes: int):
379
+ if uploaded_bytes < self.min_upload_size:
380
+ raise ValueError(f"Intermediate chunks cannot be smaller than {self.min_upload_size} bytes")
@@ -22,14 +22,13 @@ from __future__ import annotations
22
22
  import json
23
23
  import os
24
24
  import uuid
25
- from typing import AsyncIterator
25
+ from typing import Any
26
26
 
27
27
  import aiofiles
28
- from nucliadb_protos.resources_pb2 import CloudFile
29
28
 
30
29
  from nucliadb.writer.tus.dm import FileDataManager
31
- from nucliadb.writer.tus.exceptions import CloudFileNotFound
32
30
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
31
+ from nucliadb_protos.resources_pb2 import CloudFile
33
32
  from nucliadb_utils.storages import CHUNK_SIZE
34
33
 
35
34
 
@@ -50,51 +49,24 @@ class LocalFileStorageManager(FileStorageManager):
50
49
  bucket = self.storage.get_bucket_name(kbid)
51
50
  upload_file_id = dm.get("upload_file_id", str(uuid.uuid4()))
52
51
  init_url = self.get_file_path(bucket, upload_file_id)
53
- metadata_init_url = self.metadata_key(init_url)
54
52
  metadata = {
55
53
  "FILENAME": dm.filename,
56
54
  "CONTENT_TYPE": dm.content_type,
57
55
  "SIZE": dm.size,
58
56
  }
59
- async with aiofiles.open(metadata_init_url, "w+") as resp:
60
- await resp.write(json.dumps(metadata))
57
+ await self.set_metadata(kbid, upload_file_id, metadata)
61
58
 
62
59
  async with aiofiles.open(init_url, "wb+") as aio_fi:
63
60
  await aio_fi.write(b"")
64
61
 
65
- await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket)
62
+ await dm.update(upload_file_id=upload_file_id, path=path, bucket=bucket, kbid=kbid)
66
63
 
67
- async def iter_data(self, uri, kbid: str, headers=None):
68
- bucket = self.storage.get_bucket_name(kbid)
69
- file_path = self.get_file_path(bucket, uri)
70
- async with aiofiles.open(file_path) as resp:
71
- data = await resp.read(CHUNK_SIZE)
72
- while data is not None:
73
- yield data
74
- data = await resp.read(CHUNK_SIZE)
75
-
76
- async def read_range(
77
- self, uri: str, kbid: str, start: int, end: int
78
- ) -> AsyncIterator[bytes]:
79
- """
80
- Iterate through ranges of data
81
- """
64
+ async def set_metadata(self, kbid: str, upload_file_id: str, metadata: dict[str, Any]):
82
65
  bucket = self.storage.get_bucket_name(kbid)
83
- file_path = self.get_file_path(bucket, uri)
84
- try:
85
- async with aiofiles.open(file_path, "rb") as resp:
86
- await resp.seek(start)
87
- count = 0
88
- data = await resp.read(CHUNK_SIZE)
89
- while data and count < end:
90
- if count + len(data) > end:
91
- new_end = end - count
92
- data = data[:new_end]
93
- yield data
94
- count += len(data)
95
- data = await resp.read(CHUNK_SIZE)
96
- except FileNotFoundError:
97
- raise CloudFileNotFound()
66
+ init_url = self.get_file_path(bucket, upload_file_id)
67
+ metadata_init_url = self.metadata_key(init_url)
68
+ async with aiofiles.open(metadata_init_url, "w+") as resp:
69
+ await resp.write(json.dumps(metadata))
98
70
 
99
71
  async def append(self, dm: FileDataManager, iterable, offset) -> int:
100
72
  count = 0
@@ -118,6 +90,15 @@ class LocalFileStorageManager(FileStorageManager):
118
90
  upload_file_id = dm.get("upload_file_id")
119
91
  from_url = self.get_file_path(bucket, upload_file_id)
120
92
 
93
+ if dm.size > 0:
94
+ kbid = dm.get("kbid")
95
+ metadata = {
96
+ "FILENAME": dm.filename,
97
+ "CONTENT_TYPE": dm.content_type,
98
+ "SIZE": dm.size,
99
+ }
100
+ await self.set_metadata(kbid, upload_file_id, metadata)
101
+
121
102
  path = dm.get("path")
122
103
  to_url = self.get_file_path(bucket, path)
123
104
  to_url_dirs = os.path.dirname(to_url)
@@ -138,6 +119,9 @@ class LocalFileStorageManager(FileStorageManager):
138
119
  file_path = self.get_file_path(bucket, uri)
139
120
  os.remove(file_path)
140
121
 
122
+ def validate_intermediate_chunk(self, uploaded_bytes: int):
123
+ pass
124
+
141
125
 
142
126
  class LocalBlobStore(BlobStore):
143
127
  def __init__(self, local_testing_files: str):