nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -21,36 +21,48 @@ from datetime import datetime
21
21
  from typing import Optional, Union
22
22
 
23
23
  from google.protobuf.json_format import MessageToDict
24
- from nucliadb_protos.writer_pb2 import BrokerMessage
25
24
 
26
25
  import nucliadb_models as models
26
+ from nucliadb.common.models_utils import from_proto, to_proto
27
27
  from nucliadb.ingest.fields.conversation import Conversation
28
28
  from nucliadb.ingest.orm.resource import Resource as ORMResource
29
29
  from nucliadb.ingest.processing import PushPayload
30
30
  from nucliadb.writer import SERVICE_NAME
31
- from nucliadb.writer.layouts import serialize_blocks
32
31
  from nucliadb.writer.utilities import get_processing
33
- from nucliadb_models.common import FIELD_TYPES_MAP, FieldTypeName
32
+ from nucliadb_models.common import FieldTypeName
33
+ from nucliadb_models.content_types import GENERIC_MIME_TYPE
34
34
  from nucliadb_models.conversation import PushConversation
35
35
  from nucliadb_models.writer import (
36
- GENERIC_MIME_TYPE,
37
36
  CreateResourcePayload,
38
37
  UpdateResourcePayload,
39
38
  )
40
39
  from nucliadb_protos import resources_pb2
40
+ from nucliadb_protos.writer_pb2 import BrokerMessage
41
41
  from nucliadb_utils.storages.storage import StorageField
42
42
  from nucliadb_utils.utilities import get_storage
43
43
 
44
44
 
45
+ async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
46
+ processing = get_processing()
47
+
48
+ if field_pb.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
49
+ file_field = models.FileField(
50
+ language=field_pb.language,
51
+ password=field_pb.password,
52
+ file=models.File(payload=None, uri=field_pb.file.uri),
53
+ )
54
+ return processing.convert_external_filefield_to_str(file_field)
55
+ else:
56
+ storage = await get_storage(service_name=SERVICE_NAME)
57
+ return await processing.convert_internal_filefield_to_str(field_pb, storage)
58
+
59
+
45
60
  async def extract_file_field(
46
61
  field_id: str,
47
62
  resource: ORMResource,
48
63
  toprocess: PushPayload,
49
64
  password: Optional[str] = None,
50
65
  ):
51
- processing = get_processing()
52
- storage = await get_storage(service_name=SERVICE_NAME)
53
-
54
66
  field_type = resources_pb2.FieldType.FILE
55
67
  field = await resource.get_field(field_id, field_type)
56
68
  field_pb = await field.get_value()
@@ -60,9 +72,7 @@ async def extract_file_field(
60
72
  if password is not None:
61
73
  field_pb.password = password
62
74
 
63
- toprocess.filefield[field_id] = await processing.convert_internal_filefield_to_str(
64
- field_pb, storage
65
- )
75
+ toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
66
76
 
67
77
 
68
78
  async def extract_fields(resource: ORMResource, toprocess: PushPayload):
@@ -70,12 +80,11 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
70
80
  storage = await get_storage(service_name=SERVICE_NAME)
71
81
  await resource.get_fields()
72
82
  for (field_type, field_id), field in resource.fields.items():
73
- field_type_name = FIELD_TYPES_MAP[field_type]
83
+ field_type_name = from_proto.field_type_name(field_type)
74
84
 
75
85
  if field_type_name not in {
76
86
  FieldTypeName.TEXT,
77
87
  FieldTypeName.FILE,
78
- FieldTypeName.LAYOUT,
79
88
  FieldTypeName.CONVERSATION,
80
89
  FieldTypeName.LINK,
81
90
  }:
@@ -84,9 +93,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
84
93
  field_pb = await field.get_value()
85
94
 
86
95
  if field_type_name is FieldTypeName.FILE:
87
- toprocess.filefield[
88
- field_id
89
- ] = await processing.convert_internal_filefield_to_str(field_pb, storage)
96
+ toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
90
97
 
91
98
  if field_type_name is FieldTypeName.LINK:
92
99
  parsed_link = MessageToDict(
@@ -106,28 +113,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
106
113
  parsed_text["format"] = models.PushTextFormat[parsed_text["format"]]
107
114
  toprocess.textfield[field_id] = models.Text(**parsed_text)
108
115
 
109
- if field_type_name is FieldTypeName.LAYOUT:
110
- parsed_layout = MessageToDict(
111
- field_pb,
112
- preserving_proto_field_name=True,
113
- including_default_value_fields=True,
114
- )
115
- parsed_layout["format"] = resources_pb2.FieldLayout.Format.Value(
116
- parsed_layout["format"]
117
- )
118
-
119
- for blockid, block in parsed_layout["body"]["blocks"].items():
120
- cf = field_pb.body.blocks[blockid].file
121
- block["file"] = await processing.convert_internal_cf_to_str(cf, storage)
122
-
123
- parsed_layout["blocks"] = parsed_layout.get("body", {}).get("blocks", {})
124
- del parsed_layout["body"]
125
-
126
- toprocess.layoutfield[field_id] = models.LayoutDiff(**parsed_layout)
127
-
128
- if field_type_name is FieldTypeName.CONVERSATION and isinstance(
129
- field, Conversation
130
- ):
116
+ if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
131
117
  metadata = await field.get_metadata()
132
118
  if metadata.pages == 0:
133
119
  continue
@@ -148,14 +134,13 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
148
134
  await processing.convert_internal_cf_to_str(cf, storage)
149
135
  for cf in message.content.attachments
150
136
  ]
151
- parsed_message["content"][
152
- "format"
153
- ] = resources_pb2.MessageContent.Format.Value(
137
+ if "attachments_fields" in parsed_message["content"]:
138
+ # Not defined on the push payload
139
+ del parsed_message["content"]["attachments_fields"]
140
+ parsed_message["content"]["format"] = resources_pb2.MessageContent.Format.Value(
154
141
  parsed_message["content"]["format"]
155
142
  )
156
- full_conversation.messages.append(
157
- models.PushMessage(**parsed_message)
158
- )
143
+ full_conversation.messages.append(models.PushMessage(**parsed_message))
159
144
  toprocess.conversationfield[field_id] = full_conversation
160
145
 
161
146
 
@@ -168,9 +153,7 @@ async def parse_fields(
168
153
  x_skip_store: bool,
169
154
  ):
170
155
  for key, file_field in item.files.items():
171
- await parse_file_field(
172
- key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store
173
- )
156
+ await parse_file_field(key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store)
174
157
 
175
158
  for key, link_field in item.links.items():
176
159
  parse_link_field(key, link_field, writer, toprocess)
@@ -178,19 +161,8 @@ async def parse_fields(
178
161
  for key, text_field in item.texts.items():
179
162
  parse_text_field(key, text_field, writer, toprocess)
180
163
 
181
- for key, layout_field in item.layouts.items():
182
- await parse_layout_field(key, layout_field, writer, toprocess, kbid, uuid)
183
-
184
164
  for key, conversation_field in item.conversations.items():
185
- await parse_conversation_field(
186
- key, conversation_field, writer, toprocess, kbid, uuid
187
- )
188
-
189
- for key, datetime_field in item.datetimes.items():
190
- parse_datetime_field(key, datetime_field, writer, toprocess)
191
-
192
- for key, keywordset_field in item.keywordsets.items():
193
- parse_keywordset_field(key, keywordset_field, writer, toprocess)
165
+ await parse_conversation_field(key, conversation_field, writer, toprocess, kbid, uuid)
194
166
 
195
167
 
196
168
  def parse_text_field(
@@ -200,9 +172,7 @@ def parse_text_field(
200
172
  toprocess: PushPayload,
201
173
  ) -> None:
202
174
  writer.texts[key].body = text_field.body
203
- writer.texts[key].format = resources_pb2.FieldText.Format.Value(
204
- text_field.format.value
205
- )
175
+ writer.texts[key].format = resources_pb2.FieldText.Format.Value(text_field.format.value)
206
176
  etw = resources_pb2.ExtractedTextWrapper()
207
177
  etw.field.field = key
208
178
  etw.field.field_type = resources_pb2.FieldType.TEXT
@@ -317,84 +287,16 @@ def parse_link_field(
317
287
  if link_field.css_selector is not None:
318
288
  writer.links[key].css_selector = link_field.css_selector
319
289
 
290
+ if link_field.xpath is not None:
291
+ writer.links[key].xpath = link_field.xpath
292
+
320
293
  toprocess.linkfield[key] = models.LinkUpload(
321
294
  link=link_field.uri,
322
295
  headers=link_field.headers or {},
323
296
  cookies=link_field.cookies or {},
324
297
  localstorage=link_field.localstorage or {},
325
298
  css_selector=link_field.css_selector,
326
- )
327
-
328
-
329
- def parse_keywordset_field(
330
- key: str,
331
- keywordset_field: models.FieldKeywordset,
332
- writer: BrokerMessage,
333
- toprocess: PushPayload,
334
- ) -> None:
335
- if keywordset_field.keywords is None:
336
- return
337
-
338
- for keyword in keywordset_field.keywords:
339
- fieldpb = resources_pb2.Keyword()
340
- fieldpb.value = keyword.value
341
- writer.keywordsets[key].keywords.append(fieldpb)
342
-
343
-
344
- def parse_datetime_field(
345
- key: str,
346
- datetime_field: models.FieldDatetime,
347
- writer: BrokerMessage,
348
- toprocess: PushPayload,
349
- ) -> None:
350
- if datetime_field.value is None:
351
- return
352
-
353
- writer.datetimes[key].value.FromDatetime(datetime_field.value)
354
-
355
-
356
- async def parse_layout_field(
357
- key: str,
358
- layout_field: models.InputLayoutField,
359
- writer: BrokerMessage,
360
- toprocess: PushPayload,
361
- kbid: str,
362
- uuid: str,
363
- ) -> None:
364
- storage = await get_storage(service_name=SERVICE_NAME)
365
- processing = get_processing()
366
-
367
- lc: resources_pb2.FieldLayout = await serialize_blocks(
368
- layout_field, kbid, uuid, key, storage
369
- )
370
- writer.layouts[key].CopyFrom(lc)
371
-
372
- toprocess_blocks = {}
373
- for blockid, block in layout_field.body.blocks.items():
374
- sf_conv_field: StorageField = storage.layout_field(
375
- kbid, uuid, field=key, ident=block.ident
376
- )
377
- cf_conv_field = await storage.upload_b64file_to_cloudfile(
378
- sf_conv_field,
379
- block.file.payload.encode(),
380
- block.file.filename,
381
- block.file.content_type,
382
- block.file.md5,
383
- )
384
-
385
- toprocess_blocks[blockid] = models.PushLayoutBlock(
386
- x=block.x,
387
- y=block.y,
388
- cols=block.cols,
389
- rows=block.rows,
390
- type=block.type,
391
- ident=block.ident,
392
- payload=block.payload,
393
- file=await processing.convert_internal_cf_to_str(cf_conv_field, storage),
394
- )
395
-
396
- toprocess.layoutfield[key] = models.LayoutDiff(
397
- format=lc.format, blocks=toprocess_blocks # type: ignore
299
+ xpath=link_field.xpath,
398
300
  )
399
301
 
400
302
 
@@ -429,8 +331,16 @@ async def parse_conversation_field(
429
331
  )
430
332
 
431
333
  cm.content.text = message.content.text
432
- cm.content.format = resources_pb2.MessageContent.Format.Value(
433
- message.content.format.value
334
+ cm.content.format = resources_pb2.MessageContent.Format.Value(message.content.format.value)
335
+ cm.content.attachments_fields.extend(
336
+ [
337
+ resources_pb2.FieldRef(
338
+ field_type=to_proto.field_type_name(attachment.field_type),
339
+ field_id=attachment.field_id,
340
+ split=attachment.split if attachment.split is not None else "",
341
+ )
342
+ for attachment in message.content.attachments_fields
343
+ ]
434
344
  )
435
345
 
436
346
  for count, file in enumerate(message.content.attachments):
@@ -16,10 +16,9 @@
16
16
  #
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- from nucliadb_protos.resources_pb2 import Origin
20
-
21
19
  from nucliadb_models import Extra, InputOrigin
22
20
  from nucliadb_protos import resources_pb2
21
+ from nucliadb_protos.resources_pb2 import Origin
23
22
 
24
23
 
25
24
  def parse_origin(origin: Origin, origin_payload: InputOrigin):
@@ -19,7 +19,8 @@
19
19
  #
20
20
  from typing import Optional
21
21
 
22
- from pydantic import BaseSettings, Field
22
+ from pydantic import Field
23
+ from pydantic_settings import BaseSettings
23
24
 
24
25
 
25
26
  class Settings(BaseSettings):
@@ -32,10 +33,10 @@ class BackPressureSettings(BaseSettings):
32
33
  enabled: bool = Field(
33
34
  default=False,
34
35
  description="Enable or disable back pressure.",
35
- env=["back_pressure_enabled"],
36
+ alias="back_pressure_enabled",
36
37
  )
37
38
  indexing_rate: float = Field(
38
- default=2,
39
+ default=4,
39
40
  description="Estimation of the indexing rate in messages per second. This is used to calculate the try again in time", # noqa
40
41
  )
41
42
  ingest_rate: float = Field(
@@ -47,16 +48,20 @@ class BackPressureSettings(BaseSettings):
47
48
  description="Estimation of the processing rate in messages per second. This is used to calculate the try again in time", # noqa
48
49
  )
49
50
  max_indexing_pending: int = Field(
50
- default=100,
51
+ default=200,
51
52
  description="Max number of messages pending to index in a node queue before rate limiting writes. Set to 0 to disable indexing back pressure checks", # noqa
53
+ alias="back_pressure_max_indexing_pending",
52
54
  )
53
55
  max_ingest_pending: int = Field(
54
- default=1_000,
56
+ # Disabled by default
57
+ default=0,
55
58
  description="Max number of messages pending to be ingested by processed consumers before rate limiting writes. Set to 0 to disable ingest back pressure checks", # noqa
59
+ alias="back_pressure_max_ingest_pending",
56
60
  )
57
61
  max_processing_pending: int = Field(
58
62
  default=1000,
59
63
  description="Max number of messages pending to process per Knowledge Box before rate limiting writes. Set to 0 to disable processing back pressure checks", # noqa
64
+ alias="back_pressure_max_processing_pending",
60
65
  )
61
66
  indexing_check_interval: int = Field(
62
67
  default=30,
@@ -66,6 +71,10 @@ class BackPressureSettings(BaseSettings):
66
71
  default=30,
67
72
  description="Interval in seconds to check the ingest pending messages",
68
73
  )
74
+ max_wait_time: int = Field(
75
+ default=60,
76
+ description="Max time in seconds to wait before trying again after back pressure",
77
+ )
69
78
 
70
79
 
71
80
  settings = Settings()
@@ -23,10 +23,6 @@ from typing import Optional
23
23
  from nucliadb.writer.settings import settings as writer_settings
24
24
  from nucliadb.writer.tus.dm import FileDataManager, RedisFileDataManagerFactory
25
25
  from nucliadb.writer.tus.exceptions import ManagerNotAvailable
26
- from nucliadb.writer.tus.gcs import GCloudBlobStore, GCloudFileStorageManager
27
- from nucliadb.writer.tus.local import LocalBlobStore, LocalFileStorageManager
28
- from nucliadb.writer.tus.pg import PGBlobStore, PGFileStorageManager
29
- from nucliadb.writer.tus.s3 import S3BlobStore, S3FileStorageManager
30
26
  from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
31
27
  from nucliadb_utils.exceptions import ConfigurationError
32
28
  from nucliadb_utils.settings import FileBackendConfig, storage_settings
@@ -48,6 +44,8 @@ REDIS_FILE_DATA_MANAGER_FACTORY: Optional[RedisFileDataManagerFactory] = None
48
44
  async def initialize():
49
45
  global DRIVER
50
46
  if storage_settings.file_backend == FileBackendConfig.GCS:
47
+ from nucliadb.writer.tus.gcs import GCloudBlobStore, GCloudFileStorageManager
48
+
51
49
  storage_backend = GCloudBlobStore()
52
50
 
53
51
  await storage_backend.initialize(
@@ -64,6 +62,8 @@ async def initialize():
64
62
  DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
65
63
 
66
64
  elif storage_settings.file_backend == FileBackendConfig.S3:
65
+ from nucliadb.writer.tus.s3 import S3BlobStore, S3FileStorageManager
66
+
67
67
  storage_backend = S3BlobStore()
68
68
 
69
69
  await storage_backend.initialize(
@@ -83,6 +83,8 @@ async def initialize():
83
83
  DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
84
84
 
85
85
  elif storage_settings.file_backend == FileBackendConfig.LOCAL:
86
+ from nucliadb.writer.tus.local import LocalBlobStore, LocalFileStorageManager
87
+
86
88
  storage_backend = LocalBlobStore(storage_settings.local_files)
87
89
 
88
90
  await storage_backend.initialize()
@@ -91,12 +93,18 @@ async def initialize():
91
93
 
92
94
  DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
93
95
 
94
- elif storage_settings.file_backend == FileBackendConfig.PG:
95
- storage_backend = PGBlobStore(storage_settings.driver_pg_url)
96
+ elif storage_settings.file_backend == FileBackendConfig.AZURE:
97
+ from nucliadb.writer.tus.azure import AzureBlobStore, AzureFileStorageManager
96
98
 
97
- await storage_backend.initialize()
99
+ if storage_settings.azure_account_url is None:
100
+ raise ConfigurationError("AZURE_ACCOUNT_URL env variable not configured")
98
101
 
99
- storage_manager = PGFileStorageManager(storage_backend)
102
+ storage_backend = AzureBlobStore()
103
+ await storage_backend.initialize(
104
+ storage_settings.azure_account_url,
105
+ connection_string=storage_settings.azure_connection_string,
106
+ )
107
+ storage_manager = AzureFileStorageManager(storage_backend)
100
108
 
101
109
  DRIVER = TusStorageDriver(backend=storage_backend, manager=storage_manager)
102
110
 
@@ -117,7 +125,7 @@ async def finalize():
117
125
  REDIS_FILE_DATA_MANAGER_FACTORY = None
118
126
 
119
127
 
120
- def get_dm() -> FileDataManager: # type: ignore
128
+ def get_dm() -> FileDataManager:
121
129
  if writer_settings.dm_enabled:
122
130
  global REDIS_FILE_DATA_MANAGER_FACTORY
123
131
  if REDIS_FILE_DATA_MANAGER_FACTORY is None:
@@ -136,9 +144,3 @@ def get_storage_manager() -> FileStorageManager:
136
144
  if DRIVER is None:
137
145
  raise ManagerNotAvailable()
138
146
  return DRIVER.manager
139
-
140
-
141
- def clear_storage():
142
- global DRIVER
143
-
144
- DRIVER = None
@@ -0,0 +1,111 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from __future__ import annotations
21
+
22
+ from typing import Optional
23
+
24
+ from nucliadb.writer import logger
25
+ from nucliadb.writer.tus.dm import FileDataManager
26
+ from nucliadb.writer.tus.storage import BlobStore, FileStorageManager
27
+ from nucliadb_protos.resources_pb2 import CloudFile
28
+ from nucliadb_utils.storages import CHUNK_SIZE
29
+ from nucliadb_utils.storages.azure import AzureObjectStore
30
+ from nucliadb_utils.storages.exceptions import ObjectNotFoundError
31
+ from nucliadb_utils.storages.utils import ObjectMetadata
32
+
33
+
34
class AzureBlobStore(BlobStore):
    """Blob store used by the TUS upload layer, backed by an ``AzureObjectStore``.

    The object store is created in :meth:`initialize` and released in
    :meth:`finalize`.
    """

    # Declared at class level so that ``finalize()`` and ``object_store`` behave
    # predictably (no-op / AssertionError) on an instance that was never
    # initialized, instead of failing with AttributeError.
    _object_store: Optional[AzureObjectStore] = None

    async def finalize(self):
        """Close the underlying object store, if there is one.

        Exceptions raised while closing are logged and swallowed (best effort);
        the reference to the object store is dropped afterwards.
        """
        if self._object_store is None:
            return
        try:
            await self._object_store.finalize()
        except Exception:
            logger.exception("Error closing AzureBlobStore")
        self._object_store = None

    async def initialize(self, account_url: str, connection_string: Optional[str] = None):
        """Configure the store and initialize its ``AzureObjectStore``.

        Args:
            account_url: passed positionally to ``AzureObjectStore``.
            connection_string: optional, forwarded as ``connection_string``.
        """
        # Bucket name template; "{kbid}" is a placeholder filled in elsewhere.
        self.bucket = "nucliadb-{kbid}"
        self.source = CloudFile.Source.AZURE
        self._object_store = AzureObjectStore(account_url, connection_string=connection_string)
        await self._object_store.initialize()

    @property
    def object_store(self) -> AzureObjectStore:
        """Return the object store; asserts that ``initialize()`` has set it."""
        assert self._object_store is not None
        return self._object_store

    async def check_exists(self, bucket_name: str) -> bool:
        """Return the result of the object store's ``bucket_exists`` check."""
        return await self.object_store.bucket_exists(bucket_name)

    async def create_bucket(self, bucket_name: str) -> bool:
        """Create ``bucket_name`` through the object store.

        Returns the negation of what ``bucket_create`` reports.
        NOTE(review): presumably this means "the bucket already existed";
        confirm against the ``BlobStore.create_bucket`` contract.
        """
        created = await self.object_store.bucket_create(bucket_name)
        return not created
61
+
62
+
63
class AzureFileStorageManager(FileStorageManager):
    """TUS file storage manager that writes uploads through an ``AzureBlobStore``."""

    storage: AzureBlobStore
    chunk_size = CHUNK_SIZE
    min_upload_size = None

    @property
    def object_store(self) -> AzureObjectStore:
        """Object store exposed by the underlying ``AzureBlobStore``."""
        return self.storage.object_store

    async def start(self, dm: FileDataManager, path: str, kbid: str):
        """Begin a multipart upload at ``path`` and record its location in ``dm``."""
        bucket = self.storage.get_bucket_name(kbid)
        # A filename equal to 0 is replaced by the generic name "file".
        filename = "file" if dm.filename == 0 else dm.filename
        await self.object_store.upload_multipart_start(
            bucket,
            path,
            ObjectMetadata(
                filename=filename,
                content_type=dm.content_type,
                size=dm.size,
            ),
        )
        await dm.update(path=path, bucket=bucket)

    async def delete_upload(self, uri: str, kbid: str) -> None:
        """Delete the object at ``uri``; a missing object is only logged as a warning."""
        bucket = self.storage.get_bucket_name(kbid)
        try:
            await self.object_store.delete(bucket, uri)
        except ObjectNotFoundError:
            log_context = {"uri": uri, "kbid": kbid, "bucket": bucket}
            logger.warning("Attempt to delete an upload but not found", extra=log_context)

    async def append(self, dm: FileDataManager, iterable, offset: int) -> int:
        """Append ``iterable`` to the upload tracked by ``dm`` and store ``offset``.

        Returns the value reported by the object store's
        ``upload_multipart_append`` call.
        """
        target_bucket = dm.get("bucket")
        assert target_bucket is not None
        target_path = dm.get("path")
        assert target_path is not None
        written = await self.object_store.upload_multipart_append(
            target_bucket, target_path, iterable
        )
        await dm.update(offset=offset)
        return written

    async def finish(self, dm: FileDataManager):
        """Finish the upload in ``dm`` and return the path it was stored under."""
        # Read the path before calling dm.finish(), as the original does.
        uploaded_path = dm.get("path")
        await dm.finish()
        return uploaded_path

    def validate_intermediate_chunk(self, uploaded_bytes: int):
        """No validation is performed on intermediate chunks."""
        return None
nucliadb/writer/tus/dm.py CHANGED
@@ -20,6 +20,7 @@
20
20
  import time
21
21
  from typing import Any, Optional
22
22
 
23
+ import backoff
23
24
  import orjson
24
25
  from redis import asyncio as aioredis
25
26
  from starlette.requests import Request
@@ -33,6 +34,11 @@ class NoRedisConfigured(Exception):
33
34
  pass
34
35
 
35
36
 
37
+ RETRIABLE_REDIS_ERRORS = (
38
+ aioredis.ConnectionError,
39
+ aioredis.TimeoutError,
40
+ )
41
+
36
42
  DATA: dict[str, Any] = {}
37
43
 
38
44
 
@@ -59,10 +65,7 @@ class FileDataManager:
59
65
  # someone else
60
66
  last_activity: Optional[int] = self._data.get("last_activity")
61
67
  if last_activity and (time.time() - last_activity) < self._ttl:
62
- if (
63
- request.headers
64
- and request.headers.get("tus-override-upload", "0") != "1"
65
- ):
68
+ if request.headers and request.headers.get("tus-override-upload", "0") != "1":
66
69
  raise HTTPPreconditionFailed(
67
70
  detail="There is already an active tusupload that conflicts with this one."
68
71
  )
@@ -136,7 +139,7 @@ class RedisFileDataManagerFactory:
136
139
 
137
140
  async def finalize(self):
138
141
  try:
139
- await self.redis.close(close_connection_pool=True)
142
+ await self.redis.aclose(close_connection_pool=True)
140
143
  except Exception:
141
144
  logger.warning("Error closing redis connection", exc_info=True)
142
145
  pass
@@ -146,6 +149,9 @@ class RedisFileDataManager(FileDataManager):
146
149
  def __init__(self, redis: aioredis.Redis):
147
150
  self.redis = redis
148
151
 
152
+ @backoff.on_exception(
153
+ backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
154
+ )
149
155
  async def load(self, key):
150
156
  # preload data
151
157
  self.key = key
@@ -157,6 +163,9 @@ class RedisFileDataManager(FileDataManager):
157
163
  self._data = orjson.loads(data)
158
164
  self._loaded = True
159
165
 
166
+ @backoff.on_exception(
167
+ backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
168
+ )
160
169
  async def save(self):
161
170
  if self.key is None:
162
171
  raise Exception("Not initialized")
@@ -164,6 +173,9 @@ class RedisFileDataManager(FileDataManager):
164
173
  value = orjson.dumps(self._data)
165
174
  await self.redis.set(self.key, value, ex=self._ttl)
166
175
 
176
+ @backoff.on_exception(
177
+ backoff.expo, RETRIABLE_REDIS_ERRORS, jitter=backoff.random_jitter, max_tries=3
178
+ )
167
179
  async def _delete_key(self):
168
180
  if self.key is None:
169
181
  raise Exception("Not initialized")
@@ -31,9 +31,7 @@ class HTTPException(StarletteHTTPException):
31
31
 
32
32
  def __init__(self, detail: Optional[str] = None):
33
33
  if self._status_code:
34
- super(HTTPException, self).__init__(
35
- status_code=self._status_code, detail=detail
36
- )
34
+ super(HTTPException, self).__init__(status_code=self._status_code, detail=detail)
37
35
  else:
38
36
  raise AttributeError("Status code not defined")
39
37