nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -47,17 +47,19 @@ class MigrationsDataManager:
47
47
  self.driver = driver
48
48
 
49
49
  async def schedule_all_kbs(self, target_version: int) -> None:
50
+ # Get all kb ids
51
+ async with self.driver.transaction(read_only=True) as txn:
52
+ kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
53
+ # Schedule the migrations
50
54
  async with self.driver.transaction() as txn:
51
- async for kbid, _ in datamanagers.kb.get_kbs(txn):
52
- await txn.set(
53
- MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode()
54
- )
55
+ for kbid in kbids:
56
+ await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
55
57
  await txn.commit()
56
58
 
57
- async def get_kb_migrations(self, limit: int = 100) -> list[str]:
59
+ async def get_kb_migrations(self) -> list[str]:
58
60
  keys = []
59
61
  async with self.driver.transaction() as txn:
60
- async for key in txn.keys(MIGRATIONS_CONTAINER_KEY, count=limit):
62
+ async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
61
63
  keys.append(key.split("/")[-1])
62
64
 
63
65
  return keys
@@ -68,7 +70,7 @@ class MigrationsDataManager:
68
70
  await txn.commit()
69
71
 
70
72
  async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
71
- async with self.driver.transaction() as txn:
73
+ async with self.driver.transaction(read_only=True) as txn:
72
74
  kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
73
75
  if kb_config is None:
74
76
  return None
@@ -76,7 +78,7 @@ class MigrationsDataManager:
76
78
 
77
79
  async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
78
80
  async with self.driver.transaction() as txn:
79
- kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
81
+ kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
80
82
  if kb_config is None:
81
83
  raise Exception(f"KB {kbid} does not exist")
82
84
  kb_config.migration_version = current_version
@@ -84,15 +86,13 @@ class MigrationsDataManager:
84
86
  await txn.commit()
85
87
 
86
88
  async def get_global_info(self) -> GlobalInfo:
87
- async with self.driver.transaction() as txn:
89
+ async with self.driver.transaction(read_only=True) as txn:
88
90
  raw_pb = await txn.get(MIGRATION_INFO_KEY)
89
91
  if raw_pb is None:
90
92
  return GlobalInfo(current_version=0, target_version=None)
91
93
  pb = migrations_pb2.MigrationInfo()
92
94
  pb.ParseFromString(raw_pb)
93
- return GlobalInfo(
94
- current_version=pb.current_version, target_version=pb.target_version
95
- )
95
+ return GlobalInfo(current_version=pb.current_version, target_version=pb.target_version)
96
96
 
97
97
  async def update_global_info(
98
98
  self,
@@ -101,7 +101,7 @@ class MigrationsDataManager:
101
101
  target_version: Union[int, None, _Unset] = _UNSET,
102
102
  ) -> None:
103
103
  async with self.driver.transaction() as txn:
104
- raw_pb = await txn.get(MIGRATION_INFO_KEY)
104
+ raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
105
105
  pb = migrations_pb2.MigrationInfo()
106
106
  if raw_pb is not None:
107
107
  pb.ParseFromString(raw_pb)
@@ -22,22 +22,20 @@ import logging
22
22
  from typing import Optional
23
23
 
24
24
  from nucliadb.common import locking
25
- from nucliadb.common.cluster.rollover import rollover_kb_shards
25
+ from nucliadb.common.cluster.rollover import rollover_kb_index
26
+ from nucliadb.common.cluster.settings import in_standalone_mode
27
+ from nucliadb.common.maindb.pg import PGDriver
26
28
  from nucliadb.migrator.context import ExecutionContext
27
- from nucliadb.migrator.utils import get_migrations
29
+ from nucliadb.migrator.utils import get_migrations, get_pg_migrations
28
30
  from nucliadb_telemetry import errors, metrics
29
31
 
30
- migration_observer = metrics.Observer(
31
- "nucliadb_migrations", labels={"type": "kb", "target_version": ""}
32
- )
32
+ migration_observer = metrics.Observer("nucliadb_migrations", labels={"type": "kb", "target_version": ""})
33
33
 
34
34
 
35
35
  logger = logging.getLogger(__name__)
36
36
 
37
37
 
38
- async def run_kb_migrations(
39
- context: ExecutionContext, kbid: str, target_version: int
40
- ) -> None:
38
+ async def run_kb_migrations(context: ExecutionContext, kbid: str, target_version: int) -> None:
41
39
  async with locking.distributed_lock(f"migration-{kbid}"):
42
40
  kb_info = await context.data_manager.get_kb_info(kbid)
43
41
  if kb_info is None:
@@ -45,9 +43,7 @@ async def run_kb_migrations(
45
43
  await context.data_manager.delete_kb_migration(kbid=kbid)
46
44
  return
47
45
 
48
- migrations = get_migrations(
49
- from_version=kb_info.current_version, to_version=target_version
50
- )
46
+ migrations = get_migrations(from_version=kb_info.current_version, to_version=target_version)
51
47
 
52
48
  for migration in migrations:
53
49
  migration_info = {
@@ -57,15 +53,11 @@ async def run_kb_migrations(
57
53
  }
58
54
 
59
55
  try:
60
- logger.warning("Migrating KB", extra=migration_info)
61
- with migration_observer(
62
- {"type": "kb", "target_version": str(migration.version)}
63
- ):
64
- await migration.module.migrate_kb(context, kbid) # type: ignore
65
- logger.warning("Finished KB Migration", extra=migration_info)
66
- await context.data_manager.update_kb_info(
67
- kbid=kbid, current_version=migration.version
68
- )
56
+ logger.info("Migrating KB", extra=migration_info)
57
+ with migration_observer({"type": "kb", "target_version": str(migration.version)}):
58
+ await migration.module.migrate_kb(context, kbid)
59
+ logger.info("Finished KB Migration", extra=migration_info)
60
+ await context.data_manager.update_kb_info(kbid=kbid, current_version=migration.version)
69
61
  except Exception as exc:
70
62
  errors.capture_exception(exc)
71
63
  logger.exception("Failed to migrate KB", extra=migration_info)
@@ -73,9 +65,7 @@ async def run_kb_migrations(
73
65
 
74
66
  refreshed_kb_info = await context.data_manager.get_kb_info(kbid=kbid)
75
67
  if refreshed_kb_info is None:
76
- logger.warning(
77
- "KB not found. This should not happen.", extra={"kbid": kbid}
78
- )
68
+ logger.warning("KB not found. This should not happen.", extra={"kbid": kbid})
79
69
  return
80
70
  assert refreshed_kb_info.current_version == target_version
81
71
 
@@ -87,12 +77,14 @@ async def run_all_kb_migrations(context: ExecutionContext, target_version: int)
87
77
  Schedule all KB migrations to run in parallel. Only a certain number of migrations will run at the same time.
88
78
  If any of the migrations fail, the whole process will fail.
89
79
  """
90
- to_migrate = await context.data_manager.get_kb_migrations(limit=-1)
80
+ to_migrate = await context.data_manager.get_kb_migrations()
91
81
 
92
82
  if len(to_migrate) == 0:
93
83
  return
94
-
95
- max_concurrent = context.settings.max_concurrent_migrations
84
+ if in_standalone_mode():
85
+ max_concurrent = 1
86
+ else:
87
+ max_concurrent = context.settings.max_concurrent_migrations
96
88
  semaphore = asyncio.Semaphore(max_concurrent)
97
89
 
98
90
  logger.info(
@@ -150,15 +142,11 @@ async def run_global_migrations(context: ExecutionContext, target_version: int)
150
142
  "to_version": migration.version,
151
143
  }
152
144
  try:
153
- logger.warning("Migrating", extra=migration_info)
154
- with migration_observer(
155
- {"type": "global", "target_version": str(migration.version)}
156
- ):
157
- await migration.module.migrate(context) # type: ignore
158
- await context.data_manager.update_global_info(
159
- current_version=migration.version
160
- )
161
- logger.warning("Finished migration", extra=migration_info)
145
+ logger.info("Migrating", extra=migration_info)
146
+ with migration_observer({"type": "global", "target_version": str(migration.version)}):
147
+ await migration.module.migrate(context)
148
+ await context.data_manager.update_global_info(current_version=migration.version)
149
+ logger.info("Finished migration", extra=migration_info)
162
150
  except Exception as exc:
163
151
  errors.capture_exception(exc)
164
152
  logger.exception("Failed to migrate", extra=migration_info)
@@ -174,7 +162,7 @@ async def run_rollover_in_parallel(
174
162
  ) -> None:
175
163
  async with max_concurrent:
176
164
  try:
177
- await rollover_kb_shards(context, kbid)
165
+ await rollover_kb_index(context, kbid)
178
166
  await context.data_manager.delete_kb_rollover(kbid=kbid)
179
167
  except Exception as exc:
180
168
  errors.capture_exception(exc)
@@ -218,8 +206,40 @@ async def run_rollovers(context: ExecutionContext) -> None:
218
206
  raise Exception(f"Failed to migrate KBs. Failures: {failures}")
219
207
 
220
208
 
209
+ async def run_pg_schema_migrations(driver: PGDriver):
210
+ migrations = get_pg_migrations()
211
+
212
+ # The migration uses two transactions. The former is only used to get a lock (pg_advisory_xact_lock)
213
+ # without having to worry about correctly unlocking it (postgres unlocks it when the transaction ends)
214
+ async with driver.transaction() as tx_lock, tx_lock.connection.cursor() as cur_lock: # type: ignore[attr-defined]
215
+ await cur_lock.execute(
216
+ "CREATE TABLE IF NOT EXISTS migrations (version INT PRIMARY KEY, migrated_at TIMESTAMP NOT NULL DEFAULT NOW())"
217
+ )
218
+ await tx_lock.commit()
219
+ await cur_lock.execute("SELECT pg_advisory_xact_lock(3116614845278015934)")
220
+
221
+ await cur_lock.execute("SELECT version FROM migrations")
222
+ migrated = [r[0] for r in await cur_lock.fetchall()]
223
+
224
+ for version, migration in migrations:
225
+ if version in migrated:
226
+ continue
227
+
228
+ # Gets a new transaction for each migration, so if they get interrupted we at least
229
+ # save the state of the last finished transaction
230
+ async with driver.transaction() as tx, tx.connection.cursor() as cur: # type: ignore[attr-defined]
231
+ await migration.migrate(tx)
232
+ await cur.execute("INSERT INTO migrations (version) VALUES (%s)", (version,))
233
+ await tx.commit()
234
+
235
+
221
236
  async def run(context: ExecutionContext, target_version: Optional[int] = None) -> None:
222
- async with locking.distributed_lock("migration"):
237
+ # Run schema migrations first, since they create the `resources` table needed for the lock below
238
+ # Schema migrations use their own locking system
239
+ if isinstance(context.kv_driver, PGDriver):
240
+ await run_pg_schema_migrations(context.kv_driver)
241
+
242
+ async with locking.distributed_lock(locking.MIGRATIONS_LOCK):
223
243
  # before we move to managed migrations, see if there are any rollovers
224
244
  # scheduled and run them
225
245
  await run_rollovers(context)
@@ -20,9 +20,10 @@
20
20
  from typing import Optional
21
21
 
22
22
  import pydantic
23
+ import pydantic_settings
23
24
 
24
25
 
25
- class Settings(pydantic.BaseSettings):
26
+ class Settings(pydantic_settings.BaseSettings):
26
27
  redis_url: Optional[str] = None
27
28
  max_concurrent_migrations: int = pydantic.Field(
28
29
  default=5,
@@ -17,13 +17,12 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import importlib
20
21
  import logging
21
22
  import os
22
23
  import types
23
24
  from functools import lru_cache
24
25
 
25
- import migrations
26
-
27
26
  from .models import Migration
28
27
 
29
28
  logger = logging.getLogger(__name__)
@@ -33,14 +32,27 @@ MIGRATION_DIR = os.path.sep.join(
33
32
  )
34
33
 
35
34
 
35
+ def get_pg_migrations() -> list[tuple[int, types.ModuleType]]:
36
+ output = []
37
+ for filename in os.listdir(os.path.join(MIGRATION_DIR, "pg")):
38
+ if filename.endswith(".py") and filename != "__init__.py":
39
+ module_name = filename[:-3]
40
+ version = int(module_name.split("_")[0])
41
+ module = importlib.import_module(f"migrations.pg.{module_name}")
42
+ if not hasattr(module, "migrate"):
43
+ raise Exception(f"Missing `migrate` function in {module_name}")
44
+ output.append((version, module))
45
+ output.sort()
46
+ return output
47
+
48
+
36
49
  def get_migration_modules() -> list[tuple[types.ModuleType, int]]:
37
50
  output = []
38
51
  for filename in os.listdir(MIGRATION_DIR):
39
52
  if filename.endswith(".py") and filename != "__init__.py":
40
53
  module_name = filename[:-3]
41
54
  version = int(module_name.split("_")[0])
42
- __import__(f"migrations.{module_name}")
43
- module = getattr(migrations, module_name)
55
+ module = importlib.import_module(f"migrations.{module_name}")
44
56
  if not hasattr(module, "migrate"):
45
57
  raise Exception(f"Missing `migrate` function in {module_name}")
46
58
  if not hasattr(module, "migrate_kb"):
@@ -49,17 +61,13 @@ def get_migration_modules() -> list[tuple[types.ModuleType, int]]:
49
61
  return output
50
62
 
51
63
 
52
- def get_migrations(
53
- from_version: int = 0, to_version: int = 99999999
54
- ) -> list[Migration]:
64
+ def get_migrations(from_version: int = 0, to_version: int = 99999999) -> list[Migration]:
55
65
  migrations: list[Migration] = []
56
66
  for module, version in get_migration_modules():
57
67
  migrations.append(Migration(version=version, module=module))
58
68
 
59
69
  migrations.sort(key=lambda m: m.version)
60
- return [
61
- m for m in migrations if m.version > from_version and m.version <= to_version
62
- ]
70
+ return [m for m in migrations if m.version > from_version and m.version <= to_version]
63
71
 
64
72
 
65
73
  @lru_cache(maxsize=None)
@@ -18,10 +18,9 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
+ import importlib.metadata
21
22
  from typing import AsyncGenerator
22
23
 
23
- import pkg_resources
24
-
25
24
  from nucliadb.common.cluster.exceptions import NodeError, ShardNotFound
26
25
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
27
26
  from nucliadb.common.maindb.driver import Driver
@@ -31,6 +30,9 @@ from nucliadb.ingest.orm.knowledgebox import (
31
30
  KB_TO_DELETE,
32
31
  KB_TO_DELETE_BASE,
33
32
  KB_TO_DELETE_STORAGE_BASE,
33
+ KB_VECTORSET_TO_DELETE,
34
+ KB_VECTORSET_TO_DELETE_BASE,
35
+ RESOURCE_TO_DELETE_STORAGE_BASE,
34
36
  KnowledgeBox,
35
37
  )
36
38
  from nucliadb_telemetry import errors
@@ -41,7 +43,7 @@ from nucliadb_utils.utilities import get_storage
41
43
 
42
44
  async def _iter_keys(driver: Driver, match: str) -> AsyncGenerator[str, None]:
43
45
  async with driver.transaction(read_only=True) as keys_txn:
44
- async for key in keys_txn.keys(match=match, count=-1):
46
+ async for key in keys_txn.keys(match=match):
45
47
  yield key
46
48
 
47
49
 
@@ -52,9 +54,7 @@ async def purge_kb(driver: Driver):
52
54
  try:
53
55
  kbid = key.split("/")[2]
54
56
  except Exception:
55
- logger.warning(
56
- f" X Skipping purge {key}, wrong key format, expected {KB_TO_DELETE_BASE}"
57
- )
57
+ logger.warning(f" X Skipping purge {key}, wrong key format, expected {KB_TO_DELETE_BASE}")
58
58
  continue
59
59
 
60
60
  try:
@@ -62,15 +62,11 @@ async def purge_kb(driver: Driver):
62
62
  logger.info(f" √ Successfully Purged {kbid}")
63
63
  except ShardNotFound as exc:
64
64
  errors.capture_exception(exc)
65
- logger.error(
66
- f" X At least one shard was unavailable while purging {kbid}, skipping"
67
- )
65
+ logger.error(f" X At least one shard was unavailable while purging {kbid}, skipping")
68
66
  continue
69
67
  except NodeError as exc:
70
68
  errors.capture_exception(exc)
71
- logger.error(
72
- f" X At least one node was unavailable while purging {kbid}, skipping"
73
- )
69
+ logger.error(f" X At least one node was unavailable while purging {kbid}, skipping")
74
70
  continue
75
71
 
76
72
  except Exception as exc:
@@ -82,10 +78,10 @@ async def purge_kb(driver: Driver):
82
78
 
83
79
  # Now delete the tikv delete mark
84
80
  try:
85
- txn = await driver.begin()
86
- key_to_purge = KB_TO_DELETE.format(kbid=kbid)
87
- await txn.delete(key_to_purge)
88
- await txn.commit()
81
+ async with driver.transaction() as txn:
82
+ key_to_purge = KB_TO_DELETE.format(kbid=kbid)
83
+ await txn.delete(key_to_purge)
84
+ await txn.commit()
89
85
  logger.info(f" √ Deleted {key_to_purge}")
90
86
  except Exception as exc:
91
87
  errors.capture_exception(exc)
@@ -112,16 +108,12 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
112
108
 
113
109
  delete_marker = False
114
110
  if conflict:
115
- logger.info(
116
- f" . Nothing was deleted for {key}, (Bucket not yet empty), will try next time"
117
- )
111
+ logger.info(f" . Nothing was deleted for {key}, (Bucket not yet empty), will try next time")
118
112
  # Just in case something failed while setting a lifecycle policy to
119
113
  # remove all elements from the bucket, reschedule it
120
114
  await storage.schedule_delete_kb(kbid)
121
115
  elif not deleted:
122
- logger.info(
123
- f" ! Expected bucket for {key} was not found, will delete marker"
124
- )
116
+ logger.info(f" ! Expected bucket for {key} was not found, will delete marker")
125
117
  delete_marker = True
126
118
  elif deleted:
127
119
  logger.info(" √ Bucket successfully deleted")
@@ -129,19 +121,122 @@ async def purge_kb_storage(driver: Driver, storage: Storage):
129
121
 
130
122
  if delete_marker:
131
123
  try:
132
- txn = await driver.begin()
133
- await txn.delete(key)
124
+ async with driver.transaction() as txn:
125
+ await txn.delete(key)
126
+ await txn.commit()
134
127
  logger.info(f" √ Deleted storage deletion marker {key}")
135
128
  except Exception as exc:
136
129
  errors.capture_exception(exc)
137
130
  logger.info(f" X Error while deleting key {key}")
138
- await txn.abort()
139
- else:
140
- await txn.commit()
141
131
 
142
132
  logger.info("FINISH PURGING KB STORAGE")
143
133
 
144
134
 
135
+ async def purge_deleted_resource_storage(driver: Driver, storage: Storage) -> None:
136
+ """
137
+ Remove from storage all resources marked as deleted.
138
+
139
+ Purges batch by batch until nothing is left to purge or the task is cancelled; returns nothing.
140
+ """
141
+ logger.info("Starting purge of deleted resource storage")
142
+ to_purge = await _count_resources_storage_to_purge(driver)
143
+ logger.info(f"Found {to_purge} resources to purge")
144
+ while True:
145
+ try:
146
+ purged = await _purge_resources_storage_batch(driver, storage, batch_size=100)
147
+ if not purged:
148
+ logger.info("No more resources to purge found")
149
+ return
150
+ logger.info(f"Purged {purged} resources")
151
+
152
+ except asyncio.CancelledError:
153
+ logger.info("Purge of deleted resource storage was cancelled")
154
+ return
155
+
156
+
157
+ async def _count_resources_storage_to_purge(driver: Driver) -> int:
158
+ """
159
+ Count the number of resources marked as deleted in storage.
160
+ """
161
+ async with driver.transaction(read_only=True) as txn:
162
+ return await txn.count(match=RESOURCE_TO_DELETE_STORAGE_BASE)
163
+
164
+
165
+ async def _purge_resources_storage_batch(driver: Driver, storage: Storage, batch_size: int = 100) -> int:
166
+ """
167
+ Remove from storage a batch of resources marked as deleted. Returns the
168
+ number of resources purged.
169
+ """
170
+ # Get the keys of the resources to delete, at most `batch_size` per call
171
+ to_delete_batch = []
172
+ async with driver.transaction(read_only=True) as txn:
173
+ async for key in txn.keys(match=RESOURCE_TO_DELETE_STORAGE_BASE, count=batch_size):
174
+ to_delete_batch.append(key)
175
+
176
+ if not to_delete_batch:
177
+ return 0
178
+
179
+ # Delete the resources blobs from storage
180
+ logger.info(f"Purging {len(to_delete_batch)} deleted resources")
181
+ tasks = []
182
+ for key in to_delete_batch:
183
+ kbid, resource_id = key.split("/")[-2:]
184
+ tasks.append(asyncio.create_task(storage.delete_resource(kbid, resource_id)))
185
+ await asyncio.gather(*tasks)
186
+
187
+ # Delete the schedule-to-delete keys
188
+ async with driver.transaction() as txn:
189
+ for key in to_delete_batch:
190
+ await txn.delete(key)
191
+ await txn.commit()
192
+
193
+ return len(to_delete_batch)
194
+
195
+
196
+ async def purge_kb_vectorsets(driver: Driver, storage: Storage):
197
+ """Vectors for a vectorset are stored in a key inside each resource. Iterate
198
+ through all resources of the KB and remove any storage object containing
199
+ vectors for the specific vectorset to purge.
200
+
201
+ """
202
+ logger.info("START PURGING KB VECTORSETS")
203
+
204
+ purged = []
205
+ async for key in _iter_keys(driver, KB_VECTORSET_TO_DELETE_BASE):
206
+ logger.info(f"Purging vectorsets {key}")
207
+ try:
208
+ _base, kbid, vectorset = key.lstrip("/").split("/")
209
+ except ValueError:
210
+ logger.info(f" X Skipping purge {key}, wrong key format, expected {KB_VECTORSET_TO_DELETE}")
211
+ continue
212
+
213
+ try:
214
+ async with driver.transaction(read_only=True) as txn:
215
+ kb = KnowledgeBox(txn, storage, kbid)
216
+ async for resource in kb.iterate_resources():
217
+ fields = await resource.get_fields(force=True)
218
+ # we don't need the maindb transaction anymore to remove vectors from storage
219
+ for field in fields.values():
220
+ await field.delete_vectors(vectorset)
221
+ except Exception as exc:
222
+ errors.capture_exception(exc)
223
+ logger.error(
224
+ f" X ERROR while executing KB vectorset purge, skipping",
225
+ exc_info=exc,
226
+ extra={"kbid": kbid},
227
+ )
228
+ continue
229
+
230
+ purged.append(key)
231
+
232
+ async with driver.transaction() as txn:
233
+ for key in purged:
234
+ await txn.delete(key)
235
+ await txn.commit()
236
+
237
+ logger.info("FINISH PURGING KB VECTORSETS")
238
+
239
+
145
240
  async def main():
146
241
  """
147
242
  This script will purge all knowledge boxes marked to be deleted in maindb.
@@ -153,17 +248,28 @@ async def main():
153
248
  service_name=SERVICE_NAME,
154
249
  )
155
250
  try:
251
+ purge_resources_storage_task = asyncio.create_task(
252
+ purge_deleted_resource_storage(driver, storage)
253
+ )
156
254
  await purge_kb(driver)
157
255
  await purge_kb_storage(driver, storage)
256
+ await purge_kb_vectorsets(driver, storage)
257
+ await purge_resources_storage_task
258
+ except Exception as ex: # pragma: no cover
259
+ logger.exception("Unhandled exception on purge command")
260
+ errors.capture_exception(ex)
158
261
  finally:
159
- await storage.finalize()
160
- await teardown_driver()
161
- await teardown_cluster()
262
+ try:
263
+ purge_resources_storage_task.cancel()
264
+ await storage.finalize()
265
+ await teardown_driver()
266
+ await teardown_cluster()
267
+ except Exception: # pragma: no cover
268
+ logger.exception("Error tearing down utilities on purge command")
269
+ pass
162
270
 
163
271
 
164
272
  def run() -> int: # pragma: no cover
165
273
  setup_logging()
166
-
167
- errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
168
-
274
+ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
169
275
  return asyncio.run(main())
@@ -19,11 +19,11 @@
19
19
 
20
20
  import argparse
21
21
  import asyncio
22
+ import importlib.metadata
22
23
  from dataclasses import dataclass
23
24
  from typing import Optional
24
25
 
25
- import pkg_resources
26
- from grpc.aio import AioRpcError # type: ignore
26
+ from grpc.aio import AioRpcError
27
27
 
28
28
  from nucliadb.common import datamanagers
29
29
  from nucliadb.common.cluster import manager
@@ -86,7 +86,7 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardLocation]:
86
86
  orphan_shard_ids = indexed_shards.keys() - stored_shards.keys()
87
87
  orphan_shards: dict[str, ShardLocation] = {}
88
88
  unavailable_nodes: set[str] = set()
89
- async with datamanagers.with_transaction() as txn:
89
+ async with datamanagers.with_ro_transaction() as txn:
90
90
  for shard_id in orphan_shard_ids:
91
91
  node_id = indexed_shards[shard_id].node_id
92
92
  node = manager.get_index_node(node_id) # type: ignore
@@ -99,9 +99,7 @@ async def detect_orphan_shards(driver: Driver) -> dict[str, ShardLocation]:
99
99
  # Shards with known KB ids can be checked, ignoring those coming from
100
100
  # an ongoing migration/rollover
101
101
  if kbid != UNKNOWN_KB:
102
- skip = await datamanagers.rollover.is_rollover_shard(
103
- txn, kbid=kbid, shard_id=shard_id
104
- )
102
+ skip = await datamanagers.rollover.is_rollover_shard(txn, kbid=kbid, shard_id=shard_id)
105
103
  if skip:
106
104
  continue
107
105
 
@@ -133,18 +131,14 @@ async def _get_stored_shards(driver: Driver) -> dict[str, ShardLocation]:
133
131
  try:
134
132
  kb_shards = await shards_manager.get_shards_by_kbid(kbid)
135
133
  except ShardsNotFound:
136
- logger.warning(
137
- "KB not found while looking for orphan shards", extra={"kbid": kbid}
138
- )
134
+ logger.warning("KB not found while looking for orphan shards", extra={"kbid": kbid})
139
135
  continue
140
136
  else:
141
137
  for shard_object_pb in kb_shards:
142
138
  for shard_replica_pb in shard_object_pb.replicas:
143
139
  shard_replica_id = shard_replica_pb.shard.id
144
140
  node_id = shard_replica_pb.node
145
- stored_shards[shard_replica_id] = ShardLocation(
146
- kbid=kbid, node_id=node_id
147
- )
141
+ stored_shards[shard_replica_id] = ShardLocation(kbid=kbid, node_id=node_id)
148
142
  return stored_shards
149
143
 
150
144
 
@@ -264,6 +258,6 @@ async def main():
264
258
  def run() -> int: # pragma: no cover
265
259
  setup_logging()
266
260
 
267
- errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
261
+ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
268
262
 
269
263
  return asyncio.run(main())
@@ -29,9 +29,7 @@ API_PREFIX = "api"
29
29
  class EndpointFilter(logging.Filter):
30
30
  def filter(self, record: logging.LogRecord) -> bool:
31
31
  return (
32
- record.args is not None
33
- and len(record.args) >= 3
34
- and record.args[2] not in ("/", "/metrics") # type: ignore
32
+ record.args is not None and len(record.args) >= 3 and record.args[2] not in ("/", "/metrics") # type: ignore
35
33
  )
36
34
 
37
35