nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -23,7 +23,6 @@ import uuid
23
23
  from typing import Any, Awaitable, Callable, Optional
24
24
 
25
25
  import backoff
26
- from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, TypeMessage
27
26
 
28
27
  from nucliadb.common import datamanagers
29
28
  from nucliadb.common.cluster.base import AbstractIndexNode
@@ -37,12 +36,15 @@ from nucliadb.common.cluster.exceptions import (
37
36
  ShardsNotFound,
38
37
  )
39
38
  from nucliadb.common.maindb.driver import Transaction
39
+ from nucliadb.common.nidx import NIDX_ENABLED, get_nidx, get_nidx_api_client, get_nidx_fake_node
40
40
  from nucliadb_protos import (
41
+ knowledgebox_pb2,
41
42
  nodereader_pb2,
42
43
  noderesources_pb2,
43
44
  nodewriter_pb2,
44
45
  writer_pb2,
45
46
  )
47
+ from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, NewShardRequest, TypeMessage
46
48
  from nucliadb_telemetry import errors
47
49
  from nucliadb_utils.utilities import get_indexing, get_storage
48
50
 
@@ -124,7 +126,7 @@ def remove_index_node(node_id: str, primary_id: Optional[str] = None) -> None:
124
126
  class KBShardManager:
125
127
  # TODO: move to data manager
126
128
  async def get_shards_by_kbid_inner(self, kbid: str) -> writer_pb2.Shards:
127
- async with datamanagers.with_transaction(read_only=True) as txn:
129
+ async with datamanagers.with_ro_transaction() as txn:
128
130
  result = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
129
131
  if result is None:
130
132
  # could be None because /shards doesn't exist, or beacause the
@@ -142,6 +144,8 @@ class KBShardManager:
142
144
  kbid: str,
143
145
  aw: Callable[[AbstractIndexNode, str], Awaitable[Any]],
144
146
  timeout: float,
147
+ *,
148
+ use_nidx: bool,
145
149
  use_read_replica_nodes: bool = False,
146
150
  ) -> list[Any]:
147
151
  shards = await self.get_shards_by_kbid(kbid)
@@ -149,7 +153,7 @@ class KBShardManager:
149
153
 
150
154
  for shard_obj in shards:
151
155
  node, shard_id = choose_node(
152
- shard_obj, use_read_replica_nodes=use_read_replica_nodes
156
+ shard_obj, use_nidx=use_nidx, use_read_replica_nodes=use_read_replica_nodes
153
157
  )
154
158
  if shard_id is None:
155
159
  raise ShardNotFound("Found a node but not a shard")
@@ -158,7 +162,7 @@ class KBShardManager:
158
162
 
159
163
  try:
160
164
  results = await asyncio.wait_for(
161
- asyncio.gather(*ops, return_exceptions=True), # type: ignore
165
+ asyncio.gather(*ops, return_exceptions=True),
162
166
  timeout=timeout,
163
167
  )
164
168
  except asyncio.TimeoutError as exc:
@@ -171,7 +175,7 @@ class KBShardManager:
171
175
  async def get_current_active_shard(
172
176
  self, txn: Transaction, kbid: str
173
177
  ) -> Optional[writer_pb2.ShardObject]:
174
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
178
+ kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
175
179
  if kb_shards is None:
176
180
  return None
177
181
 
@@ -195,23 +199,25 @@ class KBShardManager:
195
199
  )
196
200
  raise
197
201
 
198
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
202
+ kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
199
203
  if kb_shards is None:
200
- msg = (
201
- "Attempting to create a shard for a KB when it has no stored shards in maindb",
202
- )
204
+ msg = ("Attempting to create a shard for a KB when it has no stored shards in maindb",)
203
205
  logger.error(msg, extra={"kbid": kbid})
204
206
  raise ShardsNotFound(msg)
205
207
 
206
- existing_kb_nodes = [
207
- replica.node for shard in kb_shards.shards for replica in shard.replicas
208
- ]
208
+ existing_kb_nodes = [replica.node for shard in kb_shards.shards for replica in shard.replicas]
209
209
  nodes = sorted_primary_nodes(
210
210
  avoid_nodes=existing_kb_nodes,
211
211
  ignore_nodes=settings.drain_nodes,
212
212
  )
213
213
 
214
+ vectorsets = {
215
+ vectorset_id: vectorset_config.vectorset_index_config
216
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
217
+ }
218
+
214
219
  shard_uuid = uuid.uuid4().hex
220
+
215
221
  shard = writer_pb2.ShardObject(shard=shard_uuid, read_only=False)
216
222
  try:
217
223
  # Attempt to create configured number of replicas
@@ -228,28 +234,56 @@ class KBShardManager:
228
234
  if node is None:
229
235
  logger.error(f"Node {node_id} is not found or not available")
230
236
  continue
231
- is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
237
+
232
238
  try:
233
- shard_created = await node.new_shard(
234
- kbid,
235
- similarity=kb_shards.similarity,
236
- release_channel=kb_shards.release_channel,
237
- normalize_vectors=is_matryoshka,
239
+ if not vectorsets:
240
+ # bw/c KBs without vectorsets
241
+ is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
242
+ vector_index_config = nodewriter_pb2.VectorIndexConfig(
243
+ similarity=kb_shards.similarity,
244
+ vector_type=nodewriter_pb2.VectorType.DENSE_F32,
245
+ vector_dimension=kb_shards.model.vector_dimension,
246
+ normalize_vectors=is_matryoshka,
247
+ )
248
+
249
+ shard_created = await node.new_shard(
250
+ kbid,
251
+ vector_index_config=vector_index_config,
252
+ )
253
+
254
+ else:
255
+ shard_created = await node.new_shard_with_vectorsets(
256
+ kbid,
257
+ vectorsets_configs=vectorsets,
258
+ )
259
+
260
+ except Exception as exc:
261
+ errors.capture_exception(exc)
262
+ logger.exception(
263
+ f"Error creating new shard for KB", extra={"kbid": kbid, "node_id": node}
238
264
  )
239
- except Exception as e:
240
- errors.capture_exception(e)
241
- logger.exception(f"Error creating new shard at {node}: {e}")
242
265
  continue
243
266
 
244
267
  replica = writer_pb2.ShardReplica(node=str(node_id))
245
268
  replica.shard.CopyFrom(shard_created)
246
269
  shard.replicas.append(replica)
247
270
  replicas_created += 1
248
- except Exception as e:
249
- errors.capture_exception(e)
250
- logger.error(f"Unexpected error creating new shard: {e}")
271
+
272
+ nidx_api = get_nidx_api_client()
273
+ if nidx_api:
274
+ req = NewShardRequest(
275
+ kbid=kbid,
276
+ vectorsets_configs=vectorsets,
277
+ )
278
+
279
+ resp = await nidx_api.NewShard(req) # type: ignore
280
+ shard.nidx_shard_id = resp.id
281
+
282
+ except Exception as exc:
283
+ errors.capture_exception(exc)
284
+ logger.exception(f"Unexpected error creating new shard for KB", extra={"kbid": kbid})
251
285
  await self.rollback_shard(shard)
252
- raise e
286
+ raise exc
253
287
 
254
288
  # set previous shard as read only, we only have one writable shard at a
255
289
  # time
@@ -284,6 +318,17 @@ class KBShardManager:
284
318
  exc_info=True,
285
319
  )
286
320
 
321
+ nidx_api = get_nidx_api_client()
322
+ if nidx_api and shard.nidx_shard_id:
323
+ try:
324
+ await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
325
+ except Exception as rollback_error:
326
+ errors.capture_exception(rollback_error)
327
+ logger.error(
328
+ f"New shard rollback error. Nidx Shard: {shard.nidx_shard_id}",
329
+ exc_info=True,
330
+ )
331
+
287
332
  def indexing_replicas(self, shard: writer_pb2.ShardObject) -> list[tuple[str, str]]:
288
333
  """
289
334
  Returns the replica ids and nodes for the shard replicas
@@ -303,10 +348,9 @@ class KBShardManager:
303
348
  ) -> None:
304
349
  indexing = get_indexing()
305
350
  storage = await get_storage()
351
+ nidx = get_nidx()
306
352
 
307
- await storage.delete_indexing(
308
- resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard
309
- )
353
+ await storage.delete_indexing(resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard)
310
354
 
311
355
  for replica_id, node_id in self.indexing_replicas(shard):
312
356
  indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
@@ -319,6 +363,13 @@ class KBShardManager:
319
363
  indexpb.kbid = kb
320
364
  await indexing.index(indexpb, node_id)
321
365
 
366
+ if nidx is not None and shard.nidx_shard_id:
367
+ nidxpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
368
+ nidxpb.shard = shard.nidx_shard_id
369
+ nidxpb.resource = uuid
370
+ nidxpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
371
+ await nidx.index(nidxpb)
372
+
322
373
  async def add_resource(
323
374
  self,
324
375
  shard: writer_pb2.ShardObject,
@@ -339,7 +390,7 @@ class KBShardManager:
339
390
 
340
391
  storage = await get_storage()
341
392
  indexing = get_indexing()
342
-
393
+ nidx = get_nidx()
343
394
  indexpb = IndexMessage()
344
395
 
345
396
  if reindex_id is not None:
@@ -366,6 +417,10 @@ class KBShardManager:
366
417
  indexpb.shard = replica_id
367
418
  await indexing.index(indexpb, node_id)
368
419
 
420
+ if nidx is not None and shard.nidx_shard_id:
421
+ indexpb.shard = shard.nidx_shard_id
422
+ await nidx.index(indexpb)
423
+
369
424
  def should_create_new_shard(self, num_paragraphs: int) -> bool:
370
425
  return num_paragraphs > settings.max_shard_paragraphs
371
426
 
@@ -383,6 +438,44 @@ class KBShardManager:
383
438
  await self.create_shard_by_kbid(txn, kbid)
384
439
  await txn.commit()
385
440
 
441
+ async def create_vectorset(self, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
442
+ """Create a new vectorset in all KB shards."""
443
+
444
+ async def _create_vectorset(node: AbstractIndexNode, shard_id: str):
445
+ vectorset_id = config.vectorset_id
446
+ index_config = config.vectorset_index_config
447
+ result = await node.add_vectorset(shard_id, vectorset_id, index_config)
448
+ if result.status != result.Status.OK:
449
+ raise NodeError(
450
+ f"Unable to create vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
451
+ )
452
+
453
+ await self.apply_for_all_shards(
454
+ kbid, _create_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
455
+ )
456
+ if NIDX_ENABLED:
457
+ await self.apply_for_all_shards(
458
+ kbid, _create_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
459
+ )
460
+
461
+ async def delete_vectorset(self, kbid: str, vectorset_id: str):
462
+ """Delete a vectorset from all KB shards"""
463
+
464
+ async def _delete_vectorset(node: AbstractIndexNode, shard_id: str):
465
+ result = await node.remove_vectorset(shard_id, vectorset_id)
466
+ if result.status != result.Status.OK:
467
+ raise NodeError(
468
+ f"Unable to delete vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
469
+ )
470
+
471
+ await self.apply_for_all_shards(
472
+ kbid, _delete_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
473
+ )
474
+ if NIDX_ENABLED:
475
+ await self.apply_for_all_shards(
476
+ kbid, _delete_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
477
+ )
478
+
386
479
 
387
480
  class StandaloneKBShardManager(KBShardManager):
388
481
  max_ops_before_checks = 200
@@ -390,11 +483,9 @@ class StandaloneKBShardManager(KBShardManager):
390
483
  def __init__(self):
391
484
  super().__init__()
392
485
  self._lock = asyncio.Lock()
393
- self._change_count: dict[tuple[str, str], int] = {} # type: ignore
486
+ self._change_count: dict[tuple[str, str], int] = {}
394
487
 
395
- async def _resource_change_event(
396
- self, kbid: str, node_id: str, shard_id: str
397
- ) -> None:
488
+ async def _resource_change_event(self, kbid: str, node_id: str, shard_id: str) -> None:
398
489
  if (node_id, shard_id) not in self._change_count:
399
490
  self._change_count[(node_id, shard_id)] = 0
400
491
  self._change_count[(node_id, shard_id)] += 1
@@ -407,17 +498,15 @@ class StandaloneKBShardManager(KBShardManager):
407
498
  if index_node is None:
408
499
  return
409
500
  shard_info: noderesources_pb2.Shard = await index_node.reader.GetShard(
410
- nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
501
+ nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
411
502
  )
412
503
  await self.maybe_create_new_shard(
413
504
  kbid,
414
505
  shard_info.paragraphs,
415
506
  )
416
- await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id)) # type: ignore
507
+ await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
417
508
 
418
- @backoff.on_exception(
419
- backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5
420
- )
509
+ @backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
421
510
  async def delete_resource(
422
511
  self,
423
512
  shard: writer_pb2.ShardObject,
@@ -433,19 +522,21 @@ class StandaloneKBShardManager(KBShardManager):
433
522
  req.shard_id = shardreplica.shard.id
434
523
  index_node = get_index_node(shardreplica.node)
435
524
  if index_node is None: # pragma: no cover
436
- raise NodesUnsync(
437
- f"Node {shardreplica.node} is not found or not available"
438
- )
525
+ raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
439
526
  await index_node.writer.RemoveResource(req) # type: ignore
440
527
  asyncio.create_task(
441
- self._resource_change_event(
442
- kb, shardreplica.node, shardreplica.shard.id
443
- )
528
+ self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
444
529
  )
445
530
 
446
- @backoff.on_exception(
447
- backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5
448
- )
531
+ nidx = get_nidx()
532
+ if nidx is not None and shard.nidx_shard_id:
533
+ indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
534
+ indexpb.shard = shard.nidx_shard_id
535
+ indexpb.resource = uuid
536
+ indexpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
537
+ await nidx.index(indexpb)
538
+
539
+ @backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
449
540
  async def add_resource(
450
541
  self,
451
542
  shard: writer_pb2.ShardObject,
@@ -465,16 +556,36 @@ class StandaloneKBShardManager(KBShardManager):
465
556
  resource.shard_id = resource.resource.shard_id = shardreplica.shard.id
466
557
  index_node = get_index_node(shardreplica.node)
467
558
  if index_node is None: # pragma: no cover
468
- raise NodesUnsync(
469
- f"Node {shardreplica.node} is not found or not available"
470
- )
559
+ raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
471
560
  await index_node.writer.SetResource(resource) # type: ignore
472
561
  asyncio.create_task(
473
- self._resource_change_event(
474
- kb, shardreplica.node, shardreplica.shard.id
475
- )
562
+ self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
476
563
  )
477
564
 
565
+ nidx = get_nidx()
566
+ if nidx is not None and shard.nidx_shard_id:
567
+ storage = await get_storage()
568
+ indexpb = IndexMessage()
569
+ storage_key = await storage.indexing(
570
+ resource, txid, partition, kb=kb, logical_shard=shard.shard
571
+ )
572
+
573
+ indexpb.typemessage = TypeMessage.CREATION
574
+ indexpb.storage_key = storage_key
575
+ indexpb.kbid = kb
576
+ indexpb.source = source
577
+ indexpb.resource = resource.resource.uuid
578
+ indexpb.shard = shard.nidx_shard_id
579
+
580
+ await nidx.index(indexpb)
581
+
582
+ # Delete indexing message (no longer needed)
583
+ try:
584
+ if storage.indexing_bucket:
585
+ await storage.delete_upload(storage_key, storage.indexing_bucket)
586
+ except Exception:
587
+ pass
588
+
478
589
 
479
590
  def get_all_shard_nodes(
480
591
  shard: writer_pb2.ShardObject,
@@ -506,6 +617,7 @@ def get_all_shard_nodes(
506
617
  def choose_node(
507
618
  shard: writer_pb2.ShardObject,
508
619
  *,
620
+ use_nidx: bool,
509
621
  target_shard_replicas: Optional[list[str]] = None,
510
622
  use_read_replica_nodes: bool = False,
511
623
  ) -> tuple[AbstractIndexNode, str]:
@@ -521,6 +633,13 @@ def choose_node(
521
633
  `target_shard_replicas` is the least preferent.
522
634
 
523
635
  """
636
+
637
+ # Use nidx if requested and enabled, fallback to node
638
+ if shard.nidx_shard_id and use_nidx:
639
+ fake_node = get_nidx_fake_node()
640
+ if fake_node:
641
+ return fake_node, shard.nidx_shard_id
642
+
524
643
  target_shard_replicas = target_shard_replicas or []
525
644
 
526
645
  shard_nodes = get_all_shard_nodes(shard, use_read_replicas=use_read_replica_nodes)
@@ -564,9 +683,7 @@ def check_enough_nodes():
564
683
  )
565
684
  if settings.max_node_replicas >= 0:
566
685
  available_nodes = list(
567
- filter(
568
- lambda n: n.shard_count < settings.max_node_replicas, available_nodes # type: ignore
569
- )
686
+ filter(lambda n: n.shard_count < settings.max_node_replicas, available_nodes)
570
687
  )
571
688
  if len(available_nodes) < target_replicas:
572
689
  raise NodeClusterSmall(
@@ -598,9 +715,7 @@ def sorted_primary_nodes(
598
715
  preferred_nodes = [nid for nid in available_node_ids if nid not in avoid_nodes]
599
716
 
600
717
  # Add avoid_nodes to the end of the last nodes
601
- result_nodes = preferred_nodes + [
602
- nid for nid in available_node_ids if nid not in preferred_nodes
603
- ]
718
+ result_nodes = preferred_nodes + [nid for nid in available_node_ids if nid not in preferred_nodes]
604
719
 
605
720
  # Remove ignore_nodes from the list
606
721
  result_nodes = [nid for nid in result_nodes if nid not in ignore_nodes]
@@ -25,6 +25,7 @@ from nucliadb.common.cluster.manager import choose_node
25
25
  from nucliadb.common.cluster.utils import get_shard_manager
26
26
  from nucliadb.common.context import ApplicationContext
27
27
  from nucliadb_protos import nodereader_pb2, noderesources_pb2
28
+ from nucliadb_telemetry import errors
28
29
  from nucliadb_telemetry.logs import setup_logging
29
30
  from nucliadb_telemetry.utils import setup_telemetry
30
31
  from nucliadb_utils import const
@@ -43,28 +44,26 @@ async def get_shards_paragraphs(kbid: str) -> list[tuple[str, int]]:
43
44
  """
44
45
  Ordered shard -> num paragraph by number of paragraphs
45
46
  """
46
- async with datamanagers.with_transaction(read_only=True) as txn:
47
+ async with datamanagers.with_ro_transaction() as txn:
47
48
  kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
48
49
  if kb_shards is None:
49
50
  return []
50
51
 
51
52
  results = {}
52
53
  for shard_meta in kb_shards.shards:
53
- node, shard_id = choose_node(shard_meta)
54
+ # Rebalance using node as source of truth. But it will rebalance nidx
55
+ node, shard_id = choose_node(shard_meta, use_nidx=False)
54
56
  shard_data: nodereader_pb2.Shard = await node.reader.GetShard(
55
57
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
56
58
  )
57
59
  results[shard_meta.shard] = shard_data.paragraphs
58
60
 
59
- return [
60
- (shard, paragraphs)
61
- for shard, paragraphs in sorted(results.items(), key=lambda x: x[1])
62
- ]
61
+ return [(shard, paragraphs) for shard, paragraphs in sorted(results.items(), key=lambda x: x[1])]
63
62
 
64
63
 
65
64
  async def maybe_add_shard(kbid: str) -> None:
66
65
  async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
67
- async with datamanagers.with_transaction(read_only=True) as txn:
66
+ async with datamanagers.with_ro_transaction() as txn:
68
67
  kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
69
68
  if kb_shards is None:
70
69
  return
@@ -89,12 +88,10 @@ async def move_set_of_kb_resources(
89
88
  to_shard_id: str,
90
89
  count: int = 20,
91
90
  ) -> None:
92
- async with datamanagers.with_transaction() as txn:
91
+ async with datamanagers.with_ro_transaction() as txn:
93
92
  kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
94
93
  if kb_shards is None: # pragma: no cover
95
- logger.warning(
96
- "No shards found for kb. This should not happen.", extra={"kbid": kbid}
97
- )
94
+ logger.warning("No shards found for kb. This should not happen.", extra={"kbid": kbid})
98
95
  return
99
96
 
100
97
  logger.info(
@@ -105,7 +102,7 @@ async def move_set_of_kb_resources(
105
102
  from_shard = [s for s in kb_shards.shards if s.shard == from_shard_id][0]
106
103
  to_shard = [s for s in kb_shards.shards if s.shard == to_shard_id][0]
107
104
 
108
- from_node, from_shard_replica_id = choose_node(from_shard)
105
+ from_node, from_shard_replica_id = choose_node(from_shard, use_nidx=False)
109
106
  search_response: nodereader_pb2.SearchResponse = await from_node.reader.Search( # type: ignore
110
107
  nodereader_pb2.SearchRequest(
111
108
  shard=from_shard_replica_id,
@@ -122,13 +119,11 @@ async def move_set_of_kb_resources(
122
119
  async with (
123
120
  datamanagers.with_transaction() as txn,
124
121
  locking.distributed_lock(
125
- locking.RESOURCE_INDEX_LOCK.format(
126
- kbid=kbid, resource_id=resource_id
127
- )
122
+ locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=resource_id)
128
123
  ),
129
124
  ):
130
125
  found_shard_id = await datamanagers.resources.get_resource_shard_id(
131
- txn, kbid=kbid, rid=resource_id
126
+ txn, kbid=kbid, rid=resource_id, for_update=True
132
127
  )
133
128
  if found_shard_id is None:
134
129
  # resource deleted
@@ -175,9 +170,7 @@ async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
175
170
 
176
171
  shard_paragraphs = await get_shards_paragraphs(kbid)
177
172
  rebalanced_shards = set()
178
- while any(
179
- paragraphs > settings.max_shard_paragraphs for _, paragraphs in shard_paragraphs
180
- ):
173
+ while any(paragraphs > settings.max_shard_paragraphs for _, paragraphs in shard_paragraphs):
181
174
  # find the shard with the least/most paragraphs
182
175
  smallest_shard = shard_paragraphs[0][0]
183
176
  largest_shard = shard_paragraphs[-1][0]
@@ -198,13 +191,13 @@ async def rebalance_kb(context: ApplicationContext, kbid: str) -> None:
198
191
  async def run(context: ApplicationContext) -> None:
199
192
  try:
200
193
  async with locking.distributed_lock(REBALANCE_LOCK):
194
+ # get all kb ids
195
+ async with datamanagers.with_ro_transaction() as txn:
196
+ kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
201
197
  # go through each kb and see if shards need to be reduced in size
202
- async with datamanagers.with_transaction() as txn:
203
- async for kbid, _ in datamanagers.kb.get_kbs(txn):
204
- async with locking.distributed_lock(
205
- locking.KB_SHARDS_LOCK.format(kbid=kbid)
206
- ):
207
- await rebalance_kb(context, kbid)
198
+ for kbid in kbids:
199
+ async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
200
+ await rebalance_kb(context, kbid)
208
201
  except locking.ResourceLocked as exc:
209
202
  if exc.key == REBALANCE_LOCK:
210
203
  logger.warning("Another rebalance process is already running.")
@@ -222,13 +215,18 @@ async def run_command(context: ApplicationContext) -> None:
222
215
 
223
216
  try:
224
217
  await run(context)
225
- except (asyncio.CancelledError, RuntimeError):
218
+ except (asyncio.CancelledError, RuntimeError): # pragma: no cover
226
219
  return
227
- except Exception:
220
+ except Exception as ex: # pragma: no cover
228
221
  logger.exception("Failed to run rebalancing.")
222
+ errors.capture_exception(ex)
229
223
  finally:
230
- await context.finalize()
231
- await metrics_server.shutdown()
224
+ try:
225
+ await metrics_server.shutdown()
226
+ await context.finalize()
227
+ except Exception: # pragma: no cover
228
+ logger.exception("Error tearing down utilities on rebalance command")
229
+ pass
232
230
 
233
231
 
234
232
  def main():