nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,13 +19,10 @@
19
19
  #
20
20
  import asyncio
21
21
  import logging
22
- import random
23
22
  import uuid
24
23
  from typing import Any, Awaitable, Callable, Optional
25
24
 
26
25
  import backoff
27
- from nucliadb_protos.knowledgebox_pb2 import SemanticModelMetadata # type: ignore
28
- from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, TypeMessage
29
26
 
30
27
  from nucliadb.common import datamanagers
31
28
  from nucliadb.common.cluster.base import AbstractIndexNode
@@ -39,13 +36,15 @@ from nucliadb.common.cluster.exceptions import (
39
36
  ShardsNotFound,
40
37
  )
41
38
  from nucliadb.common.maindb.driver import Transaction
39
+ from nucliadb.common.nidx import NIDX_ENABLED, get_nidx, get_nidx_api_client, get_nidx_fake_node
42
40
  from nucliadb_protos import (
41
+ knowledgebox_pb2,
43
42
  nodereader_pb2,
44
43
  noderesources_pb2,
45
44
  nodewriter_pb2,
46
- utils_pb2,
47
45
  writer_pb2,
48
46
  )
47
+ from nucliadb_protos.nodewriter_pb2 import IndexMessage, IndexMessageSource, NewShardRequest, TypeMessage
49
48
  from nucliadb_telemetry import errors
50
49
  from nucliadb_utils.utilities import get_indexing, get_storage
51
50
 
@@ -71,6 +70,11 @@ def get_index_node(node_id: str) -> Optional[AbstractIndexNode]:
71
70
  return INDEX_NODES.get(node_id)
72
71
 
73
72
 
73
+ def clear_index_nodes():
74
+ INDEX_NODES.clear()
75
+ READ_REPLICA_INDEX_NODES.clear()
76
+
77
+
74
78
  def get_read_replica_node_ids(node_id: str) -> list[str]:
75
79
  return list(READ_REPLICA_INDEX_NODES.get(node_id, set()))
76
80
 
@@ -122,7 +126,7 @@ def remove_index_node(node_id: str, primary_id: Optional[str] = None) -> None:
122
126
  class KBShardManager:
123
127
  # TODO: move to data manager
124
128
  async def get_shards_by_kbid_inner(self, kbid: str) -> writer_pb2.Shards:
125
- async with datamanagers.with_transaction(read_only=True) as txn:
129
+ async with datamanagers.with_ro_transaction() as txn:
126
130
  result = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
127
131
  if result is None:
128
132
  # could be None because /shards doesn't exist, or because the
@@ -140,6 +144,8 @@ class KBShardManager:
140
144
  kbid: str,
141
145
  aw: Callable[[AbstractIndexNode, str], Awaitable[Any]],
142
146
  timeout: float,
147
+ *,
148
+ use_nidx: bool,
143
149
  use_read_replica_nodes: bool = False,
144
150
  ) -> list[Any]:
145
151
  shards = await self.get_shards_by_kbid(kbid)
@@ -147,7 +153,7 @@ class KBShardManager:
147
153
 
148
154
  for shard_obj in shards:
149
155
  node, shard_id = choose_node(
150
- shard_obj, use_read_replica_nodes=use_read_replica_nodes
156
+ shard_obj, use_nidx=use_nidx, use_read_replica_nodes=use_read_replica_nodes
151
157
  )
152
158
  if shard_id is None:
153
159
  raise ShardNotFound("Found a node but not a shard")
@@ -156,7 +162,7 @@ class KBShardManager:
156
162
 
157
163
  try:
158
164
  results = await asyncio.wait_for(
159
- asyncio.gather(*ops, return_exceptions=True), # type: ignore
165
+ asyncio.gather(*ops, return_exceptions=True),
160
166
  timeout=timeout,
161
167
  )
162
168
  except asyncio.TimeoutError as exc:
@@ -169,7 +175,7 @@ class KBShardManager:
169
175
  async def get_current_active_shard(
170
176
  self, txn: Transaction, kbid: str
171
177
  ) -> Optional[writer_pb2.ShardObject]:
172
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
178
+ kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
173
179
  if kb_shards is None:
174
180
  return None
175
181
 
@@ -183,8 +189,6 @@ class KBShardManager:
183
189
  self,
184
190
  txn: Transaction,
185
191
  kbid: str,
186
- semantic_model: SemanticModelMetadata,
187
- release_channel: utils_pb2.ReleaseChannel.ValueType,
188
192
  ) -> writer_pb2.ShardObject:
189
193
  try:
190
194
  check_enough_nodes()
@@ -195,26 +199,25 @@ class KBShardManager:
195
199
  )
196
200
  raise
197
201
 
198
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
202
+ kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
199
203
  if kb_shards is None:
200
- # First logic shard on the index
201
- kb_shards = writer_pb2.Shards()
202
- kb_shards.kbid = kbid
203
- # B/c with Shards.actual
204
- kb_shards.actual = -1
205
- kb_shards.similarity = semantic_model.similarity_function
206
- kb_shards.model.CopyFrom(semantic_model)
207
- else:
208
- # New logic shard on an existing index
209
- pass
204
+ msg = ("Attempting to create a shard for a KB when it has no stored shards in maindb",)
205
+ logger.error(msg, extra={"kbid": kbid})
206
+ raise ShardsNotFound(msg)
207
+
208
+ existing_kb_nodes = [replica.node for shard in kb_shards.shards for replica in shard.replicas]
209
+ nodes = sorted_primary_nodes(
210
+ avoid_nodes=existing_kb_nodes,
211
+ ignore_nodes=settings.drain_nodes,
212
+ )
210
213
 
211
- kb_shards.release_channel = release_channel
212
- existing_kb_nodes = [
213
- replica.node for shard in kb_shards.shards for replica in shard.replicas
214
- ]
215
- nodes = sorted_primary_nodes(avoid_nodes=existing_kb_nodes)
214
+ vectorsets = {
215
+ vectorset_id: vectorset_config.vectorset_index_config
216
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(txn, kbid=kbid)
217
+ }
216
218
 
217
219
  shard_uuid = uuid.uuid4().hex
220
+
218
221
  shard = writer_pb2.ShardObject(shard=shard_uuid, read_only=False)
219
222
  try:
220
223
  # Attempt to create configured number of replicas
@@ -231,26 +234,56 @@ class KBShardManager:
231
234
  if node is None:
232
235
  logger.error(f"Node {node_id} is not found or not available")
233
236
  continue
237
+
234
238
  try:
235
- shard_created = await node.new_shard(
236
- kbid,
237
- similarity=kb_shards.similarity,
238
- release_channel=kb_shards.release_channel,
239
+ if not vectorsets:
240
+ # bw/c KBs without vectorsets
241
+ is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
242
+ vector_index_config = nodewriter_pb2.VectorIndexConfig(
243
+ similarity=kb_shards.similarity,
244
+ vector_type=nodewriter_pb2.VectorType.DENSE_F32,
245
+ vector_dimension=kb_shards.model.vector_dimension,
246
+ normalize_vectors=is_matryoshka,
247
+ )
248
+
249
+ shard_created = await node.new_shard(
250
+ kbid,
251
+ vector_index_config=vector_index_config,
252
+ )
253
+
254
+ else:
255
+ shard_created = await node.new_shard_with_vectorsets(
256
+ kbid,
257
+ vectorsets_configs=vectorsets,
258
+ )
259
+
260
+ except Exception as exc:
261
+ errors.capture_exception(exc)
262
+ logger.exception(
263
+ f"Error creating new shard for KB", extra={"kbid": kbid, "node_id": node}
239
264
  )
240
- except Exception as e:
241
- errors.capture_exception(e)
242
- logger.exception(f"Error creating new shard at {node}: {e}")
243
265
  continue
244
266
 
245
267
  replica = writer_pb2.ShardReplica(node=str(node_id))
246
268
  replica.shard.CopyFrom(shard_created)
247
269
  shard.replicas.append(replica)
248
270
  replicas_created += 1
249
- except Exception as e:
250
- errors.capture_exception(e)
251
- logger.error(f"Unexpected error creating new shard: {e}")
271
+
272
+ nidx_api = get_nidx_api_client()
273
+ if nidx_api:
274
+ req = NewShardRequest(
275
+ kbid=kbid,
276
+ vectorsets_configs=vectorsets,
277
+ )
278
+
279
+ resp = await nidx_api.NewShard(req) # type: ignore
280
+ shard.nidx_shard_id = resp.id
281
+
282
+ except Exception as exc:
283
+ errors.capture_exception(exc)
284
+ logger.exception(f"Unexpected error creating new shard for KB", extra={"kbid": kbid})
252
285
  await self.rollback_shard(shard)
253
- raise e
286
+ raise exc
254
287
 
255
288
  # set previous shard as read only, we only have one writable shard at a
256
289
  # time
@@ -259,8 +292,8 @@ class KBShardManager:
259
292
 
260
293
  # Append the created shard and make `actual` point to it.
261
294
  kb_shards.shards.append(shard)
262
- # B/c with Shards.actual
263
- kb_shards.actual += 1
295
+ # B/c with Shards.actual - we only use last created shard
296
+ kb_shards.actual = len(kb_shards.shards) - 1
264
297
 
265
298
  await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
266
299
 
@@ -273,7 +306,7 @@ class KBShardManager:
273
306
  node = get_index_node(node_id)
274
307
  if node is not None:
275
308
  try:
276
- logger.warning(
309
+ logger.info(
277
310
  "Deleting shard replica",
278
311
  extra={"shard": replica_id, "node": node_id},
279
312
  )
@@ -285,6 +318,17 @@ class KBShardManager:
285
318
  exc_info=True,
286
319
  )
287
320
 
321
+ nidx_api = get_nidx_api_client()
322
+ if nidx_api and shard.nidx_shard_id:
323
+ try:
324
+ await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))
325
+ except Exception as rollback_error:
326
+ errors.capture_exception(rollback_error)
327
+ logger.error(
328
+ f"New shard rollback error. Nidx Shard: {shard.nidx_shard_id}",
329
+ exc_info=True,
330
+ )
331
+
288
332
  def indexing_replicas(self, shard: writer_pb2.ShardObject) -> list[tuple[str, str]]:
289
333
  """
290
334
  Returns the replica ids and nodes for the shard replicas
@@ -304,10 +348,9 @@ class KBShardManager:
304
348
  ) -> None:
305
349
  indexing = get_indexing()
306
350
  storage = await get_storage()
351
+ nidx = get_nidx()
307
352
 
308
- await storage.delete_indexing(
309
- resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard
310
- )
353
+ await storage.delete_indexing(resource_uid=uuid, txid=txid, kb=kb, logical_shard=shard.shard)
311
354
 
312
355
  for replica_id, node_id in self.indexing_replicas(shard):
313
356
  indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
@@ -320,6 +363,13 @@ class KBShardManager:
320
363
  indexpb.kbid = kb
321
364
  await indexing.index(indexpb, node_id)
322
365
 
366
+ if nidx is not None and shard.nidx_shard_id:
367
+ nidxpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
368
+ nidxpb.shard = shard.nidx_shard_id
369
+ nidxpb.resource = uuid
370
+ nidxpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
371
+ await nidx.index(nidxpb)
372
+
323
373
  async def add_resource(
324
374
  self,
325
375
  shard: writer_pb2.ShardObject,
@@ -330,6 +380,9 @@ class KBShardManager:
330
380
  reindex_id: Optional[str] = None,
331
381
  source: IndexMessageSource.ValueType = IndexMessageSource.PROCESSOR,
332
382
  ) -> None:
383
+ """
384
+ Stores the Resource object in the object storage and sends an IndexMessage to the indexing Nats stream.
385
+ """
333
386
  if txid == -1 and reindex_id is None:
334
387
  # This means we are injecting a complete resource via ingest gRPC
335
388
  # outside of a transaction. We need to treat this as a reindex operation.
@@ -337,7 +390,7 @@ class KBShardManager:
337
390
 
338
391
  storage = await get_storage()
339
392
  indexing = get_indexing()
340
-
393
+ nidx = get_nidx()
341
394
  indexpb = IndexMessage()
342
395
 
343
396
  if reindex_id is not None:
@@ -364,34 +417,65 @@ class KBShardManager:
364
417
  indexpb.shard = replica_id
365
418
  await indexing.index(indexpb, node_id)
366
419
 
367
- def should_create_new_shard(self, num_paragraphs: int, num_fields: int) -> bool:
368
- return (
369
- num_paragraphs > settings.max_shard_paragraphs
370
- or num_fields > settings.max_shard_fields
371
- )
420
+ if nidx is not None and shard.nidx_shard_id:
421
+ indexpb.shard = shard.nidx_shard_id
422
+ await nidx.index(indexpb)
423
+
424
+ def should_create_new_shard(self, num_paragraphs: int) -> bool:
425
+ return num_paragraphs > settings.max_shard_paragraphs
372
426
 
373
427
  async def maybe_create_new_shard(
374
428
  self,
375
429
  kbid: str,
376
430
  num_paragraphs: int,
377
- num_fields: int,
378
- release_channel: utils_pb2.ReleaseChannel.ValueType = utils_pb2.ReleaseChannel.STABLE,
379
431
  ):
380
- if not self.should_create_new_shard(num_paragraphs, num_fields):
432
+ if not self.should_create_new_shard(num_paragraphs):
381
433
  return
382
434
 
383
- logger.warning({"message": "Adding shard", "kbid": kbid})
435
+ logger.info({"message": "Adding shard", "kbid": kbid})
384
436
 
385
437
  async with datamanagers.with_transaction() as txn:
386
- model = await datamanagers.kb.get_model_metadata(txn, kbid=kbid)
387
- await self.create_shard_by_kbid(
388
- txn,
389
- kbid,
390
- semantic_model=model,
391
- release_channel=release_channel,
392
- )
438
+ await self.create_shard_by_kbid(txn, kbid)
393
439
  await txn.commit()
394
440
 
441
+ async def create_vectorset(self, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
442
+ """Create a new vectorset in all KB shards."""
443
+
444
+ async def _create_vectorset(node: AbstractIndexNode, shard_id: str):
445
+ vectorset_id = config.vectorset_id
446
+ index_config = config.vectorset_index_config
447
+ result = await node.add_vectorset(shard_id, vectorset_id, index_config)
448
+ if result.status != result.Status.OK:
449
+ raise NodeError(
450
+ f"Unable to create vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
451
+ )
452
+
453
+ await self.apply_for_all_shards(
454
+ kbid, _create_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
455
+ )
456
+ if NIDX_ENABLED:
457
+ await self.apply_for_all_shards(
458
+ kbid, _create_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
459
+ )
460
+
461
+ async def delete_vectorset(self, kbid: str, vectorset_id: str):
462
+ """Delete a vectorset from all KB shards"""
463
+
464
+ async def _delete_vectorset(node: AbstractIndexNode, shard_id: str):
465
+ result = await node.remove_vectorset(shard_id, vectorset_id)
466
+ if result.status != result.Status.OK:
467
+ raise NodeError(
468
+ f"Unable to delete vectorset {vectorset_id} in kb {kbid} shard {shard_id}"
469
+ )
470
+
471
+ await self.apply_for_all_shards(
472
+ kbid, _delete_vectorset, timeout=10, use_nidx=False, use_read_replica_nodes=False
473
+ )
474
+ if NIDX_ENABLED:
475
+ await self.apply_for_all_shards(
476
+ kbid, _delete_vectorset, timeout=10, use_nidx=True, use_read_replica_nodes=False
477
+ )
478
+
395
479
 
396
480
  class StandaloneKBShardManager(KBShardManager):
397
481
  max_ops_before_checks = 200
@@ -399,11 +483,9 @@ class StandaloneKBShardManager(KBShardManager):
399
483
  def __init__(self):
400
484
  super().__init__()
401
485
  self._lock = asyncio.Lock()
402
- self._change_count: dict[tuple[str, str], int] = {} # type: ignore
486
+ self._change_count: dict[tuple[str, str], int] = {}
403
487
 
404
- async def _resource_change_event(
405
- self, kbid: str, node_id: str, shard_id: str
406
- ) -> None:
488
+ async def _resource_change_event(self, kbid: str, node_id: str, shard_id: str) -> None:
407
489
  if (node_id, shard_id) not in self._change_count:
408
490
  self._change_count[(node_id, shard_id)] = 0
409
491
  self._change_count[(node_id, shard_id)] += 1
@@ -416,19 +498,15 @@ class StandaloneKBShardManager(KBShardManager):
416
498
  if index_node is None:
417
499
  return
418
500
  shard_info: noderesources_pb2.Shard = await index_node.reader.GetShard(
419
- nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
501
+ nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))
420
502
  )
421
503
  await self.maybe_create_new_shard(
422
504
  kbid,
423
505
  shard_info.paragraphs,
424
- shard_info.fields,
425
- shard_info.metadata.release_channel,
426
506
  )
427
- await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id)) # type: ignore
507
+ await index_node.writer.GC(noderesources_pb2.ShardId(id=shard_id))
428
508
 
429
- @backoff.on_exception(
430
- backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5
431
- )
509
+ @backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
432
510
  async def delete_resource(
433
511
  self,
434
512
  shard: writer_pb2.ShardObject,
@@ -444,19 +522,21 @@ class StandaloneKBShardManager(KBShardManager):
444
522
  req.shard_id = shardreplica.shard.id
445
523
  index_node = get_index_node(shardreplica.node)
446
524
  if index_node is None: # pragma: no cover
447
- raise NodesUnsync(
448
- f"Node {shardreplica.node} is not found or not available"
449
- )
525
+ raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
450
526
  await index_node.writer.RemoveResource(req) # type: ignore
451
527
  asyncio.create_task(
452
- self._resource_change_event(
453
- kb, shardreplica.node, shardreplica.shard.id
454
- )
528
+ self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
455
529
  )
456
530
 
457
- @backoff.on_exception(
458
- backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5
459
- )
531
+ nidx = get_nidx()
532
+ if nidx is not None and shard.nidx_shard_id:
533
+ indexpb: nodewriter_pb2.IndexMessage = nodewriter_pb2.IndexMessage()
534
+ indexpb.shard = shard.nidx_shard_id
535
+ indexpb.resource = uuid
536
+ indexpb.typemessage = nodewriter_pb2.TypeMessage.DELETION
537
+ await nidx.index(indexpb)
538
+
539
+ @backoff.on_exception(backoff.expo, NodesUnsync, jitter=backoff.random_jitter, max_tries=5)
460
540
  async def add_resource(
461
541
  self,
462
542
  shard: writer_pb2.ShardObject,
@@ -467,21 +547,45 @@ class StandaloneKBShardManager(KBShardManager):
467
547
  reindex_id: Optional[str] = None,
468
548
  source: IndexMessageSource.ValueType = IndexMessageSource.PROCESSOR,
469
549
  ) -> None:
550
+ """
551
+ Calls the node writer's SetResource method directly to store the resource in the node.
552
+ There is no queuing for standalone nodes at the moment -- indexing is done synchronously.
553
+ """
470
554
  index_node = None
471
555
  for shardreplica in shard.replicas:
472
556
  resource.shard_id = resource.resource.shard_id = shardreplica.shard.id
473
557
  index_node = get_index_node(shardreplica.node)
474
558
  if index_node is None: # pragma: no cover
475
- raise NodesUnsync(
476
- f"Node {shardreplica.node} is not found or not available"
477
- )
559
+ raise NodesUnsync(f"Node {shardreplica.node} is not found or not available")
478
560
  await index_node.writer.SetResource(resource) # type: ignore
479
561
  asyncio.create_task(
480
- self._resource_change_event(
481
- kb, shardreplica.node, shardreplica.shard.id
482
- )
562
+ self._resource_change_event(kb, shardreplica.node, shardreplica.shard.id)
563
+ )
564
+
565
+ nidx = get_nidx()
566
+ if nidx is not None and shard.nidx_shard_id:
567
+ storage = await get_storage()
568
+ indexpb = IndexMessage()
569
+ storage_key = await storage.indexing(
570
+ resource, txid, partition, kb=kb, logical_shard=shard.shard
483
571
  )
484
572
 
573
+ indexpb.typemessage = TypeMessage.CREATION
574
+ indexpb.storage_key = storage_key
575
+ indexpb.kbid = kb
576
+ indexpb.source = source
577
+ indexpb.resource = resource.resource.uuid
578
+ indexpb.shard = shard.nidx_shard_id
579
+
580
+ await nidx.index(indexpb)
581
+
582
+ # Delete indexing message (no longer needed)
583
+ try:
584
+ if storage.indexing_bucket:
585
+ await storage.delete_upload(storage_key, storage.indexing_bucket)
586
+ except Exception:
587
+ pass
588
+
485
589
 
486
590
  def get_all_shard_nodes(
487
591
  shard: writer_pb2.ShardObject,
@@ -513,6 +617,7 @@ def get_all_shard_nodes(
513
617
  def choose_node(
514
618
  shard: writer_pb2.ShardObject,
515
619
  *,
620
+ use_nidx: bool,
516
621
  target_shard_replicas: Optional[list[str]] = None,
517
622
  use_read_replica_nodes: bool = False,
518
623
  ) -> tuple[AbstractIndexNode, str]:
@@ -528,6 +633,13 @@ def choose_node(
528
633
  `target_shard_replicas` is the least preferent.
529
634
 
530
635
  """
636
+
637
+ # Use nidx if requested and enabled, otherwise fall back to node
638
+ if shard.nidx_shard_id and use_nidx:
639
+ fake_node = get_nidx_fake_node()
640
+ if fake_node:
641
+ return fake_node, shard.nidx_shard_id
642
+
531
643
  target_shard_replicas = target_shard_replicas or []
532
644
 
533
645
  shard_nodes = get_all_shard_nodes(shard, use_read_replicas=use_read_replica_nodes)
@@ -550,7 +662,10 @@ def choose_node(
550
662
  ranked_nodes.setdefault(score, []).append((node, shard_replica_id))
551
663
 
552
664
  top = ranked_nodes[max(ranked_nodes)]
553
- selected_node, shard_replica_id = random.choice(top)
665
+ # As shard replica ids are random numbers, we sort by shard replica id and choose its
666
+ # node to make sure we choose deterministically without favouring any node in particular
667
+ top.sort(key=lambda x: x[1])
668
+ selected_node, shard_replica_id = top[0]
554
669
  return selected_node, shard_replica_id
555
670
 
556
671
 
@@ -558,17 +673,17 @@ def check_enough_nodes():
558
673
  """
559
674
  It raises an exception if it can't find enough nodes for the configured replicas.
560
675
  """
676
+ drain_nodes = settings.drain_nodes
561
677
  target_replicas = settings.node_replicas
562
678
  available_nodes = get_index_nodes()
679
+ available_nodes = [node for node in available_nodes if node.id not in drain_nodes]
563
680
  if len(available_nodes) < target_replicas:
564
681
  raise NodeClusterSmall(
565
682
  f"Not enough nodes. Total: {len(available_nodes)}, Required: {target_replicas}"
566
683
  )
567
684
  if settings.max_node_replicas >= 0:
568
685
  available_nodes = list(
569
- filter(
570
- lambda n: n.shard_count < settings.max_node_replicas, available_nodes # type: ignore
571
- )
686
+ filter(lambda n: n.shard_count < settings.max_node_replicas, available_nodes)
572
687
  )
573
688
  if len(available_nodes) < target_replicas:
574
689
  raise NodeClusterSmall(
@@ -576,26 +691,32 @@ def check_enough_nodes():
576
691
  )
577
692
 
578
693
 
579
- def sorted_primary_nodes(avoid_nodes: Optional[list[str]] = None) -> list[str]:
694
+ def sorted_primary_nodes(
695
+ avoid_nodes: Optional[list[str]] = None,
696
+ ignore_nodes: Optional[list[str]] = None,
697
+ ) -> list[str]:
580
698
  """
581
699
  Returns the list of all primary node ids sorted by decreasing available
582
700
  disk space (from more to less available disk reported).
583
701
 
584
- It will put the node ids in `avoid_nodes` at the tail of the list.
702
+ Nodes in `avoid_nodes` are placed at the tail of the list.
703
+ Nodes in `ignore_nodes` are ignored and never returned.
585
704
  """
586
705
  primary_nodes = get_index_nodes(include_secondary=False)
587
706
 
588
707
  # Sort by available disk
589
- sorted_primary_nodes = sorted(
590
- primary_nodes, key=lambda n: n.available_disk, reverse=True
591
- )
592
- available_node_ids = [node.id for node in sorted_primary_nodes]
708
+ sorted_nodes = sorted(primary_nodes, key=lambda n: n.available_disk, reverse=True)
709
+ available_node_ids = [node.id for node in sorted_nodes]
593
710
 
594
711
  avoid_nodes = avoid_nodes or []
595
- # get preferred nodes first
712
+ ignore_nodes = ignore_nodes or []
713
+
714
+ # Get the non-avoided nodes first
596
715
  preferred_nodes = [nid for nid in available_node_ids if nid not in avoid_nodes]
597
- # now, add to the end of the last nodes
598
- preferred_node_order = preferred_nodes + [
599
- nid for nid in available_node_ids if nid not in preferred_nodes
600
- ]
601
- return preferred_node_order
716
+
717
+ # Append the avoid_nodes at the tail of the list
718
+ result_nodes = preferred_nodes + [nid for nid in available_node_ids if nid not in preferred_nodes]
719
+
720
+ # Remove ignore_nodes from the list
721
+ result_nodes = [nid for nid in result_nodes if nid not in ignore_nodes]
722
+ return result_nodes