nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import argparse
21
21
  import asyncio
22
- import enum
23
22
  import logging
24
23
  from datetime import datetime
25
24
  from typing import Optional
@@ -27,140 +26,213 @@ from typing import Optional
27
26
  from nucliadb.common import datamanagers, locking
28
27
  from nucliadb.common.cluster import manager as cluster_manager
29
28
  from nucliadb.common.context import ApplicationContext
30
- from nucliadb_protos import writer_pb2
29
+ from nucliadb.common.datamanagers.rollover import RolloverState, RolloverStateNotFoundError
30
+ from nucliadb.common.external_index_providers.base import ExternalIndexManager
31
+ from nucliadb.common.external_index_providers.manager import (
32
+ get_external_index_manager,
33
+ )
34
+ from nucliadb.common.nidx import get_nidx_fake_node
35
+ from nucliadb_protos import nodewriter_pb2, writer_pb2
31
36
  from nucliadb_telemetry import errors
32
37
 
33
38
  from .manager import get_index_node
34
39
  from .settings import settings
35
- from .utils import delete_resource_from_shard, index_resource_to_shard, wait_for_node
40
+ from .utils import (
41
+ delete_resource_from_shard,
42
+ get_resource,
43
+ get_resource_index_message,
44
+ index_resource_to_shard,
45
+ wait_for_node,
46
+ )
36
47
 
37
48
  logger = logging.getLogger(__name__)
38
49
 
39
50
 
40
- class RolloverStatus(enum.Enum):
41
- RESOURCES_SCHEDULED = "resources_scheduled"
42
- RESOURCES_INDEXED = "resources_indexed"
43
- RESOURCES_VALIDATED = "resources_validated"
44
-
51
+ class UnexpectedRolloverError(Exception):
52
+ pass
45
53
 
46
- def _get_rollover_status(
47
- rollover_shards: writer_pb2.Shards, status: RolloverStatus
48
- ) -> bool:
49
- return rollover_shards.extra.get(status.value) == "true"
50
54
 
55
+ async def create_rollover_index(
56
+ app_context: ApplicationContext,
57
+ kbid: str,
58
+ drain_nodes: Optional[list[str]] = None,
59
+ external: Optional[ExternalIndexManager] = None,
60
+ ) -> None:
61
+ """
62
+ Creates a new index for a knowledgebox in the index node cluster (and to the external index provider if configured).
63
+ For the external index case, we still need the shard on the index node cluster to be created because
64
+ it is used to store the rollover state during the rollover. However, the actual indexing will be done
65
+ by the external index provider.
66
+ """
67
+ await create_rollover_shards(app_context, kbid, drain_nodes=drain_nodes)
68
+ if external is not None:
69
+ if external.supports_rollover:
70
+ await create_rollover_external_index(kbid, external)
71
+ else:
72
+ logger.info(
73
+ "External index provider does not support rollover",
74
+ extra={"kbid": kbid, "external_index_provider": external.type.value},
75
+ )
51
76
 
52
- def _set_rollover_status(rollover_shards: writer_pb2.Shards, status: RolloverStatus):
53
- rollover_shards.extra[status.value] = "true"
54
77
 
78
+ async def create_rollover_external_index(kbid: str, external: ExternalIndexManager) -> None:
79
+ extra = {"kbid": kbid, "external_index_provider": external.type.value}
80
+ async with datamanagers.with_ro_transaction() as txn:
81
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
82
+ if state.external_index_created:
83
+ logger.info("Rollover external index already created, skipping", extra=extra)
84
+ return
55
85
 
56
- def _clear_rollover_status(rollover_shards: writer_pb2.Shards):
57
- for status in RolloverStatus:
58
- rollover_shards.extra.pop(status.value, None)
86
+ logger.info("Creating rollover external index", extra=extra)
87
+ async with datamanagers.with_ro_transaction() as txn:
88
+ stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
89
+ if stored_metadata is None:
90
+ raise UnexpectedRolloverError("External index metadata not found")
59
91
 
92
+ rollover_metadata = await external.rollover_create_indexes(stored_metadata)
60
93
 
61
- class UnexpectedRolloverError(Exception):
62
- pass
94
+ async with datamanagers.with_rw_transaction() as txn:
95
+ await datamanagers.rollover.update_kb_rollover_external_index_metadata(
96
+ txn, kbid=kbid, metadata=rollover_metadata
97
+ )
98
+ state.external_index_created = True
99
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
100
+ await txn.commit()
63
101
 
64
102
 
65
103
  async def create_rollover_shards(
66
- app_context: ApplicationContext, kbid: str
104
+ app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
67
105
  ) -> writer_pb2.Shards:
68
106
  """
69
- Creates shards to to used for a rollover operation
107
+ Creates new index node shards for a rollover operation.
108
+ If drain_nodes is provided, no replicas will be created on those nodes.
70
109
  """
71
- logger.warning("Creating rollover shards", extra={"kbid": kbid})
110
+
111
+ logger.info("Creating rollover shards", extra={"kbid": kbid})
72
112
  sm = app_context.shard_manager
113
+ nidx_node = get_nidx_fake_node()
73
114
 
74
- async with datamanagers.with_transaction() as txn:
75
- existing_rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
76
- txn, kbid=kbid
77
- )
78
- if existing_rollover_shards is not None:
79
- logger.warning("Rollover shards already exist, skipping")
80
- return existing_rollover_shards
115
+ async with datamanagers.with_ro_transaction() as txn:
116
+ try:
117
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
118
+ except RolloverStateNotFoundError:
119
+ # State is not set yet, create it
120
+ state = RolloverState(
121
+ rollover_shards_created=False,
122
+ external_index_created=False,
123
+ resources_scheduled=False,
124
+ resources_indexed=False,
125
+ cutover_shards=False,
126
+ cutover_external_index=False,
127
+ resources_validated=False,
128
+ )
81
129
 
82
130
  kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
83
131
  if kb_shards is None:
84
132
  raise UnexpectedRolloverError(f"No shards found for KB {kbid}")
85
133
 
86
- # create new shards
87
- created_shards = []
88
- try:
89
- nodes = cluster_manager.sorted_primary_nodes()
90
- for shard in kb_shards.shards:
91
- shard.ClearField("replicas")
92
- # Attempt to create configured number of replicas
93
- replicas_created = 0
94
- while replicas_created < settings.node_replicas:
95
- if len(nodes) == 0:
96
- # could have multiple shards on single node
97
- nodes = cluster_manager.sorted_primary_nodes()
98
- node_id = nodes.pop(0)
99
-
100
- node = get_index_node(node_id)
101
- if node is None:
102
- logger.error(f"Node {node_id} is not found or not available")
103
- continue
104
- try:
134
+ if state.rollover_shards_created:
135
+ logger.info("Rollover shards already created, skipping", extra={"kbid": kbid})
136
+ return kb_shards
137
+
138
+ # create new shards
139
+ created_shards = []
140
+ try:
141
+ nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
142
+ for shard in kb_shards.shards:
143
+ shard.ClearField("replicas")
144
+ # Attempt to create configured number of replicas
145
+ replicas_created = 0
146
+ while replicas_created < settings.node_replicas:
147
+ if len(nodes) == 0:
148
+ # could have multiple shards on single node
149
+ nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
150
+ node_id = nodes.pop(0)
151
+
152
+ node = get_index_node(node_id)
153
+ if node is None:
154
+ logger.error(f"Node {node_id} is not found or not available")
155
+ continue
156
+
157
+ vectorsets = {
158
+ vectorset_id: vectorset_config.vectorset_index_config
159
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
160
+ txn, kbid=kbid
161
+ )
162
+ }
163
+ try:
164
+ if not vectorsets:
165
+ is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
166
+ vector_index_config = nodewriter_pb2.VectorIndexConfig(
167
+ similarity=kb_shards.similarity,
168
+ vector_type=nodewriter_pb2.VectorType.DENSE_F32,
169
+ vector_dimension=kb_shards.model.vector_dimension,
170
+ normalize_vectors=is_matryoshka,
171
+ )
105
172
  shard_created = await node.new_shard(
106
173
  kbid,
107
- similarity=kb_shards.similarity,
108
- release_channel=kb_shards.release_channel,
174
+ vector_index_config=vector_index_config,
109
175
  )
110
- except Exception as e:
111
- errors.capture_exception(e)
112
- logger.exception(f"Error creating new shard at {node}")
113
- continue
114
-
115
- replica = writer_pb2.ShardReplica(node=str(node_id))
116
- replica.shard.CopyFrom(shard_created)
117
- shard.replicas.append(replica)
118
- created_shards.append(shard)
119
- replicas_created += 1
120
- except Exception as e:
121
- errors.capture_exception(e)
122
- logger.exception("Unexpected error creating new shard")
123
- for created_shard in created_shards:
124
- await sm.rollback_shard(created_shard)
125
- raise e
126
-
127
- await datamanagers.rollover.update_kb_rollover_shards(
128
- txn, kbid=kbid, kb_shards=kb_shards
129
- )
176
+ else:
177
+ shard_created = await node.new_shard_with_vectorsets(
178
+ kbid,
179
+ vectorsets_configs=vectorsets,
180
+ )
181
+ except Exception as e:
182
+ errors.capture_exception(e)
183
+ logger.exception(f"Error creating new shard at {node}")
184
+ continue
185
+
186
+ replica = writer_pb2.ShardReplica(node=str(node_id))
187
+ replica.shard.CopyFrom(shard_created)
188
+ shard.replicas.append(replica)
189
+ created_shards.append(shard)
190
+ replicas_created += 1
191
+
192
+ if nidx_node:
193
+ nidx_shard = await nidx_node.new_shard_with_vectorsets(
194
+ kbid,
195
+ vectorsets_configs=vectorsets,
196
+ )
197
+ shard.nidx_shard_id = nidx_shard.id
198
+
199
+ except Exception as e:
200
+ errors.capture_exception(e)
201
+ logger.exception("Unexpected error creating new shard")
202
+ for created_shard in created_shards:
203
+ await sm.rollback_shard(created_shard)
204
+ raise e
205
+
206
+ async with datamanagers.with_transaction() as txn:
207
+ await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=kb_shards)
208
+ state.rollover_shards_created = True
209
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
130
210
  await txn.commit()
131
211
  return kb_shards
132
212
 
133
213
 
134
- def _get_shard(
135
- shards: writer_pb2.Shards, shard_id: str
136
- ) -> Optional[writer_pb2.ShardObject]:
214
+ def _get_shard(shards: writer_pb2.Shards, shard_id: str) -> Optional[writer_pb2.ShardObject]:
137
215
  for shard in shards.shards:
138
216
  if shard_id == shard.shard:
139
217
  return shard
140
218
  return None
141
219
 
142
220
 
143
- async def schedule_resource_indexing(
144
- app_context: ApplicationContext, kbid: str
145
- ) -> None:
221
+ async def schedule_resource_indexing(app_context: ApplicationContext, kbid: str) -> None:
146
222
  """
147
223
  Schedule indexing all data in a kb in rollover shards
148
224
  """
149
- logger.warning("Indexing rollover shards", extra={"kbid": kbid})
150
-
151
- async with datamanagers.with_transaction() as txn:
152
- rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
153
- txn, kbid=kbid
154
- )
155
- if rollover_shards is None:
225
+ logger.info("Scheduling resources to be indexed to rollover shards", extra={"kbid": kbid})
226
+ async with datamanagers.with_ro_transaction() as txn:
227
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
228
+ if not state.rollover_shards_created:
156
229
  raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
157
-
158
- if _get_rollover_status(rollover_shards, RolloverStatus.RESOURCES_SCHEDULED):
159
- logger.warning(
160
- "Resources already scheduled for indexing, skipping",
161
- extra={"kbid": kbid},
162
- )
163
- return
230
+ if state.resources_scheduled:
231
+ logger.info(
232
+ "Resources already scheduled for indexing, skipping",
233
+ extra={"kbid": kbid},
234
+ )
235
+ return
164
236
 
165
237
  batch = []
166
238
  async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
@@ -168,9 +240,7 @@ async def schedule_resource_indexing(
168
240
 
169
241
  if len(batch) > 100:
170
242
  async with datamanagers.with_transaction() as txn:
171
- await datamanagers.rollover.add_batch_to_index(
172
- txn, kbid=kbid, batch=batch
173
- )
243
+ await datamanagers.rollover.add_batch_to_index(txn, kbid=kbid, batch=batch)
174
244
  await txn.commit()
175
245
  batch = []
176
246
  if len(batch) > 0:
@@ -179,10 +249,8 @@ async def schedule_resource_indexing(
179
249
  await txn.commit()
180
250
 
181
251
  async with datamanagers.with_transaction() as txn:
182
- _set_rollover_status(rollover_shards, RolloverStatus.RESOURCES_SCHEDULED)
183
- await datamanagers.rollover.update_kb_rollover_shards(
184
- txn, kbid=kbid, kb_shards=rollover_shards
185
- )
252
+ state.resources_scheduled = True
253
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
186
254
  await txn.commit()
187
255
 
188
256
 
@@ -190,24 +258,27 @@ def _to_ts(dt: datetime) -> int:
190
258
  return int(dt.timestamp() * 1000 * 1000)
191
259
 
192
260
 
193
- async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> None:
261
+ async def index_to_rollover_index(
262
+ app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
263
+ ) -> None:
194
264
  """
195
- Indexes all data in a kb in rollover shards
265
+ Indexes all data in a kb in rollover indexes. This happens before the cutover.
196
266
  """
197
-
198
- async with datamanagers.with_transaction() as txn:
199
- rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
200
- txn, kbid=kbid
201
- )
202
- if rollover_shards is None:
203
- raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
204
-
205
- if _get_rollover_status(rollover_shards, RolloverStatus.RESOURCES_INDEXED):
206
- logger.warning("Resources already indexed, skipping", extra={"kbid": kbid})
267
+ extra = {"kbid": kbid, "external_index_provider": None}
268
+ if external is not None:
269
+ extra["external_index_provider"] = external.type.value
270
+ async with datamanagers.with_ro_transaction() as txn:
271
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
272
+ if not all([state.rollover_shards_created, state.resources_scheduled]):
273
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
274
+ rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
275
+ if rollover_shards is None:
276
+ raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
277
+ if state.resources_indexed:
278
+ logger.info("Resources already indexed, skipping", extra=extra)
207
279
  return
208
280
 
209
- logger.warning("Indexing rollover shards", extra={"kbid": kbid})
210
-
281
+ logger.info("Indexing to rollover index", extra=extra)
211
282
  wait_index_batch: list[writer_pb2.ShardObject] = []
212
283
  # now index on all new shards only
213
284
  while True:
@@ -221,11 +292,14 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
221
292
  txn, kbid=kbid, rid=resource_id
222
293
  )
223
294
  if shard_id is None:
224
- logger.error(
225
- "Shard id not found for resource",
295
+ logger.warning(
296
+ "Shard id not found for resource. Skipping indexing as it may have been deleted",
226
297
  extra={"kbid": kbid, "resource_id": resource_id},
227
298
  )
228
- raise UnexpectedRolloverError("Shard id not found for resource")
299
+ async with datamanagers.with_transaction() as txn:
300
+ await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
301
+ await txn.commit()
302
+ continue
229
303
 
230
304
  shard = _get_shard(rollover_shards, shard_id)
231
305
  if shard is None: # pragma: no cover
@@ -236,28 +310,29 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
236
310
  raise UnexpectedRolloverError(
237
311
  f"Shard {shard_id} not found. Was a new one created during migration?"
238
312
  )
239
-
240
- resource_index_message = await index_resource_to_shard(
241
- app_context, kbid, resource_id, shard
242
- )
243
- if resource_index_message is None:
313
+ resource = await get_resource(kbid, resource_id)
314
+ index_message = await get_resource_index_message(kbid, resource_id)
315
+ if resource is None or index_message is None:
244
316
  # resource no longer existing, remove indexing and carry on
245
317
  async with datamanagers.with_transaction() as txn:
246
- await datamanagers.rollover.remove_to_index(
247
- txn, kbid=kbid, resource=resource_id
248
- )
318
+ await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
249
319
  await txn.commit()
250
320
  continue
251
321
 
322
+ if external is not None:
323
+ await external.index_resource(resource_id, index_message, to_rollover_indexes=True)
324
+ else:
325
+ await index_resource_to_shard(
326
+ app_context, kbid, resource_id, shard, resource_index_message=index_message
327
+ )
328
+
252
329
  async with datamanagers.with_transaction() as txn:
253
330
  await datamanagers.rollover.add_indexed(
254
331
  txn,
255
332
  kbid=kbid,
256
333
  resource_id=resource_id,
257
334
  shard_id=shard_id,
258
- modification_time=_to_ts(
259
- resource_index_message.metadata.modified.ToDatetime()
260
- ),
335
+ modification_time=_to_ts(resource.basic.modified.ToDatetime()), # type: ignore
261
336
  )
262
337
  await txn.commit()
263
338
  wait_index_batch.append(shard)
@@ -271,11 +346,66 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
271
346
  await wait_for_node(app_context, node_id)
272
347
  wait_index_batch = []
273
348
 
274
- _set_rollover_status(rollover_shards, RolloverStatus.RESOURCES_INDEXED)
275
349
  async with datamanagers.with_transaction() as txn:
276
- await datamanagers.rollover.update_kb_rollover_shards(
277
- txn, kbid=kbid, kb_shards=rollover_shards
350
+ state.resources_indexed = True
351
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
352
+ await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=rollover_shards)
353
+ await txn.commit()
354
+
355
+
356
+ async def cutover_index(
357
+ app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
358
+ ) -> None:
359
+ """
360
+ Swaps out the current active index for a knowledgebox.
361
+ """
362
+ await cutover_shards(app_context, kbid)
363
+ if external is not None:
364
+ if external.supports_rollover:
365
+ await cutover_external_index(kbid, external)
366
+ else:
367
+ logger.info(
368
+ "External index provider does not support rollover",
369
+ extra={"kbid": kbid, "external_index_provider": external.type.value},
370
+ )
371
+
372
+
373
+ async def cutover_external_index(kbid: str, external: ExternalIndexManager) -> None:
374
+ """
375
+ Cuts over to the newly created external index for a knowledgebox.
376
+ The old indexes are deleted.
377
+ """
378
+ extra = {"kbid": kbid, "external_index_provider": external.type.value}
379
+ logger.info("Cutting over external index", extra=extra)
380
+ async with datamanagers.with_rw_transaction() as txn:
381
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
382
+ if not all(
383
+ [
384
+ state.rollover_shards_created,
385
+ state.resources_scheduled,
386
+ state.resources_indexed,
387
+ ]
388
+ ):
389
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
390
+ if state.cutover_external_index:
391
+ logger.info("External index already cut over, skipping", extra=extra)
392
+ return
393
+
394
+ stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
395
+ rollover_metadata = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
396
+ txn, kbid=kbid
397
+ )
398
+ if stored_metadata is None or rollover_metadata is None:
399
+ raise UnexpectedRolloverError("stored or rollover external index metadata not found")
400
+
401
+ await external.rollover_cutover_indexes()
402
+
403
+ await datamanagers.kb.set_external_index_provider_metadata(
404
+ txn, kbid=kbid, metadata=rollover_metadata
278
405
  )
406
+ await datamanagers.rollover.delete_kb_rollover_external_index_metadata(txn, kbid=kbid)
407
+ state.cutover_external_index = True
408
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
279
409
  await txn.commit()
280
410
 
281
411
 
@@ -283,33 +413,44 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:
283
413
  """
284
414
  Swaps out the current active shards for a knowledgebox.
285
415
  """
286
- logger.warning("Cutting over shards", extra={"kbid": kbid})
416
+ logger.info("Cutting over shards", extra={"kbid": kbid})
287
417
  async with datamanagers.with_transaction() as txn:
288
418
  sm = app_context.shard_manager
289
419
 
420
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
421
+ if not all(
422
+ [
423
+ state.rollover_shards_created,
424
+ state.resources_scheduled,
425
+ state.resources_indexed,
426
+ ]
427
+ ):
428
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
429
+ if state.cutover_shards:
430
+ logger.info("Shards already cut over, skipping", extra={"kbid": kbid})
431
+ return
432
+
290
433
  previously_active_shards = await datamanagers.cluster.get_kb_shards(
291
- txn, kbid=kbid
292
- )
293
- rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
294
- txn, kbid=kbid
434
+ txn, kbid=kbid, for_update=True
295
435
  )
436
+ rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
296
437
  if previously_active_shards is None or rollover_shards is None:
297
438
  raise UnexpectedRolloverError("Shards for kb not found")
298
439
 
299
- _clear_rollover_status(rollover_shards)
300
- await datamanagers.cluster.update_kb_shards(
301
- txn, kbid=kbid, shards=rollover_shards
302
- )
440
+ await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rollover_shards)
303
441
  await datamanagers.rollover.delete_kb_rollover_shards(txn, kbid=kbid)
304
442
 
305
443
  for shard in previously_active_shards.shards:
306
444
  await sm.rollback_shard(shard)
307
445
 
446
+ state.cutover_shards = True
447
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
448
+
308
449
  await txn.commit()
309
450
 
310
451
 
311
452
  async def validate_indexed_data(
312
- app_context: ApplicationContext, kbid: str
453
+ app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
313
454
  ) -> list[str]:
314
455
  """
315
456
  Goes through all the resources in a knowledgebox and validates it
@@ -319,21 +460,34 @@ async def validate_indexed_data(
319
460
 
320
461
  If a resource was removed during the rollover, it will be removed as well.
321
462
  """
463
+ extra = {"kbid": kbid, "external_index_provider": None}
464
+ if external is not None:
465
+ extra["external_index_provider"] = external.type.value
466
+ async with datamanagers.with_ro_transaction() as txn:
467
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
468
+ if not all(
469
+ [
470
+ state.rollover_shards_created,
471
+ state.resources_scheduled,
472
+ state.resources_indexed,
473
+ state.cutover_shards,
474
+ ]
475
+ ):
476
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
322
477
 
323
- async with datamanagers.with_transaction() as txn:
324
478
  rolled_over_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
325
479
  if rolled_over_shards is None:
326
480
  raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
327
481
 
328
- if _get_rollover_status(rolled_over_shards, RolloverStatus.RESOURCES_VALIDATED):
329
- logger.warning("Resources already validated, skipping", extra={"kbid": kbid})
482
+ if state.resources_validated:
483
+ logger.info("Resources already validated, skipping", extra=extra)
330
484
  return []
331
485
 
332
- logger.warning("Validating indexed data", extra={"kbid": kbid})
486
+ logger.info("Validating indexed data", extra=extra)
333
487
 
334
- repaired_resources = []
488
+ repaired_resources: list[str] = []
335
489
  async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
336
- async with datamanagers.with_transaction() as txn:
490
+ async with datamanagers.with_ro_transaction() as txn:
337
491
  indexed_data = await datamanagers.rollover.get_indexed_data(
338
492
  txn, kbid=kbid, resource_id=resource_id
339
493
  )
@@ -350,7 +504,7 @@ async def validate_indexed_data(
350
504
  if shard_id is None:
351
505
  logger.error(
352
506
  "Shard id not found for resource",
353
- extra={"kbid": kbid, "resource_id": resource_id},
507
+ extra={"resource_id": resource_id, **extra},
354
508
  )
355
509
  raise UnexpectedRolloverError("Shard id not found for resource")
356
510
  last_indexed = 0
@@ -360,23 +514,18 @@ async def validate_indexed_data(
360
514
  logger.error(
361
515
  "Shard not found for resource",
362
516
  extra={
363
- "kbid": kbid,
364
517
  "resource_id": resource_id,
365
518
  "shard_id": shard_id,
519
+ **extra,
366
520
  },
367
521
  )
368
- raise UnexpectedRolloverError(
369
- f"Shard {shard_id} not found. This should not happen"
370
- )
522
+ raise UnexpectedRolloverError(f"Shard {shard_id} not found. This should not happen")
371
523
 
372
- async with datamanagers.with_transaction() as txn:
373
- res = await datamanagers.resources.get_resource(
374
- txn, kbid=kbid, rid=resource_id
375
- )
524
+ res = await get_resource(kbid, resource_id)
376
525
  if res is None:
377
526
  logger.error(
378
527
  "Resource not found while validating, skipping",
379
- extra={"kbid": kbid, "resource_id": resource_id},
528
+ extra={"resource_id": resource_id, **extra},
380
529
  )
381
530
  continue
382
531
 
@@ -393,12 +542,26 @@ async def validate_indexed_data(
393
542
  await txn.commit()
394
543
  continue
395
544
 
545
+ index_message = await get_resource_index_message(kbid, resource_id)
546
+ if index_message is None:
547
+ logger.error(
548
+ "Resource index message not found while validating, skipping",
549
+ extra={"resource_id": resource_id, **extra},
550
+ )
551
+ continue
552
+
396
553
  # resource was modified or added during rollover, reindex
397
- resource_index_message = await index_resource_to_shard(
398
- app_context, kbid, resource_id, shard
399
- )
400
- if resource_index_message is not None:
401
- repaired_resources.append(resource_id)
554
+ if external is not None:
555
+ await external.index_resource(
556
+ resource_id,
557
+ index_message,
558
+ to_rollover_indexes=True,
559
+ )
560
+ else:
561
+ await index_resource_to_shard(
562
+ app_context, kbid, resource_id, shard, resource_index_message=index_message
563
+ )
564
+ repaired_resources.append(resource_id)
402
565
  async with datamanagers.with_transaction() as txn:
403
566
  await datamanagers.rollover.add_indexed(
404
567
  txn,
@@ -409,25 +572,23 @@ async def validate_indexed_data(
409
572
  )
410
573
  await txn.commit()
411
574
 
412
- # any left overs should be deleted
413
- async for resource_id, (
414
- shard_id,
415
- last_indexed,
416
- ) in datamanagers.rollover.iterate_indexed_data(kbid=kbid):
417
- if last_indexed == -1:
418
- continue
419
-
420
- shard = _get_shard(rolled_over_shards, shard_id)
421
- if shard is None:
422
- raise UnexpectedRolloverError("Shard not found. This should not happen")
575
+ # any left overs should be deleted
576
+ async for resource_id, (
577
+ shard_id,
578
+ last_indexed,
579
+ ) in datamanagers.rollover.iterate_indexed_data(kbid=kbid):
580
+ if last_indexed == -1:
581
+ continue
423
582
 
424
- await delete_resource_from_shard(app_context, kbid, resource_id, shard)
583
+ shard = _get_shard(rolled_over_shards, shard_id)
584
+ if shard is None:
585
+ raise UnexpectedRolloverError("Shard not found. This should not happen")
586
+ await delete_resource_from_shard(app_context, kbid, resource_id, shard)
425
587
 
426
- _set_rollover_status(rolled_over_shards, RolloverStatus.RESOURCES_VALIDATED)
427
588
  async with datamanagers.with_transaction() as txn:
428
- await datamanagers.cluster.update_kb_shards(
429
- txn, kbid=kbid, shards=rolled_over_shards
430
- )
589
+ state.resources_validated = True
590
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
591
+ await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rolled_over_shards)
431
592
 
432
593
  return repaired_resources
433
594
 
@@ -449,64 +610,76 @@ async def clean_indexed_data(app_context: ApplicationContext, kbid: str) -> None
449
610
 
450
611
  async def clean_rollover_status(app_context: ApplicationContext, kbid: str) -> None:
451
612
  async with datamanagers.with_transaction() as txn:
452
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
453
- if kb_shards is None:
613
+ try:
614
+ await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
615
+ except RolloverStateNotFoundError:
454
616
  logger.warning(
455
- "No shards found for KB, skipping clean rollover status",
456
- extra={"kbid": kbid},
617
+ "No rollover state found, skipping clean rollover status", extra={"kbid": kbid}
457
618
  )
458
619
  return
620
+ await datamanagers.rollover.clear_rollover_state(txn, kbid=kbid)
621
+ await txn.commit()
622
+
459
623
 
460
- _clear_rollover_status(kb_shards)
461
- await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
624
+ async def wait_for_cluster_ready() -> None:
625
+ node_ready_checks = 0
626
+ while len(cluster_manager.INDEX_NODES) == 0:
627
+ if node_ready_checks > 10:
628
+ raise Exception("No index nodes available")
629
+ logger.info("Waiting for index nodes to be available")
630
+ await asyncio.sleep(1)
631
+ node_ready_checks += 1
462
632
 
463
633
 
464
- async def rollover_kb_shards(app_context: ApplicationContext, kbid: str) -> None:
634
+ async def rollover_kb_index(
635
+ app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
636
+ ) -> None:
465
637
  """
466
- Rollover a shard is the process of creating new shard replicas for every
467
- shard and indexing all existing resources into the replicas.
638
+ Rolling over a KB index is the process of creating new shard replicas for every
639
+ shard and indexing all existing resources into the replicas. Also includes creating new external indexes if
640
+ the KB is configured to use them.
641
+
642
+ Once all the data is in the new indexes, cut over to the replicated index and delete the old one.
468
643
 
469
- Once all the data is in the new shards, cut over the registered replicas
470
- to the new shards and delete the old shards.
644
+ If drain_nodes is provided, no index node replicas will be created on those nodes. This is useful
645
+ for when we want to remove a set of nodes from the index node cluster.
471
646
 
472
647
  This is a very expensive operation and should be done with care.
473
648
 
474
649
  Process:
475
- - Create new shards
650
+ - Create new index for kb index (index node shards or external indexes if configured)
476
651
  - Schedule all resources to be indexed
477
- - Index all resources into new shards
478
- - Cut over replicas to new shards
479
- - Validate that all resources are in the new shards
652
+ - Index all resources into new kb index (index node shards or external indexes if configured)
653
+ - Cut over replicas to new shards (and external indexes if configured)
654
+ - Validate that all resources are in the new kb index
480
655
  - Clean up indexed data
481
656
  """
482
- node_ready_checks = 0
483
- while len(cluster_manager.INDEX_NODES) == 0:
484
- if node_ready_checks > 10:
485
- raise Exception("No index nodes available")
486
- logger.warning("Waiting for index nodes to be available")
487
- await asyncio.sleep(1)
488
- node_ready_checks += 1
657
+ await wait_for_cluster_ready()
489
658
 
490
- logger.warning("Rolling over shards", extra={"kbid": kbid})
659
+ extra = {"kbid": kbid, "external_index_provider": None}
660
+ external = await get_external_index_manager(kbid, for_rollover=True)
661
+ if external is not None:
662
+ extra["external_index_provider"] = external.type.value
663
+ logger.info("Rolling over KB index", extra=extra)
491
664
 
492
665
  async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
493
- await create_rollover_shards(app_context, kbid)
666
+ await create_rollover_index(app_context, kbid, drain_nodes=drain_nodes, external=external)
494
667
  await schedule_resource_indexing(app_context, kbid)
495
- await index_rollover_shards(app_context, kbid)
496
- await cutover_shards(app_context, kbid)
668
+ await index_to_rollover_index(app_context, kbid, external=external)
669
+ await cutover_index(app_context, kbid, external=external)
497
670
  # we need to cut over BEFORE we validate the data
498
- await validate_indexed_data(app_context, kbid)
671
+ await validate_indexed_data(app_context, kbid, external=external)
499
672
  await clean_indexed_data(app_context, kbid)
500
673
  await clean_rollover_status(app_context, kbid)
501
674
 
502
- logger.warning("Finished rolling over shards", extra={"kbid": kbid})
675
+ logger.info("Finished rolling over KB indes", extra=extra)
503
676
 
504
677
 
505
678
  async def _rollover_kbid_command(kbid: str) -> None: # pragma: no cover
506
679
  app_context = ApplicationContext()
507
680
  await app_context.initialize()
508
681
  try:
509
- await rollover_kb_shards(app_context, kbid)
682
+ await rollover_kb_index(app_context, kbid)
510
683
  finally:
511
684
  await app_context.finalize()
512
685