nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,7 +19,6 @@
19
19
  #
20
20
  import argparse
21
21
  import asyncio
22
- import enum
23
22
  import logging
24
23
  from datetime import datetime
25
24
  from typing import Optional
@@ -27,145 +26,213 @@ from typing import Optional
27
26
  from nucliadb.common import datamanagers, locking
28
27
  from nucliadb.common.cluster import manager as cluster_manager
29
28
  from nucliadb.common.context import ApplicationContext
30
- from nucliadb_protos import writer_pb2
29
+ from nucliadb.common.datamanagers.rollover import RolloverState, RolloverStateNotFoundError
30
+ from nucliadb.common.external_index_providers.base import ExternalIndexManager
31
+ from nucliadb.common.external_index_providers.manager import (
32
+ get_external_index_manager,
33
+ )
34
+ from nucliadb.common.nidx import get_nidx_fake_node
35
+ from nucliadb_protos import nodewriter_pb2, writer_pb2
31
36
  from nucliadb_telemetry import errors
32
37
 
33
38
  from .manager import get_index_node
34
39
  from .settings import settings
35
- from .utils import delete_resource_from_shard, index_resource_to_shard, wait_for_node
40
+ from .utils import (
41
+ delete_resource_from_shard,
42
+ get_resource,
43
+ get_resource_index_message,
44
+ index_resource_to_shard,
45
+ wait_for_node,
46
+ )
36
47
 
37
48
  logger = logging.getLogger(__name__)
38
49
 
39
50
 
40
- class RolloverStatus(enum.Enum):
41
- RESOURCES_SCHEDULED = "resources_scheduled"
42
- RESOURCES_INDEXED = "resources_indexed"
43
- RESOURCES_VALIDATED = "resources_validated"
44
-
51
+ class UnexpectedRolloverError(Exception):
52
+ pass
45
53
 
46
- def _get_rollover_status(
47
- rollover_shards: writer_pb2.Shards, status: RolloverStatus
48
- ) -> bool:
49
- return rollover_shards.extra.get(status.value) == "true"
50
54
 
55
+ async def create_rollover_index(
56
+ app_context: ApplicationContext,
57
+ kbid: str,
58
+ drain_nodes: Optional[list[str]] = None,
59
+ external: Optional[ExternalIndexManager] = None,
60
+ ) -> None:
61
+ """
62
+ Creates a new index for a knowledgebox in the index node cluster (and to the external index provider if configured).
63
+ For the external index case, we still need the shard on the index node cluster to be created because
64
+ it is used to store the rollover state during the rollover. However, the actual indexing will be done
65
+ by the external index provider.
66
+ """
67
+ await create_rollover_shards(app_context, kbid, drain_nodes=drain_nodes)
68
+ if external is not None:
69
+ if external.supports_rollover:
70
+ await create_rollover_external_index(kbid, external)
71
+ else:
72
+ logger.info(
73
+ "External index provider does not support rollover",
74
+ extra={"kbid": kbid, "external_index_provider": external.type.value},
75
+ )
51
76
 
52
- def _set_rollover_status(rollover_shards: writer_pb2.Shards, status: RolloverStatus):
53
- rollover_shards.extra[status.value] = "true"
54
77
 
78
+ async def create_rollover_external_index(kbid: str, external: ExternalIndexManager) -> None:
79
+ extra = {"kbid": kbid, "external_index_provider": external.type.value}
80
+ async with datamanagers.with_ro_transaction() as txn:
81
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
82
+ if state.external_index_created:
83
+ logger.info("Rollover external index already created, skipping", extra=extra)
84
+ return
55
85
 
56
- def _clear_rollover_status(rollover_shards: writer_pb2.Shards):
57
- for status in RolloverStatus:
58
- rollover_shards.extra.pop(status.value, None)
86
+ logger.info("Creating rollover external index", extra=extra)
87
+ async with datamanagers.with_ro_transaction() as txn:
88
+ stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
89
+ if stored_metadata is None:
90
+ raise UnexpectedRolloverError("External index metadata not found")
59
91
 
92
+ rollover_metadata = await external.rollover_create_indexes(stored_metadata)
60
93
 
61
- class UnexpectedRolloverError(Exception):
62
- pass
94
+ async with datamanagers.with_rw_transaction() as txn:
95
+ await datamanagers.rollover.update_kb_rollover_external_index_metadata(
96
+ txn, kbid=kbid, metadata=rollover_metadata
97
+ )
98
+ state.external_index_created = True
99
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
100
+ await txn.commit()
63
101
 
64
102
 
65
103
  async def create_rollover_shards(
66
104
  app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
67
105
  ) -> writer_pb2.Shards:
68
106
  """
69
- Creates shards to be used for a rollover operation.
107
+ Creates new index node shards for a rollover operation.
70
108
  If drain_nodes is provided, no replicas will be created on those nodes.
71
109
  """
110
+
72
111
  logger.info("Creating rollover shards", extra={"kbid": kbid})
73
112
  sm = app_context.shard_manager
113
+ nidx_node = get_nidx_fake_node()
74
114
 
75
- async with datamanagers.with_transaction() as txn:
76
- existing_rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
77
- txn, kbid=kbid
78
- )
79
- if existing_rollover_shards is not None:
80
- logger.info("Rollover shards already exist, skipping", extra={"kbid": kbid})
81
- return existing_rollover_shards
115
+ async with datamanagers.with_ro_transaction() as txn:
116
+ try:
117
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
118
+ except RolloverStateNotFoundError:
119
+ # State is not set yet, create it
120
+ state = RolloverState(
121
+ rollover_shards_created=False,
122
+ external_index_created=False,
123
+ resources_scheduled=False,
124
+ resources_indexed=False,
125
+ cutover_shards=False,
126
+ cutover_external_index=False,
127
+ resources_validated=False,
128
+ )
82
129
 
83
130
  kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
84
131
  if kb_shards is None:
85
132
  raise UnexpectedRolloverError(f"No shards found for KB {kbid}")
86
133
 
87
- # create new shards
88
- created_shards = []
89
- try:
90
- nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
91
- for shard in kb_shards.shards:
92
- shard.ClearField("replicas")
93
- # Attempt to create configured number of replicas
94
- replicas_created = 0
95
- while replicas_created < settings.node_replicas:
96
- if len(nodes) == 0:
97
- # could have multiple shards on single node
98
- nodes = cluster_manager.sorted_primary_nodes(
99
- ignore_nodes=drain_nodes
134
+ if state.rollover_shards_created:
135
+ logger.info("Rollover shards already created, skipping", extra={"kbid": kbid})
136
+ return kb_shards
137
+
138
+ # create new shards
139
+ created_shards = []
140
+ try:
141
+ nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
142
+ for shard in kb_shards.shards:
143
+ shard.ClearField("replicas")
144
+ # Attempt to create configured number of replicas
145
+ replicas_created = 0
146
+ while replicas_created < settings.node_replicas:
147
+ if len(nodes) == 0:
148
+ # could have multiple shards on single node
149
+ nodes = cluster_manager.sorted_primary_nodes(ignore_nodes=drain_nodes)
150
+ node_id = nodes.pop(0)
151
+
152
+ node = get_index_node(node_id)
153
+ if node is None:
154
+ logger.error(f"Node {node_id} is not found or not available")
155
+ continue
156
+
157
+ vectorsets = {
158
+ vectorset_id: vectorset_config.vectorset_index_config
159
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
160
+ txn, kbid=kbid
161
+ )
162
+ }
163
+ try:
164
+ if not vectorsets:
165
+ is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
166
+ vector_index_config = nodewriter_pb2.VectorIndexConfig(
167
+ similarity=kb_shards.similarity,
168
+ vector_type=nodewriter_pb2.VectorType.DENSE_F32,
169
+ vector_dimension=kb_shards.model.vector_dimension,
170
+ normalize_vectors=is_matryoshka,
100
171
  )
101
- node_id = nodes.pop(0)
102
-
103
- node = get_index_node(node_id)
104
- if node is None:
105
- logger.error(f"Node {node_id} is not found or not available")
106
- continue
107
- is_matryoshka = len(kb_shards.model.matryoshka_dimensions) > 0
108
- try:
109
172
  shard_created = await node.new_shard(
110
173
  kbid,
111
- similarity=kb_shards.similarity,
112
- release_channel=kb_shards.release_channel,
113
- normalize_vectors=is_matryoshka,
174
+ vector_index_config=vector_index_config,
114
175
  )
115
- except Exception as e:
116
- errors.capture_exception(e)
117
- logger.exception(f"Error creating new shard at {node}")
118
- continue
119
-
120
- replica = writer_pb2.ShardReplica(node=str(node_id))
121
- replica.shard.CopyFrom(shard_created)
122
- shard.replicas.append(replica)
123
- created_shards.append(shard)
124
- replicas_created += 1
125
- except Exception as e:
126
- errors.capture_exception(e)
127
- logger.exception("Unexpected error creating new shard")
128
- for created_shard in created_shards:
129
- await sm.rollback_shard(created_shard)
130
- raise e
131
-
132
- await datamanagers.rollover.update_kb_rollover_shards(
133
- txn, kbid=kbid, kb_shards=kb_shards
134
- )
176
+ else:
177
+ shard_created = await node.new_shard_with_vectorsets(
178
+ kbid,
179
+ vectorsets_configs=vectorsets,
180
+ )
181
+ except Exception as e:
182
+ errors.capture_exception(e)
183
+ logger.exception(f"Error creating new shard at {node}")
184
+ continue
185
+
186
+ replica = writer_pb2.ShardReplica(node=str(node_id))
187
+ replica.shard.CopyFrom(shard_created)
188
+ shard.replicas.append(replica)
189
+ created_shards.append(shard)
190
+ replicas_created += 1
191
+
192
+ if nidx_node:
193
+ nidx_shard = await nidx_node.new_shard_with_vectorsets(
194
+ kbid,
195
+ vectorsets_configs=vectorsets,
196
+ )
197
+ shard.nidx_shard_id = nidx_shard.id
198
+
199
+ except Exception as e:
200
+ errors.capture_exception(e)
201
+ logger.exception("Unexpected error creating new shard")
202
+ for created_shard in created_shards:
203
+ await sm.rollback_shard(created_shard)
204
+ raise e
205
+
206
+ async with datamanagers.with_transaction() as txn:
207
+ await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=kb_shards)
208
+ state.rollover_shards_created = True
209
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
135
210
  await txn.commit()
136
211
  return kb_shards
137
212
 
138
213
 
139
- def _get_shard(
140
- shards: writer_pb2.Shards, shard_id: str
141
- ) -> Optional[writer_pb2.ShardObject]:
214
+ def _get_shard(shards: writer_pb2.Shards, shard_id: str) -> Optional[writer_pb2.ShardObject]:
142
215
  for shard in shards.shards:
143
216
  if shard_id == shard.shard:
144
217
  return shard
145
218
  return None
146
219
 
147
220
 
148
- async def schedule_resource_indexing(
149
- app_context: ApplicationContext, kbid: str
150
- ) -> None:
221
+ async def schedule_resource_indexing(app_context: ApplicationContext, kbid: str) -> None:
151
222
  """
152
223
  Schedule indexing all data in a kb in rollover shards
153
224
  """
154
- logger.info("Indexing rollover shards", extra={"kbid": kbid})
155
-
156
- async with datamanagers.with_transaction() as txn:
157
- rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
158
- txn, kbid=kbid
159
- )
160
- if rollover_shards is None:
225
+ logger.info("Scheduling resources to be indexed to rollover shards", extra={"kbid": kbid})
226
+ async with datamanagers.with_ro_transaction() as txn:
227
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
228
+ if not state.rollover_shards_created:
161
229
  raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
162
-
163
- if _get_rollover_status(rollover_shards, RolloverStatus.RESOURCES_SCHEDULED):
164
- logger.info(
165
- "Resources already scheduled for indexing, skipping",
166
- extra={"kbid": kbid},
167
- )
168
- return
230
+ if state.resources_scheduled:
231
+ logger.info(
232
+ "Resources already scheduled for indexing, skipping",
233
+ extra={"kbid": kbid},
234
+ )
235
+ return
169
236
 
170
237
  batch = []
171
238
  async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
@@ -173,9 +240,7 @@ async def schedule_resource_indexing(
173
240
 
174
241
  if len(batch) > 100:
175
242
  async with datamanagers.with_transaction() as txn:
176
- await datamanagers.rollover.add_batch_to_index(
177
- txn, kbid=kbid, batch=batch
178
- )
243
+ await datamanagers.rollover.add_batch_to_index(txn, kbid=kbid, batch=batch)
179
244
  await txn.commit()
180
245
  batch = []
181
246
  if len(batch) > 0:
@@ -184,10 +249,8 @@ async def schedule_resource_indexing(
184
249
  await txn.commit()
185
250
 
186
251
  async with datamanagers.with_transaction() as txn:
187
- _set_rollover_status(rollover_shards, RolloverStatus.RESOURCES_SCHEDULED)
188
- await datamanagers.rollover.update_kb_rollover_shards(
189
- txn, kbid=kbid, kb_shards=rollover_shards
190
- )
252
+ state.resources_scheduled = True
253
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
191
254
  await txn.commit()
192
255
 
193
256
 
@@ -195,24 +258,27 @@ def _to_ts(dt: datetime) -> int:
195
258
  return int(dt.timestamp() * 1000 * 1000)
196
259
 
197
260
 
198
- async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> None:
261
+ async def index_to_rollover_index(
262
+ app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
263
+ ) -> None:
199
264
  """
200
- Indexes all data in a kb in rollover shards
265
+ Indexes all data in a kb in rollover indexes. This happens before the cutover.
201
266
  """
202
-
203
- async with datamanagers.with_transaction() as txn:
204
- rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
205
- txn, kbid=kbid
206
- )
207
- if rollover_shards is None:
208
- raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
209
-
210
- if _get_rollover_status(rollover_shards, RolloverStatus.RESOURCES_INDEXED):
211
- logger.info("Resources already indexed, skipping", extra={"kbid": kbid})
267
+ extra = {"kbid": kbid, "external_index_provider": None}
268
+ if external is not None:
269
+ extra["external_index_provider"] = external.type.value
270
+ async with datamanagers.with_ro_transaction() as txn:
271
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
272
+ if not all([state.rollover_shards_created, state.resources_scheduled]):
273
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
274
+ rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
275
+ if rollover_shards is None:
276
+ raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
277
+ if state.resources_indexed:
278
+ logger.info("Resources already indexed, skipping", extra=extra)
212
279
  return
213
280
 
214
- logger.info("Indexing rollover shards", extra={"kbid": kbid})
215
-
281
+ logger.info("Indexing to rollover index", extra=extra)
216
282
  wait_index_batch: list[writer_pb2.ShardObject] = []
217
283
  # now index on all new shards only
218
284
  while True:
@@ -231,9 +297,7 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
231
297
  extra={"kbid": kbid, "resource_id": resource_id},
232
298
  )
233
299
  async with datamanagers.with_transaction() as txn:
234
- await datamanagers.rollover.remove_to_index(
235
- txn, kbid=kbid, resource=resource_id
236
- )
300
+ await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
237
301
  await txn.commit()
238
302
  continue
239
303
 
@@ -246,28 +310,29 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
246
310
  raise UnexpectedRolloverError(
247
311
  f"Shard {shard_id} not found. Was a new one created during migration?"
248
312
  )
249
-
250
- resource_index_message = await index_resource_to_shard(
251
- app_context, kbid, resource_id, shard
252
- )
253
- if resource_index_message is None:
313
+ resource = await get_resource(kbid, resource_id)
314
+ index_message = await get_resource_index_message(kbid, resource_id)
315
+ if resource is None or index_message is None:
254
316
  # resource no longer existing, remove indexing and carry on
255
317
  async with datamanagers.with_transaction() as txn:
256
- await datamanagers.rollover.remove_to_index(
257
- txn, kbid=kbid, resource=resource_id
258
- )
318
+ await datamanagers.rollover.remove_to_index(txn, kbid=kbid, resource=resource_id)
259
319
  await txn.commit()
260
320
  continue
261
321
 
322
+ if external is not None:
323
+ await external.index_resource(resource_id, index_message, to_rollover_indexes=True)
324
+ else:
325
+ await index_resource_to_shard(
326
+ app_context, kbid, resource_id, shard, resource_index_message=index_message
327
+ )
328
+
262
329
  async with datamanagers.with_transaction() as txn:
263
330
  await datamanagers.rollover.add_indexed(
264
331
  txn,
265
332
  kbid=kbid,
266
333
  resource_id=resource_id,
267
334
  shard_id=shard_id,
268
- modification_time=_to_ts(
269
- resource_index_message.metadata.modified.ToDatetime()
270
- ),
335
+ modification_time=_to_ts(resource.basic.modified.ToDatetime()), # type: ignore
271
336
  )
272
337
  await txn.commit()
273
338
  wait_index_batch.append(shard)
@@ -281,11 +346,66 @@ async def index_rollover_shards(app_context: ApplicationContext, kbid: str) -> N
281
346
  await wait_for_node(app_context, node_id)
282
347
  wait_index_batch = []
283
348
 
284
- _set_rollover_status(rollover_shards, RolloverStatus.RESOURCES_INDEXED)
285
349
  async with datamanagers.with_transaction() as txn:
286
- await datamanagers.rollover.update_kb_rollover_shards(
287
- txn, kbid=kbid, kb_shards=rollover_shards
350
+ state.resources_indexed = True
351
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
352
+ await datamanagers.rollover.update_kb_rollover_shards(txn, kbid=kbid, kb_shards=rollover_shards)
353
+ await txn.commit()
354
+
355
+
356
+ async def cutover_index(
357
+ app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
358
+ ) -> None:
359
+ """
360
+ Swaps out the current active index for a knowledgebox.
361
+ """
362
+ await cutover_shards(app_context, kbid)
363
+ if external is not None:
364
+ if external.supports_rollover:
365
+ await cutover_external_index(kbid, external)
366
+ else:
367
+ logger.info(
368
+ "External index provider does not support rollover",
369
+ extra={"kbid": kbid, "external_index_provider": external.type.value},
370
+ )
371
+
372
+
373
+ async def cutover_external_index(kbid: str, external: ExternalIndexManager) -> None:
374
+ """
375
+ Cuts over to the newly created external index for a knowledgebox.
376
+ The old indexes are deleted.
377
+ """
378
+ extra = {"kbid": kbid, "external_index_provider": external.type.value}
379
+ logger.info("Cutting over external index", extra=extra)
380
+ async with datamanagers.with_rw_transaction() as txn:
381
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
382
+ if not all(
383
+ [
384
+ state.rollover_shards_created,
385
+ state.resources_scheduled,
386
+ state.resources_indexed,
387
+ ]
388
+ ):
389
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
390
+ if state.cutover_external_index:
391
+ logger.info("External index already cut over, skipping", extra=extra)
392
+ return
393
+
394
+ stored_metadata = await datamanagers.kb.get_external_index_provider_metadata(txn, kbid=kbid)
395
+ rollover_metadata = await datamanagers.rollover.get_kb_rollover_external_index_metadata(
396
+ txn, kbid=kbid
288
397
  )
398
+ if stored_metadata is None or rollover_metadata is None:
399
+ raise UnexpectedRolloverError("stored or rollover external index metadata not found")
400
+
401
+ await external.rollover_cutover_indexes()
402
+
403
+ await datamanagers.kb.set_external_index_provider_metadata(
404
+ txn, kbid=kbid, metadata=rollover_metadata
405
+ )
406
+ await datamanagers.rollover.delete_kb_rollover_external_index_metadata(txn, kbid=kbid)
407
+ state.cutover_external_index = True
408
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
289
409
  await txn.commit()
290
410
 
291
411
 
@@ -297,29 +417,40 @@ async def cutover_shards(app_context: ApplicationContext, kbid: str) -> None:
297
417
  async with datamanagers.with_transaction() as txn:
298
418
  sm = app_context.shard_manager
299
419
 
420
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
421
+ if not all(
422
+ [
423
+ state.rollover_shards_created,
424
+ state.resources_scheduled,
425
+ state.resources_indexed,
426
+ ]
427
+ ):
428
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
429
+ if state.cutover_shards:
430
+ logger.info("Shards already cut over, skipping", extra={"kbid": kbid})
431
+ return
432
+
300
433
  previously_active_shards = await datamanagers.cluster.get_kb_shards(
301
- txn, kbid=kbid
302
- )
303
- rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(
304
- txn, kbid=kbid
434
+ txn, kbid=kbid, for_update=True
305
435
  )
436
+ rollover_shards = await datamanagers.rollover.get_kb_rollover_shards(txn, kbid=kbid)
306
437
  if previously_active_shards is None or rollover_shards is None:
307
438
  raise UnexpectedRolloverError("Shards for kb not found")
308
439
 
309
- _clear_rollover_status(rollover_shards)
310
- await datamanagers.cluster.update_kb_shards(
311
- txn, kbid=kbid, shards=rollover_shards
312
- )
440
+ await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rollover_shards)
313
441
  await datamanagers.rollover.delete_kb_rollover_shards(txn, kbid=kbid)
314
442
 
315
443
  for shard in previously_active_shards.shards:
316
444
  await sm.rollback_shard(shard)
317
445
 
446
+ state.cutover_shards = True
447
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
448
+
318
449
  await txn.commit()
319
450
 
320
451
 
321
452
  async def validate_indexed_data(
322
- app_context: ApplicationContext, kbid: str
453
+ app_context: ApplicationContext, kbid: str, external: Optional[ExternalIndexManager] = None
323
454
  ) -> list[str]:
324
455
  """
325
456
  Goes through all the resources in a knowledgebox and validates it
@@ -329,21 +460,34 @@ async def validate_indexed_data(
329
460
 
330
461
  If a resource was removed during the rollover, it will be removed as well.
331
462
  """
463
+ extra = {"kbid": kbid, "external_index_provider": None}
464
+ if external is not None:
465
+ extra["external_index_provider"] = external.type.value
466
+ async with datamanagers.with_ro_transaction() as txn:
467
+ state = await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
468
+ if not all(
469
+ [
470
+ state.rollover_shards_created,
471
+ state.resources_scheduled,
472
+ state.resources_indexed,
473
+ state.cutover_shards,
474
+ ]
475
+ ):
476
+ raise UnexpectedRolloverError(f"Preconditions not met for KB {kbid}")
332
477
 
333
- async with datamanagers.with_transaction() as txn:
334
478
  rolled_over_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
335
479
  if rolled_over_shards is None:
336
480
  raise UnexpectedRolloverError(f"No rollover shards found for KB {kbid}")
337
481
 
338
- if _get_rollover_status(rolled_over_shards, RolloverStatus.RESOURCES_VALIDATED):
339
- logger.info("Resources already validated, skipping", extra={"kbid": kbid})
482
+ if state.resources_validated:
483
+ logger.info("Resources already validated, skipping", extra=extra)
340
484
  return []
341
485
 
342
- logger.info("Validating indexed data", extra={"kbid": kbid})
486
+ logger.info("Validating indexed data", extra=extra)
343
487
 
344
- repaired_resources = []
488
+ repaired_resources: list[str] = []
345
489
  async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
346
- async with datamanagers.with_transaction() as txn:
490
+ async with datamanagers.with_ro_transaction() as txn:
347
491
  indexed_data = await datamanagers.rollover.get_indexed_data(
348
492
  txn, kbid=kbid, resource_id=resource_id
349
493
  )
@@ -360,7 +504,7 @@ async def validate_indexed_data(
360
504
  if shard_id is None:
361
505
  logger.error(
362
506
  "Shard id not found for resource",
363
- extra={"kbid": kbid, "resource_id": resource_id},
507
+ extra={"resource_id": resource_id, **extra},
364
508
  )
365
509
  raise UnexpectedRolloverError("Shard id not found for resource")
366
510
  last_indexed = 0
@@ -370,23 +514,18 @@ async def validate_indexed_data(
370
514
  logger.error(
371
515
  "Shard not found for resource",
372
516
  extra={
373
- "kbid": kbid,
374
517
  "resource_id": resource_id,
375
518
  "shard_id": shard_id,
519
+ **extra,
376
520
  },
377
521
  )
378
- raise UnexpectedRolloverError(
379
- f"Shard {shard_id} not found. This should not happen"
380
- )
522
+ raise UnexpectedRolloverError(f"Shard {shard_id} not found. This should not happen")
381
523
 
382
- async with datamanagers.with_transaction() as txn:
383
- res = await datamanagers.resources.get_resource(
384
- txn, kbid=kbid, rid=resource_id
385
- )
524
+ res = await get_resource(kbid, resource_id)
386
525
  if res is None:
387
526
  logger.error(
388
527
  "Resource not found while validating, skipping",
389
- extra={"kbid": kbid, "resource_id": resource_id},
528
+ extra={"resource_id": resource_id, **extra},
390
529
  )
391
530
  continue
392
531
 
@@ -403,12 +542,26 @@ async def validate_indexed_data(
403
542
  await txn.commit()
404
543
  continue
405
544
 
545
+ index_message = await get_resource_index_message(kbid, resource_id)
546
+ if index_message is None:
547
+ logger.error(
548
+ "Resource index message not found while validating, skipping",
549
+ extra={"resource_id": resource_id, **extra},
550
+ )
551
+ continue
552
+
406
553
  # resource was modified or added during rollover, reindex
407
- resource_index_message = await index_resource_to_shard(
408
- app_context, kbid, resource_id, shard
409
- )
410
- if resource_index_message is not None:
411
- repaired_resources.append(resource_id)
554
+ if external is not None:
555
+ await external.index_resource(
556
+ resource_id,
557
+ index_message,
558
+ to_rollover_indexes=True,
559
+ )
560
+ else:
561
+ await index_resource_to_shard(
562
+ app_context, kbid, resource_id, shard, resource_index_message=index_message
563
+ )
564
+ repaired_resources.append(resource_id)
412
565
  async with datamanagers.with_transaction() as txn:
413
566
  await datamanagers.rollover.add_indexed(
414
567
  txn,
@@ -432,11 +585,10 @@ async def validate_indexed_data(
432
585
  raise UnexpectedRolloverError("Shard not found. This should not happen")
433
586
  await delete_resource_from_shard(app_context, kbid, resource_id, shard)
434
587
 
435
- _set_rollover_status(rolled_over_shards, RolloverStatus.RESOURCES_VALIDATED)
436
588
  async with datamanagers.with_transaction() as txn:
437
- await datamanagers.cluster.update_kb_shards(
438
- txn, kbid=kbid, shards=rolled_over_shards
439
- )
589
+ state.resources_validated = True
590
+ await datamanagers.rollover.set_rollover_state(txn, kbid=kbid, state=state)
591
+ await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=rolled_over_shards)
440
592
 
441
593
  return repaired_resources
442
594
 
@@ -458,69 +610,76 @@ async def clean_indexed_data(app_context: ApplicationContext, kbid: str) -> None
458
610
 
459
611
  async def clean_rollover_status(app_context: ApplicationContext, kbid: str) -> None:
460
612
  async with datamanagers.with_transaction() as txn:
461
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
462
- if kb_shards is None:
613
+ try:
614
+ await datamanagers.rollover.get_rollover_state(txn, kbid=kbid)
615
+ except RolloverStateNotFoundError:
463
616
  logger.warning(
464
- "No shards found for KB, skipping clean rollover status",
465
- extra={"kbid": kbid},
617
+ "No rollover state found, skipping clean rollover status", extra={"kbid": kbid}
466
618
  )
467
619
  return
620
+ await datamanagers.rollover.clear_rollover_state(txn, kbid=kbid)
621
+ await txn.commit()
468
622
 
469
- _clear_rollover_status(kb_shards)
470
- await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
623
+
624
+ async def wait_for_cluster_ready() -> None:
625
+ node_ready_checks = 0
626
+ while len(cluster_manager.INDEX_NODES) == 0:
627
+ if node_ready_checks > 10:
628
+ raise Exception("No index nodes available")
629
+ logger.info("Waiting for index nodes to be available")
630
+ await asyncio.sleep(1)
631
+ node_ready_checks += 1
471
632
 
472
633
 
473
- async def rollover_kb_shards(
634
+ async def rollover_kb_index(
474
635
  app_context: ApplicationContext, kbid: str, drain_nodes: Optional[list[str]] = None
475
636
  ) -> None:
476
637
  """
477
- Rollover a shard is the process of creating new shard replicas for every
478
- shard and indexing all existing resources into the replicas.
638
+ Rollover a KB index is the process of creating new shard replicas for every
639
+ shard and indexing all existing resources into the replicas. Also includes creating new external indexes if
640
+ the KB is configured to use them.
479
641
 
480
- Once all the data is in the new shards, cut over the registered replicas
481
- to the new shards and delete the old shards.
642
+ Once all the data is in the new indexes, cut over to the replicated index and delete the old one.
482
643
 
483
- If drain_nodes is provided, no replicas will be created on those nodes. This is useful
484
- for when we want to remove a set of nodes from the cluster.
644
+ If drain_nodes is provided, no index node replicas will be created on those nodes. This is useful
645
+ for when we want to remove a set of nodes from the index node cluster.
485
646
 
486
647
  This is a very expensive operation and should be done with care.
487
648
 
488
649
  Process:
489
- - Create new shards
650
+ - Create new index for kb index (index node shards or external indexes if configured)
490
651
  - Schedule all resources to be indexed
491
- - Index all resources into new shards
492
- - Cut over replicas to new shards
493
- - Validate that all resources are in the new shards
652
+ - Index all resources into new kb index (index node shards or external indexes if configured)
653
+ - Cut over replicas to new shards (and external indexes if configured)
654
+ - Validate that all resources are in the new kb index
494
655
  - Clean up indexed data
495
656
  """
496
- node_ready_checks = 0
497
- while len(cluster_manager.INDEX_NODES) == 0:
498
- if node_ready_checks > 10:
499
- raise Exception("No index nodes available")
500
- logger.info("Waiting for index nodes to be available")
501
- await asyncio.sleep(1)
502
- node_ready_checks += 1
657
+ await wait_for_cluster_ready()
503
658
 
504
- logger.info("Rolling over shards", extra={"kbid": kbid})
659
+ extra = {"kbid": kbid, "external_index_provider": None}
660
+ external = await get_external_index_manager(kbid, for_rollover=True)
661
+ if external is not None:
662
+ extra["external_index_provider"] = external.type.value
663
+ logger.info("Rolling over KB index", extra=extra)
505
664
 
506
665
  async with locking.distributed_lock(locking.KB_SHARDS_LOCK.format(kbid=kbid)):
507
- await create_rollover_shards(app_context, kbid, drain_nodes=drain_nodes)
666
+ await create_rollover_index(app_context, kbid, drain_nodes=drain_nodes, external=external)
508
667
  await schedule_resource_indexing(app_context, kbid)
509
- await index_rollover_shards(app_context, kbid)
510
- await cutover_shards(app_context, kbid)
668
+ await index_to_rollover_index(app_context, kbid, external=external)
669
+ await cutover_index(app_context, kbid, external=external)
511
670
  # we need to cut over BEFORE we validate the data
512
- await validate_indexed_data(app_context, kbid)
671
+ await validate_indexed_data(app_context, kbid, external=external)
513
672
  await clean_indexed_data(app_context, kbid)
514
673
  await clean_rollover_status(app_context, kbid)
515
674
 
516
- logger.info("Finished rolling over shards", extra={"kbid": kbid})
675
+ logger.info("Finished rolling over KB indes", extra=extra)
517
676
 
518
677
 
519
678
  async def _rollover_kbid_command(kbid: str) -> None: # pragma: no cover
520
679
  app_context = ApplicationContext()
521
680
  await app_context.initialize()
522
681
  try:
523
- await rollover_kb_shards(app_context, kbid)
682
+ await rollover_kb_index(app_context, kbid)
524
683
  finally:
525
684
  await app_context.finalize()
526
685