nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,55 +18,71 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  from datetime import datetime
21
- from typing import AsyncGenerator, AsyncIterator, Optional, Sequence
21
+ from functools import partial
22
+ from typing import Any, AsyncGenerator, Callable, Coroutine, Optional, Sequence
22
23
  from uuid import uuid4
23
24
 
24
25
  from grpc import StatusCode
25
26
  from grpc.aio import AioRpcError
26
- from nucliadb_protos.knowledgebox_pb2 import (
27
- KnowledgeBoxConfig,
28
- Labels,
29
- LabelSet,
30
- SemanticModelMetadata,
31
- )
32
- from nucliadb_protos.knowledgebox_pb2 import Synonyms as PBSynonyms
33
- from nucliadb_protos.knowledgebox_pb2 import VectorSet, VectorSets
34
- from nucliadb_protos.resources_pb2 import Basic
35
- from nucliadb_protos.utils_pb2 import ReleaseChannel
36
27
 
37
28
  from nucliadb.common import datamanagers
38
- from nucliadb.common.cluster.base import AbstractIndexNode
39
- from nucliadb.common.cluster.exceptions import ShardNotFound, ShardsNotFound
29
+ from nucliadb.common.cluster.exceptions import ShardNotFound
40
30
  from nucliadb.common.cluster.manager import get_index_node
41
31
  from nucliadb.common.cluster.utils import get_shard_manager
42
- from nucliadb.common.maindb.driver import Driver, Transaction
43
- from nucliadb.ingest import SERVICE_NAME, logger
44
- from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
45
- from nucliadb.ingest.orm.resource import (
32
+
33
+ # XXX: this keys shouldn't be exposed outside datamanagers
34
+ from nucliadb.common.datamanagers.resources import (
46
35
  KB_RESOURCE_SLUG,
47
36
  KB_RESOURCE_SLUG_BASE,
48
- Resource,
49
37
  )
50
- from nucliadb.ingest.orm.synonyms import Synonyms
51
- from nucliadb.ingest.orm.utils import compute_paragraph_key, get_basic, set_basic
38
+ from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
39
+ from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
40
+ from nucliadb.common.maindb.driver import Driver, Transaction
41
+ from nucliadb.common.maindb.pg import PGTransaction
42
+ from nucliadb.common.nidx import get_nidx_api_client
43
+ from nucliadb.ingest import SERVICE_NAME, logger
44
+ from nucliadb.ingest.orm.exceptions import (
45
+ KnowledgeBoxConflict,
46
+ KnowledgeBoxCreationError,
47
+ VectorSetConflict,
48
+ )
49
+ from nucliadb.ingest.orm.metrics import processor_observer
50
+ from nucliadb.ingest.orm.resource import Resource
51
+ from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
52
52
  from nucliadb.migrator.utils import get_latest_version
53
- from nucliadb_protos import writer_pb2
53
+ from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
54
+ from nucliadb_protos.knowledgebox_pb2 import (
55
+ CreateExternalIndexProviderMetadata,
56
+ ExternalIndexProviderType,
57
+ KnowledgeBoxConfig,
58
+ SemanticModelMetadata,
59
+ StoredExternalIndexProviderMetadata,
60
+ )
61
+ from nucliadb_protos.resources_pb2 import Basic
62
+ from nucliadb_utils.settings import is_onprem_nucliadb
54
63
  from nucliadb_utils.storages.storage import Storage
55
- from nucliadb_utils.utilities import get_audit, get_storage
64
+ from nucliadb_utils.utilities import (
65
+ get_audit,
66
+ get_storage,
67
+ )
56
68
 
57
69
  # XXX Eventually all these keys should be moved to datamanagers.kb
58
70
  KB_RESOURCE = "/kbs/{kbid}/r/{uuid}"
59
71
 
60
72
  KB_KEYS = "/kbs/{kbid}/"
61
73
 
62
- KB_VECTORSET = "/kbs/{kbid}/vectorsets"
63
-
64
74
  KB_TO_DELETE_BASE = "/kbtodelete/"
65
75
  KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
66
76
 
77
+ RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
78
+ RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
79
+
67
80
  KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
68
81
  KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
69
82
 
83
+ KB_VECTORSET_TO_DELETE_BASE = "/vectorsettodelete"
84
+ KB_VECTORSET_TO_DELETE = f"{KB_VECTORSET_TO_DELETE_BASE}/{{kbid}}/{{vectorset}}"
85
+
70
86
 
71
87
  class KnowledgeBox:
72
88
  def __init__(self, txn: Transaction, storage: Storage, kbid: str):
@@ -74,115 +90,154 @@ class KnowledgeBox:
74
90
  self.storage = storage
75
91
  self.kbid = kbid
76
92
  self._config: Optional[KnowledgeBoxConfig] = None
77
- self.synonyms = Synonyms(self.txn, self.kbid)
78
-
79
- async def get_config(self) -> Optional[KnowledgeBoxConfig]:
80
- if self._config is None:
81
- async with datamanagers.with_transaction() as txn:
82
- config = await datamanagers.kb.get_config(txn, kbid=self.kbid)
83
- if config is not None:
84
- self._config = config
85
- return config
86
- else:
87
- return None
88
- else:
89
- return self._config
90
-
91
- @classmethod
92
- async def delete_kb(cls, txn: Transaction, slug: str = "", kbid: str = ""):
93
- # Mark storage to be deleted
94
- # Mark keys to be deleted
95
- logger.info(f"Deleting KB kbid={kbid} slug={slug}")
96
- if not kbid and not slug:
97
- raise AttributeError()
98
-
99
- if slug and not kbid:
100
- kbid_bytes = await txn.get(datamanagers.kb.KB_SLUGS.format(slug=slug))
101
- if kbid_bytes is None:
102
- raise datamanagers.exceptions.KnowledgeBoxNotFound()
103
- kbid = kbid_bytes.decode()
104
-
105
- if kbid and not slug:
106
- kbconfig_bytes = await txn.get(datamanagers.kb.KB_UUID.format(kbid=kbid))
107
- if kbconfig_bytes is None:
108
- raise datamanagers.exceptions.KnowledgeBoxNotFound()
109
- pbconfig = KnowledgeBoxConfig()
110
- pbconfig.ParseFromString(kbconfig_bytes)
111
- slug = pbconfig.slug
112
-
113
- # Delete main anchor
114
- async with txn.driver.transaction() as subtxn:
115
- key_match = datamanagers.kb.KB_SLUGS.format(slug=slug)
116
- logger.info(f"Deleting KB with slug: {slug}")
117
- await subtxn.delete(key_match)
118
93
 
119
- when = datetime.now().isoformat()
120
- await subtxn.set(KB_TO_DELETE.format(kbid=kbid), when.encode())
121
- await subtxn.commit()
122
-
123
- audit_util = get_audit()
124
- if audit_util is not None:
125
- await audit_util.delete_kb(kbid)
126
- return kbid
94
+ @staticmethod
95
+ def new_unique_kbid() -> str:
96
+ return str(uuid4())
127
97
 
128
98
@classmethod
@processor_observer.wrap({"type": "create_kb"})
async def create(
    cls,
    driver: Driver,
    *,
    kbid: str,
    slug: str,
    title: str = "",
    description: str = "",
    semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
    external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
    hidden_resources_enabled: bool = False,
    hidden_resources_hide_on_creation: bool = False,
) -> tuple[str, str]:
    """Create a new knowledge box and return its id and slug.

    Inside a single maindb transaction this registers the slug, initializes
    the vectorsets key, stores one vectorset config per semantic model,
    creates external indexes when requested, stores the KB config and a
    shards object, creates the KB blob storage and creates the first shard.
    The transaction is committed only after all those steps succeed.

    Args:
        driver: maindb driver used to open the transaction.
        kbid: identifier of the new KB. Must be non-empty.
        slug: slug of the new KB. Must be non-empty.
        title: title stored in the KB config.
        description: description stored in the KB config.
        semantic_models: mapping of vectorset id to its semantic model
            metadata. At least one entry is required.
        external_index_provider: request describing the external index
            provider to set up, if any.
        hidden_resources_enabled: value stored in the KB config.
        hidden_resources_hide_on_creation: value stored in the KB config.
            Only valid together with `hidden_resources_enabled`.

    Returns:
        The `(kbid, slug)` pair of the created knowledge box.

    Raises:
        KnowledgeBoxCreationError: if an argument fails validation or the
            blob storage for the KB could not be created.
        KnowledgeBoxConflict: if the slug or the kbid already exist.
        Exception: any other error raised by a creation step is re-raised
            after the registered rollback operations have been attempted.
    """

    if not kbid:
        raise KnowledgeBoxCreationError("A kbid must be provided to create a new KB")
    if not slug:
        raise KnowledgeBoxCreationError("A slug must be provided to create a new KB")
    if hidden_resources_hide_on_creation and not hidden_resources_enabled:
        raise KnowledgeBoxCreationError(
            "Cannot hide new resources if the hidden resources feature is disabled"
        )
    if not semantic_models:
        raise KnowledgeBoxCreationError("KB must define at least one semantic model")

    # Compensating actions for side effects performed outside maindb. They
    # are executed in reverse order if any later step fails.
    rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []

    try:
        async with driver.transaction() as txn:
            exists = await datamanagers.kb.get_kb_uuid(
                txn, slug=slug
            ) or await datamanagers.kb.exists_kb(txn, kbid=kbid)
            if exists:
                raise KnowledgeBoxConflict()

            # Create in maindb
            await datamanagers.kb.set_kbid_for_slug(txn, slug=slug, kbid=kbid)

            # all KBs have the vectorset key initialized, although (for
            # now), not every KB will store vectorsets there
            await datamanagers.vectorsets.initialize(txn, kbid=kbid)

            kb_shards = writer_pb2.Shards()
            kb_shards.kbid = kbid
            # B/c with Shards.actual
            kb_shards.actual = -1

            vs_external_indexes = []
            for vectorset_id, semantic_model in semantic_models.items():  # type: ignore
                # if this KB uses a matryoshka model, we can choose a different
                # dimension
                if len(semantic_model.matryoshka_dimensions) > 0:
                    dimension = choose_matryoshka_dimension(semantic_model.matryoshka_dimensions)
                else:
                    dimension = semantic_model.vector_dimension

                vs_external_indexes.append(
                    VectorsetExternalIndex(
                        vectorset_id=vectorset_id,
                        dimension=dimension,
                        similarity=semantic_model.similarity_function,
                    )
                )

                vectorset_config = knowledgebox_pb2.VectorSetConfig(
                    vectorset_id=vectorset_id,
                    vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
                        similarity=semantic_model.similarity_function,
                        # XXX: hardcoded value
                        vector_type=nodewriter_pb2.VectorType.DENSE_F32,
                        normalize_vectors=len(semantic_model.matryoshka_dimensions) > 0,
                        vector_dimension=dimension,
                    ),
                    matryoshka_dimensions=semantic_model.matryoshka_dimensions,
                )
                await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset_config)

            stored_external_index_provider = await cls._maybe_create_external_indexes(
                kbid, request=external_index_provider, indexes=vs_external_indexes
            )
            rollback_ops.append(
                partial(
                    cls._maybe_delete_external_indexes,
                    kbid,
                    stored_external_index_provider,
                )
            )

            config = KnowledgeBoxConfig(
                title=title,
                description=description,
                slug=slug,
                migration_version=get_latest_version(),
                hidden_resources_enabled=hidden_resources_enabled,
                hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
            )
            config.external_index_provider.CopyFrom(stored_external_index_provider)
            await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
            await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)

            # shard creation will alter this value on maindb, make sure nobody
            # uses this variable anymore
            del kb_shards

            # Create in storage

            storage = await get_storage(service_name=SERVICE_NAME)

            created = await storage.create_kb(kbid)
            if not created:
                logger.error(f"KB {kbid} could not be created")
                raise KnowledgeBoxCreationError(
                    f"KB blob storage could not be created (slug={slug})"
                )
            rollback_ops.append(partial(storage.delete_kb, kbid))

            # Create shards in index nodes

            shard_manager = get_shard_manager()
            # XXX creating a shard is a slow IO operation that requires a write
            # txn to be open!
            await shard_manager.create_shard_by_kbid(txn, kbid)
            # shards don't need a rollback as they will be eventually purged

            await txn.commit()

    except Exception:
        # rollback all changes on the db and raise the exception
        for op in reversed(rollback_ops):
            try:
                await op()
            except Exception:
                # Resolve a printable name for the failed operation. Always
                # bind `name`: leaving it unset would raise inside this
                # handler, stop the remaining rollbacks and hide the
                # original error.
                target = op.func if isinstance(op, partial) else op
                name = getattr(target, "__name__", "unknown?")
                logger.exception(f"Unexpected error rolling back {name}. Keep rolling back")
        raise

    return (kbid, slug)
186
241
 
187
242
  @classmethod
188
243
  async def update(
@@ -192,7 +247,7 @@ class KnowledgeBox:
192
247
  slug: Optional[str] = None,
193
248
  config: Optional[KnowledgeBoxConfig] = None,
194
249
  ) -> str:
195
- exist = await datamanagers.kb.get_config(txn, kbid=uuid)
250
+ exist = await datamanagers.kb.get_config(txn, kbid=uuid, for_update=True)
196
251
  if not exist:
197
252
  raise datamanagers.exceptions.KnowledgeBoxNotFound()
198
253
 
@@ -209,94 +264,61 @@ class KnowledgeBox:
209
264
 
210
265
  if config and exist != config:
211
266
  exist.MergeFrom(config)
267
+ exist.hidden_resources_enabled = config.hidden_resources_enabled
268
+ exist.hidden_resources_hide_on_creation = config.hidden_resources_hide_on_creation
212
269
 
213
- await txn.set(
214
- datamanagers.kb.KB_UUID.format(kbid=uuid),
215
- exist.SerializeToString(),
216
- )
270
+ if exist.hidden_resources_hide_on_creation and not exist.hidden_resources_enabled:
271
+ raise KnowledgeBoxCreationError(
272
+ "Cannot hide new resources if the hidden resources feature is disabled"
273
+ )
274
+
275
+ await datamanagers.kb.set_config(txn, kbid=uuid, config=exist)
217
276
 
218
277
  return uuid
219
278
 
220
- async def iterate_kb_nodes(self) -> AsyncIterator[tuple[AbstractIndexNode, str]]:
221
- async with datamanagers.with_transaction() as txn:
222
- shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=self.kbid)
223
- if shards_obj is None:
224
- raise ShardsNotFound(self.kbid)
225
-
226
- for shard in shards_obj.shards:
227
- for replica in shard.replicas:
228
- node = get_index_node(replica.node)
229
- if node is not None:
230
- yield node, replica.shard.id
231
-
232
- # Vectorset
233
- async def get_vectorsets(self, response: writer_pb2.GetVectorSetsResponse):
234
- vectorset_key = KB_VECTORSET.format(kbid=self.kbid)
235
- payload = await self.txn.get(vectorset_key)
236
- if payload is not None:
237
- response.vectorsets.ParseFromString(payload)
238
-
239
- async def del_vectorset(self, id: str):
240
- vectorset_key = KB_VECTORSET.format(kbid=self.kbid)
241
- payload = await self.txn.get(vectorset_key)
242
- vts = VectorSets()
243
- if payload is not None:
244
- vts.ParseFromString(payload)
245
- del vts.vectorsets[id]
246
- # For each Node on the KB delete the vectorset
247
- async for node, shard in self.iterate_kb_nodes():
248
- await node.del_vectorset(shard, id)
249
- payload = vts.SerializeToString()
250
- await self.txn.set(vectorset_key, payload)
251
-
252
- async def set_vectorset(self, id: str, vs: VectorSet):
253
- vectorset_key = KB_VECTORSET.format(kbid=self.kbid)
254
- payload = await self.txn.get(vectorset_key)
255
- vts = VectorSets()
256
- if payload is not None:
257
- vts.ParseFromString(payload)
258
- vts.vectorsets[id].CopyFrom(vs)
259
- # For each Node on the KB add the vectorset
260
- async for node, shard in self.iterate_kb_nodes():
261
- await node.set_vectorset(shard, id, similarity=vs.similarity)
262
- payload = vts.SerializeToString()
263
- await self.txn.set(vectorset_key, payload)
264
-
265
- # Labels
266
- async def set_labelset(self, id: str, labelset: LabelSet):
267
- await datamanagers.labels.set_labelset(
268
- self.txn, kbid=self.kbid, labelset_id=id, labelset=labelset
269
- )
279
@classmethod
async def delete(cls, driver: Driver, kbid: str):
    """Delete a knowledge box and schedule its data for purge.

    In one maindb transaction the KB slug and config are deleted and a
    `KB_TO_DELETE` marker holding the current ISO timestamp is written.
    After the commit, the nidx shards and the external indexes of the KB
    are deleted and the deletion is reported to the audit utility.

    Args:
        driver: maindb driver used to open the transaction.
        kbid: identifier of the knowledge box to delete.

    Returns:
        `kbid` when the KB existed, `None` when it did not (nothing is done
        in that case).
    """
    async with driver.transaction() as txn:
        if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
            return

        # Delete main anchor
        kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
        if kb_config is not None:
            await datamanagers.kb.delete_kb_slug(txn, slug=kb_config.slug)

        await datamanagers.kb.delete_config(txn, kbid=kbid)

        # Mark KB to purge. This will eventually delete all KB keys, storage
        # and index data (for the old index nodes)
        marked_at = datetime.now().isoformat().encode()
        await txn.set(KB_TO_DELETE.format(kbid=kbid), marked_at)

        kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)

        await txn.commit()

    if kb_shards is None:
        logger.warning("Shards not found for KB while deleting it", extra={"kbid": kbid})
    elif (nidx := get_nidx_api_client()) is not None:
        # Delete shards from nidx. They'll be marked for eventual deletion,
        # so this call shouldn't be costly
        for shard in kb_shards.shards:
            if not shard.nidx_shard_id:
                continue
            await nidx.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))

    if kb_config is not None:
        await cls._maybe_delete_external_indexes(kbid, kb_config.external_index_provider)

    audit_util = get_audit()
    if audit_util is not None:
        # NOTE(review): this call is not awaited; confirm `delete_kb` is synchronous.
        audit_util.delete_kb(kbid=kbid)

    return kbid
300
322
 
301
323
  @classmethod
302
324
  async def purge(cls, driver: Driver, kbid: str):
@@ -307,6 +329,8 @@ class KnowledgeBox:
307
329
  need to delete the kb shards and also deletes the related storage
308
330
  buckets.
309
331
 
332
+ Removes all catalog entries related to the kb.
333
+
310
334
  As non-empty buckets cannot be deleted, they are scheduled to be
311
335
  deleted instead. Actually, this empties the bucket asynchronouysly
312
336
  but it doesn't delete it. To do it, we save a marker using the
@@ -322,16 +346,13 @@ class KnowledgeBox:
322
346
  storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
323
347
  await txn.set(storage_to_delete, b"")
324
348
 
325
- # Delete KB Shards
326
- shards_match = datamanagers.cluster.KB_SHARDS.format(kbid=kbid)
327
- payload = await txn.get(shards_match)
349
+ await catalog_delete_kb(txn, kbid)
328
350
 
329
- if payload is None:
330
- logger.warning(f"Shards not found for kbid={kbid}")
351
+ # Delete KB Shards
352
+ shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
353
+ if shards_obj is None:
354
+ logger.warning(f"Shards not found for KB while purging it", extra={"kbid": kbid})
331
355
  else:
332
- shards_obj = writer_pb2.Shards()
333
- shards_obj.ParseFromString(payload) # type: ignore
334
-
335
356
  for shard in shards_obj.shards:
336
357
  # Delete the shard on nodes
337
358
  for replica in shard.replicas:
@@ -357,29 +378,14 @@ class KnowledgeBox:
357
378
  await cls.delete_all_kb_keys(driver, kbid)
358
379
 
359
380
@classmethod
async def delete_all_kb_keys(cls, driver: Driver, kbid: str, chunk_size: int = 1_000):
    """Delete every maindb key stored under the prefix of a knowledge box.

    Args:
        driver: maindb driver used to open the transaction.
        kbid: identifier of the knowledge box whose keys are deleted.
        chunk_size: not used by this implementation.
    """
    kb_prefix = KB_KEYS.format(kbid=kbid)
    async with driver.transaction() as txn:
        # A single prefix deletion, committed in the same transaction.
        await txn.delete_by_prefix(kb_prefix)
        await txn.commit()
386
+
387
+ async def get_resource_shard(self, shard_id: str) -> Optional[writer_pb2.ShardObject]:
388
+ async with datamanagers.with_ro_transaction() as txn:
383
389
  pb = await datamanagers.cluster.get_kb_shards(txn, kbid=self.kbid)
384
390
  if pb is None:
385
391
  logger.warning("Shards not found for kbid", extra={"kbid": self.kbid})
@@ -390,52 +396,55 @@ class KnowledgeBox:
390
396
  return None
391
397
 
392
398
async def get(self, uuid: str) -> Optional[Resource]:
    """Load a resource of this knowledge box by its id.

    Args:
        uuid: resource identifier.

    Returns:
        A `Resource` bound to this KB's transaction and storage, or `None`
        when no basic metadata is stored for `uuid`.
    """
    stored_basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
    if stored_basic is not None:
        return Resource(
            txn=self.txn,
            storage=self.storage,
            kb=self,
            uuid=uuid,
            basic=stored_basic,
            disable_vectors=False,
        )
    return None
405
410
 
406
- async def delete_resource(self, uuid: str):
407
- raw_basic = await get_basic(self.txn, self.kbid, uuid)
408
- if raw_basic:
409
- basic = Resource.parse_basic(raw_basic)
410
- else:
411
- basic = None
412
-
413
- async for key in self.txn.keys(
414
- KB_RESOURCE.format(kbid=self.kbid, uuid=uuid), count=-1
415
- ):
416
- await self.txn.delete(key)
417
-
411
async def maindb_delete_resource(self, uuid: str):
    """Delete the maindb keys of a resource, including its slug key.

    Args:
        uuid: identifier of the resource to delete.
    """
    # Read the basic metadata first: the slug is needed after the resource
    # keys have been removed.
    basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
    resource_prefix = KB_RESOURCE.format(kbid=self.kbid, uuid=uuid)
    await self.txn.delete_by_prefix(resource_prefix)
    if not (basic and basic.slug):
        return
    try:
        await self.txn.delete(KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=basic.slug))
    except Exception:
        # Best effort: a failure here is logged and not propagated.
        logger.exception("Error deleting slug")
419
+
420
async def storage_delete_resource(self, uuid: str):
    """Delete the storage data of a resource, or schedule its deletion.

    Args:
        uuid: identifier of the resource whose storage data is removed.
    """
    if not is_onprem_nucliadb():
        # Deleting from storage can be slow, so we schedule its deletion and the purge cronjob
        # will take care of it
        await self.schedule_delete_resource(self.kbid, uuid)
        return
    await self.storage.delete_resource(self.kbid, uuid)
427
+
428
async def schedule_delete_resource(self, kbid: str, uuid: str):
    """Write the marker that schedules a resource's storage for deletion.

    Args:
        kbid: knowledge box identifier used in the marker key.
        uuid: resource identifier used in the marker key.
    """
    # The marker carries no payload; only the key is meaningful.
    await self.txn.set(RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid), b"")
424
431
 
425
- await self.storage.delete_resource(self.kbid, uuid)
432
async def delete_resource(self, uuid: str):
    """Delete a resource from maindb and then from storage.

    Each step runs inside its own `processor_observer` context.

    Args:
        uuid: identifier of the resource to delete.
    """
    steps = (
        ("delete_resource_maindb", self.maindb_delete_resource),
        ("delete_resource_storage", self.storage_delete_resource),
    )
    for metric_type, step in steps:
        with processor_observer({"type": metric_type}):
            await step(uuid)
426
437
 
427
438
async def get_resource_uuid_by_slug(self, slug: str) -> Optional[str]:
    """Look up the id of the resource that owns `slug` in this knowledge box.

    Args:
        slug: resource slug to resolve.

    Returns:
        Whatever `datamanagers.resources.get_resource_uuid_from_slug`
        returns for this KB and slug.
    """
    resource_id = await datamanagers.resources.get_resource_uuid_from_slug(
        self.txn,
        kbid=self.kbid,
        slug=slug,
    )
    return resource_id
433
442
 
434
443
  async def get_unique_slug(self, uuid: str, slug: str) -> str:
435
444
  key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
436
445
  key_ok = False
437
446
  while key_ok is False:
438
- found = await self.txn.get(key)
447
+ found = await self.txn.get(key, for_update=False)
439
448
  if found and found.decode() != uuid:
440
449
  slug += ".c"
441
450
  key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
@@ -443,17 +452,7 @@ class KnowledgeBox:
443
452
  key_ok = True
444
453
  return slug
445
454
 
446
- @classmethod
447
- async def resource_slug_exists(
448
- self, txn: Transaction, kbid: str, slug: str
449
- ) -> bool:
450
- key = KB_RESOURCE_SLUG.format(kbid=kbid, slug=slug)
451
- encoded_slug: Optional[bytes] = await txn.get(key)
452
- return encoded_slug not in (None, b"")
453
-
454
- async def add_resource(
455
- self, uuid: str, slug: str, basic: Optional[Basic] = None
456
- ) -> Resource:
455
+ async def add_resource(self, uuid: str, slug: str, basic: Optional[Basic] = None) -> Resource:
457
456
  if basic is None:
458
457
  basic = Basic()
459
458
  if slug == "":
@@ -461,7 +460,7 @@ class KnowledgeBox:
461
460
  slug = await self.get_unique_slug(uuid, slug)
462
461
  basic.slug = slug
463
462
  fix_paragraph_annotation_keys(uuid, basic)
464
- await set_basic(self.txn, self.kbid, uuid, basic)
463
+ await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=uuid, basic=basic)
465
464
  return Resource(
466
465
  storage=self.storage,
467
466
  txn=self.txn,
@@ -473,7 +472,7 @@ class KnowledgeBox:
473
472
 
474
473
  async def iterate_resources(self) -> AsyncGenerator[Resource, None]:
475
474
  base = KB_RESOURCE_SLUG_BASE.format(kbid=self.kbid)
476
- async for key in self.txn.keys(match=base, count=-1):
475
+ async for key in self.txn.keys(match=base):
477
476
  slug = key.split("/")[-1]
478
477
  uuid = await self.get_resource_uuid_by_slug(slug)
479
478
  if uuid is not None:
@@ -485,6 +484,55 @@ class KnowledgeBox:
485
484
  disable_vectors=False,
486
485
  )
487
486
 
487
async def create_vectorset(self, config: knowledgebox_pb2.VectorSetConfig):
    """Store a new vectorset config and create the vectorset in the index.

    Args:
        config: configuration of the vectorset to create.

    Raises:
        VectorSetConflict: if a vectorset with the same id already exists.
    """
    vectorset_id = config.vectorset_id
    already_there = await datamanagers.vectorsets.exists(
        self.txn, kbid=self.kbid, vectorset_id=vectorset_id
    )
    if already_there:
        raise VectorSetConflict(f"Vectorset {vectorset_id} already exists")
    await datamanagers.vectorsets.set(self.txn, kbid=self.kbid, config=config)

    # Remove the async deletion mark if it exists, just in case there was a previous deletion
    mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
    if await self.txn.get(mark_key, for_update=True) is not None:
        await self.txn.delete(mark_key)

    await get_shard_manager().create_vectorset(self.kbid, config)
502
+
503
async def delete_vectorset(self, vectorset_id: str):
    """Delete a vectorset config and mark the vectorset for async deletion.

    Args:
        vectorset_id: identifier of the vectorset to delete.
    """
    await datamanagers.vectorsets.delete(self.txn, kbid=self.kbid, vectorset_id=vectorset_id)

    # mark vectorset for async deletion
    mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
    await self.txn.set(mark_key, b"")

    await get_shard_manager().delete_vectorset(self.kbid, vectorset_id)
512
+
513
@classmethod
async def _maybe_create_external_indexes(
    cls,
    kbid: str,
    request: CreateExternalIndexProviderMetadata,
    indexes: list[VectorsetExternalIndex],
) -> StoredExternalIndexProviderMetadata:
    """Create external indexes when the request asks for Pinecone.

    Args:
        kbid: identifier of the knowledge box.
        request: external index provider creation request.
        indexes: one entry per vectorset to index externally.

    Returns:
        The metadata returned by `PineconeIndexManager.create_indexes` for
        Pinecone requests; otherwise a metadata object that only carries
        the requested provider type.
    """
    if request.type == ExternalIndexProviderType.PINECONE:
        # Only pinecone is supported for now
        return await PineconeIndexManager.create_indexes(kbid, request, indexes)
    return StoredExternalIndexProviderMetadata(type=request.type)
524
+
525
@classmethod
async def _maybe_delete_external_indexes(
    cls,
    kbid: str,
    stored: StoredExternalIndexProviderMetadata,
) -> None:
    """Delete the external indexes of a KB when its provider is Pinecone.

    Args:
        kbid: identifier of the knowledge box.
        stored: external index provider metadata stored for the KB.
    """
    # Only pinecone is supported for now
    if stored.type == ExternalIndexProviderType.PINECONE:
        await PineconeIndexManager.delete_indexes(kbid, stored)
535
+
488
536
 
489
537
def chunker(seq: Sequence, size: int):
    """Split `seq` into consecutive slices of at most `size` items.

    Args:
        seq: sequence to split; must support `len()` and slicing.
        size: maximum length of each slice.

    Returns:
        A generator yielding the slices in order. The last slice may be
        shorter than `size`.
    """
    # The start offsets are computed eagerly, the slices lazily.
    starts = range(0, len(seq), size)
    return (seq[start : start + size] for start in starts)
@@ -498,3 +546,11 @@ def fix_paragraph_annotation_keys(uuid: str, basic: Basic) -> None:
498
546
  for paragraph_annotation in ufm.paragraphs:
499
547
  key = compute_paragraph_key(uuid, paragraph_annotation.key)
500
548
  paragraph_annotation.key = key
549
+
550
+
551
@processor_observer.wrap({"type": "catalog_delete_kb"})
async def catalog_delete_kb(txn: Transaction, kbid: str):
    """Delete the catalog rows of a knowledge box.

    Does nothing unless `txn` is a `PGTransaction`.

    Args:
        txn: open maindb transaction.
        kbid: identifier of the knowledge box whose rows are deleted.
    """
    if isinstance(txn, PGTransaction):
        query_params = {"kbid": kbid}
        async with txn.connection.cursor() as cur:
            await cur.execute("DELETE FROM catalog where kbid = %(kbid)s", query_params)