nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,37 +18,53 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  from datetime import datetime
21
- from typing import AsyncGenerator, Optional, Sequence
21
+ from functools import partial
22
+ from typing import Any, AsyncGenerator, Callable, Coroutine, Optional, Sequence
22
23
  from uuid import uuid4
23
24
 
24
25
  from grpc import StatusCode
25
26
  from grpc.aio import AioRpcError
26
- from nucliadb_protos.knowledgebox_pb2 import (
27
- KnowledgeBoxConfig,
28
- Labels,
29
- LabelSet,
30
- SemanticModelMetadata,
31
- )
32
- from nucliadb_protos.resources_pb2 import Basic
33
- from nucliadb_protos.utils_pb2 import ReleaseChannel
34
27
 
35
28
  from nucliadb.common import datamanagers
36
29
  from nucliadb.common.cluster.exceptions import ShardNotFound
37
30
  from nucliadb.common.cluster.manager import get_index_node
38
31
  from nucliadb.common.cluster.utils import get_shard_manager
39
- from nucliadb.common.maindb.driver import Driver, Transaction
40
- from nucliadb.ingest import SERVICE_NAME, logger
41
- from nucliadb.ingest.orm.exceptions import KnowledgeBoxConflict
42
- from nucliadb.ingest.orm.resource import (
32
+
33
+ # XXX: these keys shouldn't be exposed outside datamanagers
34
+ from nucliadb.common.datamanagers.resources import (
43
35
  KB_RESOURCE_SLUG,
44
36
  KB_RESOURCE_SLUG_BASE,
45
- Resource,
46
37
  )
38
+ from nucliadb.common.external_index_providers.base import VectorsetExternalIndex
39
+ from nucliadb.common.external_index_providers.pinecone import PineconeIndexManager
40
+ from nucliadb.common.maindb.driver import Driver, Transaction
41
+ from nucliadb.common.maindb.pg import PGTransaction
42
+ from nucliadb.common.nidx import get_nidx_api_client
43
+ from nucliadb.ingest import SERVICE_NAME, logger
44
+ from nucliadb.ingest.orm.exceptions import (
45
+ KnowledgeBoxConflict,
46
+ KnowledgeBoxCreationError,
47
+ VectorSetConflict,
48
+ )
49
+ from nucliadb.ingest.orm.metrics import processor_observer
50
+ from nucliadb.ingest.orm.resource import Resource
47
51
  from nucliadb.ingest.orm.utils import choose_matryoshka_dimension, compute_paragraph_key
48
52
  from nucliadb.migrator.utils import get_latest_version
49
- from nucliadb_protos import writer_pb2
53
+ from nucliadb_protos import knowledgebox_pb2, noderesources_pb2, nodewriter_pb2, writer_pb2
54
+ from nucliadb_protos.knowledgebox_pb2 import (
55
+ CreateExternalIndexProviderMetadata,
56
+ ExternalIndexProviderType,
57
+ KnowledgeBoxConfig,
58
+ SemanticModelMetadata,
59
+ StoredExternalIndexProviderMetadata,
60
+ )
61
+ from nucliadb_protos.resources_pb2 import Basic
62
+ from nucliadb_utils.settings import is_onprem_nucliadb
50
63
  from nucliadb_utils.storages.storage import Storage
51
- from nucliadb_utils.utilities import get_audit, get_storage
64
+ from nucliadb_utils.utilities import (
65
+ get_audit,
66
+ get_storage,
67
+ )
52
68
 
53
69
  # XXX Eventually all these keys should be moved to datamanagers.kb
54
70
  KB_RESOURCE = "/kbs/{kbid}/r/{uuid}"
@@ -58,9 +74,15 @@ KB_KEYS = "/kbs/{kbid}/"
58
74
  KB_TO_DELETE_BASE = "/kbtodelete/"
59
75
  KB_TO_DELETE_STORAGE_BASE = "/storagetodelete/"
60
76
 
77
+ RESOURCE_TO_DELETE_STORAGE_BASE = "/resourcestoragetodelete"
78
+ RESOURCE_TO_DELETE_STORAGE = f"{RESOURCE_TO_DELETE_STORAGE_BASE}/{{kbid}}/{{uuid}}"
79
+
61
80
  KB_TO_DELETE = f"{KB_TO_DELETE_BASE}{{kbid}}"
62
81
  KB_TO_DELETE_STORAGE = f"{KB_TO_DELETE_STORAGE_BASE}{{kbid}}"
63
82
 
83
+ KB_VECTORSET_TO_DELETE_BASE = "/vectorsettodelete"
84
+ KB_VECTORSET_TO_DELETE = f"{KB_VECTORSET_TO_DELETE_BASE}/{{kbid}}/{{vectorset}}"
85
+
64
86
 
65
87
  class KnowledgeBox:
66
88
  def __init__(self, txn: Transaction, storage: Storage, kbid: str):
@@ -69,119 +91,153 @@ class KnowledgeBox:
69
91
  self.kbid = kbid
70
92
  self._config: Optional[KnowledgeBoxConfig] = None
71
93
 
72
- async def get_config(self) -> Optional[KnowledgeBoxConfig]:
73
- if self._config is None:
74
- async with datamanagers.with_transaction() as txn:
75
- config = await datamanagers.kb.get_config(txn, kbid=self.kbid)
76
- if config is not None:
77
- self._config = config
78
- return config
79
- else:
80
- return None
81
- else:
82
- return self._config
83
-
84
- @classmethod
85
- async def delete_kb(cls, txn: Transaction, kbid: str):
86
- # Mark storage to be deleted
87
- # Mark keys to be deleted
88
- kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
89
- if kb_config is None:
90
- # consider KB as deleted
91
- return
92
- slug = kb_config.slug
93
-
94
- # Delete main anchor
95
- async with txn.driver.transaction() as subtxn:
96
- key_match = datamanagers.kb.KB_SLUGS.format(slug=slug)
97
- await subtxn.delete(key_match)
98
-
99
- when = datetime.now().isoformat()
100
- await subtxn.set(KB_TO_DELETE.format(kbid=kbid), when.encode())
101
- await subtxn.commit()
102
-
103
- audit_util = get_audit()
104
- if audit_util is not None:
105
- await audit_util.delete_kb(kbid)
106
- return kbid
94
+ @staticmethod
95
+ def new_unique_kbid() -> str:
96
+ return str(uuid4())
107
97
 
108
98
  @classmethod
99
+ @processor_observer.wrap({"type": "create_kb"})
109
100
  async def create(
110
101
  cls,
111
- txn: Transaction,
102
+ driver: Driver,
103
+ *,
104
+ kbid: str,
112
105
  slug: str,
113
- semantic_model: SemanticModelMetadata,
114
- uuid: Optional[str] = None,
115
- config: Optional[KnowledgeBoxConfig] = None,
116
- release_channel: ReleaseChannel.ValueType = ReleaseChannel.STABLE,
117
- ) -> tuple[str, bool]:
118
- failed = False
119
- exist = await datamanagers.kb.get_kb_uuid(txn, slug=slug)
120
- if exist:
121
- raise KnowledgeBoxConflict()
122
- if uuid is None or uuid == "":
123
- uuid = str(uuid4())
124
-
125
- if slug == "":
126
- slug = uuid
127
-
128
- await txn.set(
129
- datamanagers.kb.KB_SLUGS.format(slug=slug),
130
- uuid.encode(),
131
- )
132
- if config is None:
133
- config = KnowledgeBoxConfig()
134
-
135
- config.migration_version = get_latest_version()
136
- config.slug = slug
137
- await txn.set(
138
- datamanagers.kb.KB_UUID.format(kbid=uuid),
139
- config.SerializeToString(),
140
- )
141
- # Create Storage
142
- storage = await get_storage(service_name=SERVICE_NAME)
143
-
144
- created = await storage.create_kb(uuid)
145
- if created is False:
146
- logger.error(f"{uuid} KB could not be created")
147
- failed = True
148
-
149
- if failed is False:
150
- kb_shards = writer_pb2.Shards()
151
- kb_shards.kbid = uuid
152
- # B/c with Shards.actual
153
- kb_shards.actual = -1
154
- # B/c with `Shards.similarity`, replaced by `model`
155
- kb_shards.similarity = semantic_model.similarity_function
156
-
157
- # if this KB uses a matryoshka model, we can choose a different
158
- # dimension
159
- if len(semantic_model.matryoshka_dimensions) > 0:
160
- semantic_model.vector_dimension = choose_matryoshka_dimension(
161
- semantic_model.matryoshka_dimensions # type: ignore
162
- )
163
- kb_shards.model.CopyFrom(semantic_model)
164
-
165
- kb_shards.release_channel = release_channel
166
-
167
- await datamanagers.cluster.update_kb_shards(
168
- txn, kbid=uuid, shards=kb_shards
106
+ title: str = "",
107
+ description: str = "",
108
+ semantic_models: Optional[dict[str, SemanticModelMetadata]] = None,
109
+ external_index_provider: CreateExternalIndexProviderMetadata = CreateExternalIndexProviderMetadata(),
110
+ hidden_resources_enabled: bool = False,
111
+ hidden_resources_hide_on_creation: bool = False,
112
+ ) -> tuple[str, str]:
113
+ """Creates a new knowledge box and returns its id and slug."""
114
+
115
+ if not kbid:
116
+ raise KnowledgeBoxCreationError("A kbid must be provided to create a new KB")
117
+ if not slug:
118
+ raise KnowledgeBoxCreationError("A slug must be provided to create a new KB")
119
+ if hidden_resources_hide_on_creation and not hidden_resources_enabled:
120
+ raise KnowledgeBoxCreationError(
121
+ "Cannot hide new resources if the hidden resources feature is disabled"
169
122
  )
123
+ if semantic_models is None or len(semantic_models) == 0:
124
+ raise KnowledgeBoxCreationError("KB must define at least one semantic model")
170
125
 
171
- # shard creation will alter this value on maindb, make sure nobody
172
- # uses this variable anymore
173
- del kb_shards
174
- shard_manager = get_shard_manager()
175
- try:
176
- await shard_manager.create_shard_by_kbid(txn, uuid)
177
- except Exception as e:
178
- await storage.delete_kb(uuid)
179
- raise e
126
+ rollback_ops: list[Callable[[], Coroutine[Any, Any, Any]]] = []
180
127
 
181
- if failed:
182
- await storage.delete_kb(uuid)
128
+ try:
129
+ async with driver.transaction() as txn:
130
+ exists = await datamanagers.kb.get_kb_uuid(
131
+ txn, slug=slug
132
+ ) or await datamanagers.kb.exists_kb(txn, kbid=kbid)
133
+ if exists:
134
+ raise KnowledgeBoxConflict()
135
+
136
+ # Create in maindb
137
+ await datamanagers.kb.set_kbid_for_slug(txn, slug=slug, kbid=kbid)
138
+
139
+ # all KBs have the vectorset key initialized, although (for
140
+ # now), not every KB will store vectorsets there
141
+ await datamanagers.vectorsets.initialize(txn, kbid=kbid)
142
+
143
+ kb_shards = writer_pb2.Shards()
144
+ kb_shards.kbid = kbid
145
+ # B/c with Shards.actual
146
+ kb_shards.actual = -1
147
+
148
+ vs_external_indexes = []
149
+ for vectorset_id, semantic_model in semantic_models.items(): # type: ignore
150
+ # if this KB uses a matryoshka model, we can choose a different
151
+ # dimension
152
+ if len(semantic_model.matryoshka_dimensions) > 0:
153
+ dimension = choose_matryoshka_dimension(semantic_model.matryoshka_dimensions)
154
+ else:
155
+ dimension = semantic_model.vector_dimension
156
+
157
+ vs_external_indexes.append(
158
+ VectorsetExternalIndex(
159
+ vectorset_id=vectorset_id,
160
+ dimension=dimension,
161
+ similarity=semantic_model.similarity_function,
162
+ )
163
+ )
164
+
165
+ vectorset_config = knowledgebox_pb2.VectorSetConfig(
166
+ vectorset_id=vectorset_id,
167
+ vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
168
+ similarity=semantic_model.similarity_function,
169
+ # XXX: hardcoded value
170
+ vector_type=nodewriter_pb2.VectorType.DENSE_F32,
171
+ normalize_vectors=len(semantic_model.matryoshka_dimensions) > 0,
172
+ vector_dimension=dimension,
173
+ ),
174
+ matryoshka_dimensions=semantic_model.matryoshka_dimensions,
175
+ )
176
+ await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset_config)
177
+
178
+ stored_external_index_provider = await cls._maybe_create_external_indexes(
179
+ kbid, request=external_index_provider, indexes=vs_external_indexes
180
+ )
181
+ rollback_ops.append(
182
+ partial(
183
+ cls._maybe_delete_external_indexes,
184
+ kbid,
185
+ stored_external_index_provider,
186
+ )
187
+ )
183
188
 
184
- return uuid, failed
189
+ config = KnowledgeBoxConfig(
190
+ title=title,
191
+ description=description,
192
+ slug=slug,
193
+ migration_version=get_latest_version(),
194
+ hidden_resources_enabled=hidden_resources_enabled,
195
+ hidden_resources_hide_on_creation=hidden_resources_hide_on_creation,
196
+ )
197
+ config.external_index_provider.CopyFrom(stored_external_index_provider)
198
+ await datamanagers.kb.set_config(txn, kbid=kbid, config=config)
199
+ await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=kb_shards)
200
+
201
+ # shard creation will alter this value on maindb, make sure nobody
202
+ # uses this variable anymore
203
+ del kb_shards
204
+
205
+ # Create in storage
206
+
207
+ storage = await get_storage(service_name=SERVICE_NAME)
208
+
209
+ created = await storage.create_kb(kbid)
210
+ if not created:
211
+ logger.error(f"KB {kbid} could not be created")
212
+ raise KnowledgeBoxCreationError(
213
+ f"KB blob storage could not be created (slug={slug})"
214
+ )
215
+ rollback_ops.append(partial(storage.delete_kb, kbid))
216
+
217
+ # Create shards in index nodes
218
+
219
+ shard_manager = get_shard_manager()
220
+ # XXX creating a shard is a slow IO operation that requires a write
221
+ # txn to be open!
222
+ await shard_manager.create_shard_by_kbid(txn, kbid)
223
+ # shards don't need a rollback as they will be eventually purged
224
+
225
+ await txn.commit()
226
+
227
+ except Exception as exc:
228
+ # rollback all changes on the db and raise the exception
229
+ for op in reversed(rollback_ops):
230
+ try:
231
+ await op()
232
+ except Exception:
233
+ if isinstance(op, partial):
234
+ name: str = op.func.__name__
235
+ else:
236
+ getattr(op, "__name__", "unknown?")
237
+ logger.exception(f"Unexpected error rolling back {name}. Keep rolling back")
238
+ raise exc
239
+
240
+ return (kbid, slug)
185
241
 
186
242
  @classmethod
187
243
  async def update(
@@ -191,7 +247,7 @@ class KnowledgeBox:
191
247
  slug: Optional[str] = None,
192
248
  config: Optional[KnowledgeBoxConfig] = None,
193
249
  ) -> str:
194
- exist = await datamanagers.kb.get_config(txn, kbid=uuid)
250
+ exist = await datamanagers.kb.get_config(txn, kbid=uuid, for_update=True)
195
251
  if not exist:
196
252
  raise datamanagers.exceptions.KnowledgeBoxNotFound()
197
253
 
@@ -208,35 +264,61 @@ class KnowledgeBox:
208
264
 
209
265
  if config and exist != config:
210
266
  exist.MergeFrom(config)
267
+ exist.hidden_resources_enabled = config.hidden_resources_enabled
268
+ exist.hidden_resources_hide_on_creation = config.hidden_resources_hide_on_creation
269
+
270
+ if exist.hidden_resources_hide_on_creation and not exist.hidden_resources_enabled:
271
+ raise KnowledgeBoxCreationError(
272
+ "Cannot hide new resources if the hidden resources feature is disabled"
273
+ )
211
274
 
212
275
  await datamanagers.kb.set_config(txn, kbid=uuid, config=exist)
213
276
 
214
277
  return uuid
215
278
 
216
- # Labels
217
- async def set_labelset(self, id: str, labelset: LabelSet):
218
- await datamanagers.labels.set_labelset(
219
- self.txn, kbid=self.kbid, labelset_id=id, labelset=labelset
220
- )
279
@classmethod
async def delete(cls, driver: Driver, kbid: str) -> Optional[str]:
    """Delete a knowledge box and mark it to be purged later.

    Inside a single transaction this removes the KB slug (when a config is
    stored) and the KB config, writes the ``KB_TO_DELETE`` marker holding the
    current timestamp and reads the KB shards. Once the transaction is
    committed, shards that carry a ``nidx_shard_id`` are deleted through the
    nidx API client (when one is available), external indexes are deleted if
    applicable and the deletion is reported to the audit, if any.

    Args:
        driver: driver used to open the transaction.
        kbid: id of the knowledge box to delete.

    Returns:
        ``kbid`` after the deletion, or ``None`` when the KB does not exist.
    """
    async with driver.transaction() as txn:
        exists = await datamanagers.kb.exists_kb(txn, kbid=kbid)
        if not exists:
            return None

        # Delete main anchor
        kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
        if kb_config is not None:
            slug = kb_config.slug
            await datamanagers.kb.delete_kb_slug(txn, slug=slug)

        await datamanagers.kb.delete_config(txn, kbid=kbid)

        # Mark KB to purge. This will eventually delete all KB keys, storage
        # and index data (for the old index nodes)
        when = datetime.now().isoformat()
        await txn.set(KB_TO_DELETE.format(kbid=kbid), when.encode())

        # Read the shards before committing so they can be removed from the
        # index once the maindb changes are persisted
        shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)

        await txn.commit()

    if shards_obj is None:
        # Plain string: the message has no placeholders, the kbid travels in `extra`
        logger.warning("Shards not found for KB while deleting it", extra={"kbid": kbid})
    else:
        nidx_api = get_nidx_api_client()
        # Delete shards from nidx. They'll be marked for eventual deletion,
        # so this call shouldn't be costly
        if nidx_api is not None:
            for shard in shards_obj.shards:
                if shard.nidx_shard_id:
                    await nidx_api.DeleteShard(noderesources_pb2.ShardId(id=shard.nidx_shard_id))

    if kb_config is not None:
        await cls._maybe_delete_external_indexes(kbid, kb_config.external_index_provider)

    audit = get_audit()
    if audit is not None:
        audit.delete_kb(kbid=kbid)

    return kbid
240
322
 
241
323
  @classmethod
242
324
  async def purge(cls, driver: Driver, kbid: str):
@@ -247,6 +329,8 @@ class KnowledgeBox:
247
329
  need to delete the kb shards and also deletes the related storage
248
330
  buckets.
249
331
 
332
+ Removes all catalog entries related to the kb.
333
+
250
334
  As non-empty buckets cannot be deleted, they are scheduled to be
251
335
  deleted instead. Actually, this empties the bucket asynchronously
252
336
  but it doesn't delete it. To do it, we save a marker using the
@@ -262,16 +346,13 @@ class KnowledgeBox:
262
346
  storage_to_delete = KB_TO_DELETE_STORAGE.format(kbid=kbid)
263
347
  await txn.set(storage_to_delete, b"")
264
348
 
265
- # Delete KB Shards
266
- shards_match = datamanagers.cluster.KB_SHARDS.format(kbid=kbid)
267
- payload = await txn.get(shards_match)
349
+ await catalog_delete_kb(txn, kbid)
268
350
 
269
- if payload is None:
270
- logger.warning(f"Shards not found for kbid={kbid}")
351
+ # Delete KB Shards
352
+ shards_obj = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
353
+ if shards_obj is None:
354
+ logger.warning(f"Shards not found for KB while purging it", extra={"kbid": kbid})
271
355
  else:
272
- shards_obj = writer_pb2.Shards()
273
- shards_obj.ParseFromString(payload) # type: ignore
274
-
275
356
  for shard in shards_obj.shards:
276
357
  # Delete the shard on nodes
277
358
  for replica in shard.replicas:
@@ -297,29 +378,14 @@ class KnowledgeBox:
297
378
  await cls.delete_all_kb_keys(driver, kbid)
298
379
 
299
380
@classmethod
async def delete_all_kb_keys(cls, driver: Driver, kbid: str, chunk_size: int = 1_000):
    """Delete every key under the KB's ``KB_KEYS`` prefix in one transaction.

    Args:
        driver: driver used to open the transaction.
        kbid: id of the knowledge box whose keys are removed.
        chunk_size: accepted but not used by this implementation.
    """
    kb_prefix = KB_KEYS.format(kbid=kbid)
    async with driver.transaction() as transaction:
        await transaction.delete_by_prefix(kb_prefix)
        await transaction.commit()
386
+
387
+ async def get_resource_shard(self, shard_id: str) -> Optional[writer_pb2.ShardObject]:
388
+ async with datamanagers.with_ro_transaction() as txn:
323
389
  pb = await datamanagers.cluster.get_kb_shards(txn, kbid=self.kbid)
324
390
  if pb is None:
325
391
  logger.warning("Shards not found for kbid", extra={"kbid": self.kbid})
@@ -330,9 +396,7 @@ class KnowledgeBox:
330
396
  return None
331
397
 
332
398
  async def get(self, uuid: str) -> Optional[Resource]:
333
- basic = await datamanagers.resources.get_basic(
334
- self.txn, kbid=self.kbid, rid=uuid
335
- )
399
+ basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
336
400
  if basic is None:
337
401
  return None
338
402
  return Resource(
@@ -344,24 +408,32 @@ class KnowledgeBox:
344
408
  disable_vectors=False,
345
409
  )
346
410
 
347
- async def delete_resource(self, uuid: str):
348
- basic = await datamanagers.resources.get_basic(
349
- self.txn, kbid=self.kbid, rid=uuid
350
- )
351
-
352
- async for key in self.txn.keys(
353
- KB_RESOURCE.format(kbid=self.kbid, uuid=uuid), count=-1
354
- ):
355
- await self.txn.delete(key)
356
-
411
async def maindb_delete_resource(self, uuid: str):
    """Delete the resource keys, and its slug key if it has one, using ``self.txn``.

    The basic metadata is read first because the slug is needed after the
    resource keys have been removed. A failure deleting the slug key is
    logged and not propagated.
    """
    resource_basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kbid, rid=uuid)
    resource_prefix = KB_RESOURCE.format(kbid=self.kbid, uuid=uuid)
    await self.txn.delete_by_prefix(resource_prefix)

    if not resource_basic or not resource_basic.slug:
        # No slug recorded, nothing else to clean up
        return

    try:
        await self.txn.delete(KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=resource_basic.slug))
    except Exception:
        logger.exception("Error deleting slug")
363
419
 
364
- await self.storage.delete_resource(self.kbid, uuid)
420
async def storage_delete_resource(self, uuid: str):
    """Delete the resource from storage, immediately or by scheduling it.

    When ``is_onprem_nucliadb()`` is true the storage deletion is awaited
    directly; otherwise the deletion is only scheduled.
    """
    if not is_onprem_nucliadb():
        # Deleting from storage can be slow, so we schedule its deletion and
        # the purge cronjob will take care of it
        await self.schedule_delete_resource(self.kbid, uuid)
        return

    await self.storage.delete_resource(self.kbid, uuid)
427
+
428
async def schedule_delete_resource(self, kbid: str, uuid: str):
    """Write an empty ``RESOURCE_TO_DELETE_STORAGE`` marker for the resource in ``self.txn``."""
    await self.txn.set(RESOURCE_TO_DELETE_STORAGE.format(kbid=kbid, uuid=uuid), b"")
431
+
432
async def delete_resource(self, uuid: str):
    """Delete a resource from maindb and then from storage.

    Each step runs inside its own ``processor_observer`` context, labelled
    with the step type.
    """
    steps = (
        ("delete_resource_maindb", self.maindb_delete_resource),
        ("delete_resource_storage", self.storage_delete_resource),
    )
    for step_type, run_step in steps:
        with processor_observer({"type": step_type}):
            await run_step(uuid)
365
437
 
366
438
  async def get_resource_uuid_by_slug(self, slug: str) -> Optional[str]:
367
439
  return await datamanagers.resources.get_resource_uuid_from_slug(
@@ -372,7 +444,7 @@ class KnowledgeBox:
372
444
  key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
373
445
  key_ok = False
374
446
  while key_ok is False:
375
- found = await self.txn.get(key)
447
+ found = await self.txn.get(key, for_update=False)
376
448
  if found and found.decode() != uuid:
377
449
  slug += ".c"
378
450
  key = KB_RESOURCE_SLUG.format(kbid=self.kbid, slug=slug)
@@ -380,9 +452,7 @@ class KnowledgeBox:
380
452
  key_ok = True
381
453
  return slug
382
454
 
383
- async def add_resource(
384
- self, uuid: str, slug: str, basic: Optional[Basic] = None
385
- ) -> Resource:
455
+ async def add_resource(self, uuid: str, slug: str, basic: Optional[Basic] = None) -> Resource:
386
456
  if basic is None:
387
457
  basic = Basic()
388
458
  if slug == "":
@@ -390,9 +460,7 @@ class KnowledgeBox:
390
460
  slug = await self.get_unique_slug(uuid, slug)
391
461
  basic.slug = slug
392
462
  fix_paragraph_annotation_keys(uuid, basic)
393
- await datamanagers.resources.set_basic(
394
- self.txn, kbid=self.kbid, rid=uuid, basic=basic
395
- )
463
+ await datamanagers.resources.set_basic(self.txn, kbid=self.kbid, rid=uuid, basic=basic)
396
464
  return Resource(
397
465
  storage=self.storage,
398
466
  txn=self.txn,
@@ -404,7 +472,7 @@ class KnowledgeBox:
404
472
 
405
473
  async def iterate_resources(self) -> AsyncGenerator[Resource, None]:
406
474
  base = KB_RESOURCE_SLUG_BASE.format(kbid=self.kbid)
407
- async for key in self.txn.keys(match=base, count=-1):
475
+ async for key in self.txn.keys(match=base):
408
476
  slug = key.split("/")[-1]
409
477
  uuid = await self.get_resource_uuid_by_slug(slug)
410
478
  if uuid is not None:
@@ -416,6 +484,55 @@ class KnowledgeBox:
416
484
  disable_vectors=False,
417
485
  )
418
486
 
487
async def create_vectorset(self, config: knowledgebox_pb2.VectorSetConfig):
    """Store a new vectorset config and create the vectorset through the shard manager.

    Raises:
        VectorSetConflict: if a vectorset with the same id already exists.
    """
    vectorset_id = config.vectorset_id
    already_exists = await datamanagers.vectorsets.exists(
        self.txn, kbid=self.kbid, vectorset_id=vectorset_id
    )
    if already_exists:
        raise VectorSetConflict(f"Vectorset {vectorset_id} already exists")
    await datamanagers.vectorsets.set(self.txn, kbid=self.kbid, config=config)

    # Remove the async deletion mark if it exists, just in case there was a previous deletion
    mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
    if await self.txn.get(mark_key, for_update=True) is not None:
        await self.txn.delete(mark_key)

    await get_shard_manager().create_vectorset(self.kbid, config)
502
+
503
async def delete_vectorset(self, vectorset_id: str):
    """Delete a vectorset config, mark it for async deletion and delete it through the shard manager."""
    await datamanagers.vectorsets.delete(self.txn, kbid=self.kbid, vectorset_id=vectorset_id)

    # mark vectorset for async deletion
    mark_key = KB_VECTORSET_TO_DELETE.format(kbid=self.kbid, vectorset=vectorset_id)
    await self.txn.set(mark_key, b"")

    await get_shard_manager().delete_vectorset(self.kbid, vectorset_id)
512
+
513
@classmethod
async def _maybe_create_external_indexes(
    cls,
    kbid: str,
    request: CreateExternalIndexProviderMetadata,
    indexes: list[VectorsetExternalIndex],
) -> StoredExternalIndexProviderMetadata:
    """Create external indexes when the requested provider is Pinecone.

    For any other provider type no index is created and the returned
    metadata only carries the requested type.
    """
    # Only pinecone is supported for now
    if request.type == ExternalIndexProviderType.PINECONE:
        return await PineconeIndexManager.create_indexes(kbid, request, indexes)
    return StoredExternalIndexProviderMetadata(type=request.type)
524
+
525
@classmethod
async def _maybe_delete_external_indexes(
    cls,
    kbid: str,
    stored: StoredExternalIndexProviderMetadata,
) -> None:
    """Delete the KB's external indexes when the stored provider is Pinecone; otherwise do nothing."""
    # Only pinecone is supported for now
    if stored.type == ExternalIndexProviderType.PINECONE:
        await PineconeIndexManager.delete_indexes(kbid, stored)
535
+
419
536
 
420
537
def chunker(seq: Sequence, size: int):
    """Return a generator over consecutive slices of ``seq`` holding at most ``size`` items.

    The last slice may be shorter than ``size``; an empty ``seq`` produces
    no slices.
    """
    offsets = range(0, len(seq), size)
    return (seq[start : start + size] for start in offsets)
@@ -429,3 +546,11 @@ def fix_paragraph_annotation_keys(uuid: str, basic: Basic) -> None:
429
546
  for paragraph_annotation in ufm.paragraphs:
430
547
  key = compute_paragraph_key(uuid, paragraph_annotation.key)
431
548
  paragraph_annotation.key = key
549
+
550
+
551
@processor_observer.wrap({"type": "catalog_delete_kb"})
async def catalog_delete_kb(txn: Transaction, kbid: str):
    """Delete the ``catalog`` rows of a KB; does nothing unless ``txn`` is a ``PGTransaction``."""
    if isinstance(txn, PGTransaction):
        query_params = {"kbid": kbid}
        async with txn.connection.cursor() as cursor:
            await cursor.execute("DELETE FROM catalog where kbid = %(kbid)s", query_params)