nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -17,22 +17,27 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from unittest.mock import Mock
21
20
 
22
- from starlette.routing import Mount
21
+ """Migration #22
23
22
 
24
- from nucliadb.openapi import extract_openapi, is_versioned_route
25
- from nucliadb.search.app import application
23
+ There was a bug while ingesting/indexing that made paragraphs not being properly
24
+ removed in some cases. This rollover migration ensures data is consistently
25
+ indexed.
26
26
 
27
+ """
27
28
 
28
- def get_route(path):
29
- return Mount(path=path, app=Mock())
29
+ import logging
30
30
 
31
+ from nucliadb.migrator.context import ExecutionContext
31
32
 
32
- def test_is_versioned_route():
33
- assert is_versioned_route(get_route(path="/api/v1/search"))
34
- assert not is_versioned_route(get_route(path="/metrics"))
33
+ logger = logging.getLogger(__name__)
35
34
 
36
35
 
37
- def test_extract_openapi():
38
- assert extract_openapi(application, "1", "commitid", "nucliadb_search")
36
+ async def migrate(context: ExecutionContext) -> None: ...
37
+
38
+
39
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
40
+ """
41
+ We only need 1 rollover migration defined at a time; otherwise, we will
42
+ possibly run many for a kb when we only ever need to run one
43
+ """
@@ -0,0 +1,80 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #23
22
+
23
+ Backfill the data into the PG catalog
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common import datamanagers
31
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
+ from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
33
+ from nucliadb.migrator.context import ExecutionContext
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
async def migrate(context: ExecutionContext) -> None:
    """No-op: this migration performs its work per knowledge box in ``migrate_kb``."""
    return None
39
+
40
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Backfill the PG catalog with the resources of ``kbid`` that are missing from it.

    Does nothing unless ``context.kv_driver`` is a ``PGDriver``.

    Resources are discovered in key order, ``BATCH_SIZE`` at a time, by joining
    the ``resources`` table against ``catalog`` and keeping the rows without a
    catalog entry. Each batch is indexed and committed before the next one is
    fetched. The last processed key is used as a bookmark so that resources
    which could not be loaded (and therefore never reach the catalog) are not
    selected again on the next iteration.

    Args:
        context: Migration execution context providing the key-value driver.
        kbid: Identifier of the knowledge box to backfill.
    """
    if not isinstance(context.kv_driver, PGDriver):
        return

    BATCH_SIZE = 100
    resource_key_pattern = f"/kbs/{kbid}/r/[a-f0-9]*"
    # Bookmark of the last processed resource key. Every key compares greater
    # than the empty string, so the first batch starts from the beginning.
    last_key = ""

    async with context.kv_driver.transaction() as txn:
        txn = cast(PGTransaction, txn)
        while True:
            async with txn.connection.cursor() as cur:
                # Get list of resources except those already in the catalog.
                # All values are bound as parameters instead of being
                # interpolated into the SQL text.
                await cur.execute(
                    """
                    SELECT SPLIT_PART(key, '/', 5)::UUID FROM resources
                    LEFT JOIN catalog ON kbid = %s AND SPLIT_PART(key, '/', 5)::UUID = rid
                    WHERE key SIMILAR TO %s
                    AND rid IS NULL
                    AND key > %s
                    ORDER BY key
                    LIMIT %s
                    """,
                    (kbid, resource_key_pattern, last_key, BATCH_SIZE),
                )
                resources_to_index = [row[0] for row in await cur.fetchall()]

            if not resources_to_index:
                return

            # Index each resource
            rid = ""
            for resource_uuid in resources_to_index:
                # Resource keys store the uuid without dashes
                rid = str(resource_uuid).replace("-", "")
                resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
                if resource is None:
                    logger.warning(f"Could not load resource {rid} for kbid {kbid}")
                    continue

                await resource.compute_global_tags(resource.indexer)
                await pgcatalog_update(txn, kbid, resource)

            await txn.commit()
            # Continue after the last resource of this batch, loaded or not
            last_key = f"/kbs/{kbid}/r/{rid}"
@@ -0,0 +1,113 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #25 (Fixed migration 24)
22
+
23
+ Vectorsets are coming and we need to be ready at nucliadb. Vector index config
24
+ shouldn't be stored anymore in the `Shards` protobuffer, we need to migrate to
25
+ the new vectorsets config.
26
+
27
+ This migration asks learning_config for each KB configuration and saves the
28
+ model name as the vectorset_id. Creates a vectorset configuration for each model
29
+ and deprecates the vectors index config from the `Shards` protobuffer.
30
+
31
+ This migration should work for onprem and hosted deployments, as
32
+ learning_proxy handles which API is used (internal or external)
33
+
34
+ """
35
+
36
+ import logging
37
+
38
+ from nucliadb import learning_proxy
39
+ from nucliadb.common import datamanagers
40
+ from nucliadb.migrator.context import ExecutionContext
41
+ from nucliadb_protos import (
42
+ knowledgebox_pb2,
43
+ nodewriter_pb2,
44
+ )
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+
async def migrate(context: ExecutionContext) -> None:
    """No-op: this migration performs its work per knowledge box in ``migrate_kb``."""
    return None
50
+
51
+
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Create the default vectorset configuration for ``kbid`` from its semantic model.

    Steps:

    1. Skip the KB if it already has at least one vectorset stored.
    2. Fetch the KB configuration from learning; skip the KB if there is none.
    3. Compare the semantic model metadata stored in maindb with the one
       reported by learning. On any mismatch, log an error and leave the KB
       untouched so it can be reviewed manually.
    4. Store a ``VectorSetConfig`` whose id is the learning semantic model name
       and whose index config is taken from the maindb metadata.

    Args:
        context: Migration execution context providing the key-value driver.
        kbid: Identifier of the knowledge box to migrate.
    """
    async with context.kv_driver.transaction(read_only=True) as txn:
        existing_vectorsets = [vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)]
    if existing_vectorsets:
        logger.info("Skipping KB with vectorsets already populated", extra={"kbid": kbid})
        return

    learning_config = await learning_proxy.get_configuration(kbid)
    if learning_config is None:
        logger.warning("KB has no learning config", extra={"kbid": kbid})
        return

    vectorset_id = learning_config.semantic_model
    learning_model_metadata = learning_config.into_semantic_model_metadata()
    learning_similarity = learning_model_metadata.similarity_function
    learning_vector_dimension = learning_model_metadata.vector_dimension
    learning_matryoshka_dimensions = learning_model_metadata.matryoshka_dimensions
    # Vectors are normalized exactly when matryoshka dimensions are defined
    learning_normalize_vectors = len(learning_matryoshka_dimensions) > 0

    async with context.kv_driver.transaction(read_only=True) as txn:
        semantic_model = await datamanagers.kb.get_model_metadata(txn, kbid=kbid)

    maindb_similarity = semantic_model.similarity_function
    # A falsy (unset/zero) dimension in maindb is treated as "unknown"
    maindb_vector_dimension = semantic_model.vector_dimension or None
    maindb_matryoshka_dimensions: list[int] = list(semantic_model.matryoshka_dimensions)
    maindb_normalize_vectors = len(maindb_matryoshka_dimensions) > 0

    dimension_mismatch = (
        maindb_vector_dimension is not None and maindb_vector_dimension != learning_vector_dimension
    )
    if (
        maindb_similarity != learning_similarity
        or dimension_mismatch
        or set(maindb_matryoshka_dimensions) != set(learning_matryoshka_dimensions)
        or maindb_normalize_vectors != learning_normalize_vectors
    ):
        logger.error(
            "KB has mismatched data between nucliadb and learning_config! Please, review manually",
            extra={"kbid": kbid},
        )
        return

    default_vectorset = knowledgebox_pb2.VectorSetConfig(
        vectorset_id=vectorset_id,
        vectorset_index_config=nodewriter_pb2.VectorIndexConfig(
            vector_dimension=maindb_vector_dimension,
            similarity=maindb_similarity,
            vector_type=nodewriter_pb2.VectorType.DENSE_F32,  # we only support this for now
            normalize_vectors=maindb_normalize_vectors,
        ),
        matryoshka_dimensions=maindb_matryoshka_dimensions,
    )

    async with context.kv_driver.transaction() as txn:
        # Populate KB vectorsets with data from learning. We are skipping KBs
        # with this key already set, so we can set here safely
        await datamanagers.vectorsets.set(txn, kbid=kbid, config=default_vectorset)
        await txn.commit()
@@ -0,0 +1,61 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #26
22
+
23
+ Previously, there was no validation on content types added by users on upload. As a result, some KBs
24
+ ended up with content types that included random UUIDs, which caused high cardinality in the content type field.
25
+
26
+ This migration will fix those invalid content types.
27
+ """
28
+
29
+ import logging
30
+
31
+ from nucliadb.common import datamanagers
32
+ from nucliadb.migrator.context import ExecutionContext
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ AFFECTED_KBS = [
38
+ "78d289e0-dd4d-448c-84b5-8ef0b01a5aba",
39
+ ]
40
+
41
+
42
async def migrate(context: ExecutionContext) -> None:
    """Global step of migration #26: intentionally a no-op."""
43
+
44
+
45
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Normalize multipart content types stored as resource icons in one KB.

    Only KBs listed in ``AFFECTED_KBS`` are processed. Every resource whose
    icon contains both ``multipart/form-data`` and ``boundary=`` has its icon
    reset to plain ``multipart/form-data``. Each resource is handled in its
    own read-write transaction, committed only when the icon was changed.
    """
    if kbid not in AFFECTED_KBS:
        return

    async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
        async with datamanagers.with_rw_transaction() as txn:
            basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=resource_id)
            icon = basic.icon if basic else None
            # We're aiming to fix content types like "multipart/form-data; boundary={uuid}"
            needs_fix = bool(icon) and "multipart/form-data" in icon and "boundary=" in icon
            if not needs_fix:
                continue

            logger.info("Fixing content type for resource", extra={"kbid": kbid, "rid": resource_id})
            basic.icon = "multipart/form-data"
            await datamanagers.resources.set_basic(txn, kbid=kbid, rid=resource_id, basic=basic)
            await txn.commit()
@@ -0,0 +1,73 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #27
22
+
23
+ Rollover for nucliadb_texts3
24
+ """
25
+
26
+ import logging
27
+
28
+ from nucliadb import learning_proxy
29
+ from nucliadb.common import datamanagers
30
+ from nucliadb.common.cluster.rollover import rollover_kb_index
31
+ from nucliadb.migrator.context import ExecutionContext
32
+
33
+ logger = logging.getLogger(__name__)
34
+
35
+
36
async def migrate(context: ExecutionContext) -> None:
    """Global step of migration #27: intentionally a no-op."""
37
+
38
+
39
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Run migration #27 for one KB.

    First repairs the stored vectorset dimension when it needs fixing, then
    rolls over the KB index.
    """
    # Order matters: the dimension fix is applied before the rollover starts.
    await maybe_fix_vector_dimensions(context, kbid)
    await rollover_kb_index(context, kbid)
42
+
43
+
44
async def maybe_fix_vector_dimensions(context: ExecutionContext, kbid: str) -> None:
    """Backfill the vector dimension of a KB's only vectorset when it is unset.

    The dimension is taken from the KB's learning configuration and written
    to the stored vectorset config. Nothing is changed when:

    - the KB has no learning configuration,
    - the KB does not have exactly one vectorset, or
    - the vectorset already has a non-zero ``vector_dimension``.

    Args:
        context: Migration execution context; its ``kv_driver`` provides the
            transaction used to read and update the vectorset.
        kbid: Identifier of the knowledge box to inspect.
    """
    learning_config = await learning_proxy.get_configuration(kbid)
    if learning_config is None:
        logger.warning("KB has no learning config", extra={"kbid": kbid})
        return

    async with context.kv_driver.transaction() as txn:
        vectorsets = [vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)]
        if len(vectorsets) != 1:
            # Only KBs with exactly one vectorset are candidates for the fix.
            # KBs with several vectorsets are assumed to have been created
            # correctly; KBs with none have nothing to fix.
            logger.warning(f"KB has {len(vectorsets)} vectorsets, skipping...", extra={"kbid": kbid})
            return

        # Each item is indexed as a pair; the second element is the config
        # message that carries ``vectorset_index_config``.
        vectorset = vectorsets[0][1]
        index_config = vectorset.vectorset_index_config

        # Correct value, skip
        if index_config.vector_dimension != 0:
            return

        learning_model_metadata = learning_config.into_semantic_model_metadata()
        logger.info(
            "Fixing KB vectorset dimension",
            extra={
                "kbid": kbid,
                "from": index_config.vector_dimension,
                "to": learning_model_metadata.vector_dimension,
            },
        )
        index_config.vector_dimension = learning_model_metadata.vector_dimension

        await datamanagers.vectorsets.set(txn, kbid=kbid, config=vectorset)
        # Persist the change. The other migrations in this package commit
        # explicitly after writing, so the transaction context manager is
        # presumably not auto-committing (NOTE(review): confirm against the driver).
        await txn.commit()
@@ -17,18 +17,16 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from nucliadb_protos.resources_pb2 import FieldDatetime
21
20
 
22
- from nucliadb.ingest.fields.base import Field
21
+ from nucliadb.common.maindb.pg import PGTransaction
23
22
 
24
23
 
25
- class Datetime(Field):
26
- pbklass = FieldDatetime
27
- value: FieldDatetime
28
- type: str = "d"
29
-
30
- async def set_value(self, payload: FieldDatetime):
31
- await self.db_set_value(payload)
32
-
33
- async def get_value(self) -> FieldDatetime:
34
- return await self.db_get_value()
24
async def migrate(txn: PGTransaction) -> None:
    """Bootstrap migration: make sure the ``resources`` key/value table exists."""
    # IF NOT EXISTS just for compatibility with older install predating the migration system
    create_resources_table = """
            CREATE TABLE IF NOT EXISTS resources (
                key TEXT PRIMARY KEY,
                value BYTEA
            );
        """
    async with txn.connection.cursor() as cursor:
        await cursor.execute(create_resources_table)
@@ -0,0 +1,42 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Create the ``catalog`` table, the extensions it relies on and its indexes.

    Everything is sent as one multi-statement script through a single cursor.
    """
    # Raw string: the title index expression contains the regex class ``\W``.
    catalog_schema = r"""
            CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
            CREATE EXTENSION IF NOT EXISTS btree_gin;
            CREATE TABLE catalog (
                kbid UUID,
                rid UUID,
                title TEXT,
                created_at TIMESTAMP,
                modified_at TIMESTAMP,
                labels TEXT[],
                PRIMARY KEY(kbid, rid)
            );
            CREATE INDEX ON catalog USING GIN(kbid, labels);
            CREATE INDEX ON catalog USING GIN(kbid, regexp_split_to_array(lower(title), '\W'::text));
            CREATE INDEX ON catalog(kbid, created_at);
            CREATE INDEX ON catalog(kbid, modified_at);
        """
    async with txn.connection.cursor() as cursor:
        await cursor.execute(catalog_schema)
@@ -17,8 +17,10 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from nucliadb.ingest.settings import DriverConfig
21
20
 
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
22
 
23
- def test_case_insenstive_driver_config():
24
- assert DriverConfig("PG") == DriverConfig.PG
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Add an index on ``catalog(kbid)``."""
    create_kbid_index = "CREATE INDEX ON catalog(kbid);"
    async with txn.connection.cursor() as cursor:
        await cursor.execute(create_kbid_index)
@@ -20,16 +20,16 @@
20
20
  from abc import ABCMeta, abstractmethod
21
21
  from typing import AsyncIterator, Optional
22
22
 
23
+ from nucliadb_protos import nodereader_pb2, noderesources_pb2, utils_pb2
23
24
  from nucliadb_protos.nodereader_pb2_grpc import NodeReaderStub
24
25
  from nucliadb_protos.nodewriter_pb2 import (
25
26
  NewShardRequest,
26
27
  NewVectorSetRequest,
27
28
  OpStatus,
29
+ VectorIndexConfig,
28
30
  )
29
31
  from nucliadb_protos.nodewriter_pb2_grpc import NodeWriterStub
30
32
 
31
- from nucliadb_protos import nodereader_pb2, noderesources_pb2, utils_pb2
32
-
33
33
 
34
34
  class AbstractIndexNode(metaclass=ABCMeta):
35
35
  label: str = "index-node"
@@ -85,23 +85,37 @@ class AbstractIndexNode(metaclass=ABCMeta):
85
85
  async for idandfacets in self.reader.Paragraphs(stream_request): # type: ignore
86
86
  yield idandfacets
87
87
 
88
async def get_shard(self, shard_id: str) -> noderesources_pb2.Shard:
    """Fetch a shard from the reader service by its id."""
    request = nodereader_pb2.GetShardRequest()
    request.shard_id.id = shard_id
    shard = await self.reader.GetShard(request)  # type: ignore
    return shard
96
92
 
97
93
async def new_shard(
    self,
    kbid: str,
    vector_index_config: VectorIndexConfig,
) -> noderesources_pb2.ShardCreated:
    """Ask the writer service to create a shard for ``kbid``.

    The shard is requested on the STABLE release channel with the given
    vector index configuration.
    """
    request = NewShardRequest(
        kbid=kbid,
        release_channel=utils_pb2.ReleaseChannel.STABLE,
        config=vector_index_config,
        # Deprecated fields, only for backwards compatibility with older nodes
        similarity=vector_index_config.similarity,
        normalize_vectors=vector_index_config.normalize_vectors,
    )
    return await self.writer.NewShard(request)  # type: ignore
109
+
110
+ async def new_shard_with_vectorsets(
111
+ self,
112
+ kbid: str,
113
+ vectorsets_configs: dict[str, VectorIndexConfig],
114
+ ) -> noderesources_pb2.ShardCreated:
115
+ req = NewShardRequest(
116
+ kbid=kbid,
117
+ release_channel=utils_pb2.ReleaseChannel.STABLE,
118
+ vectorsets_configs=vectorsets_configs,
105
119
  )
106
120
 
107
121
  resp = await self.writer.NewShard(req) # type: ignore
@@ -116,28 +130,31 @@ class AbstractIndexNode(metaclass=ABCMeta):
116
130
  resp: noderesources_pb2.ShardId = await self.writer.DeleteShard(req) # type: ignore
117
131
  return resp.id
118
132
 
119
- async def del_vectorset(self, shard_id: str, vectorset: str) -> OpStatus:
120
- req = noderesources_pb2.VectorSetID()
121
- req.shard.id = shard_id
122
- req.vectorset = vectorset
123
- resp = await self.writer.RemoveVectorSet(req) # type: ignore
124
- return resp
125
-
126
async def add_vectorset(
    self,
    shard_id: str,
    vectorset: str,
    config: VectorIndexConfig,
) -> OpStatus:
    """Ask the writer service to add a vectorset with ``config`` to a shard."""
    vectorset_id = noderesources_pb2.VectorSetID(
        shard=noderesources_pb2.ShardId(id=shard_id),
        vectorset=vectorset,
    )
    request = NewVectorSetRequest(id=vectorset_id, config=config)
    return await self.writer.AddVectorSet(request)  # type: ignore
138
148
 
139
async def list_vectorsets(self, shard_id: str) -> list[str]:
    """Return the vectorsets the writer service reports for a shard."""
    request = noderesources_pb2.ShardId(id=shard_id)
    response = await self.writer.ListVectorSets(request)  # type: ignore
    # Copy the repeated field into a plain list.
    return list(response.vectorsets)
154
+
155
async def remove_vectorset(self, shard_id: str, vectorset: str) -> OpStatus:
    """Ask the writer service to remove a vectorset from a shard."""
    request = noderesources_pb2.VectorSetID(
        shard=noderesources_pb2.ShardId(id=shard_id),
        vectorset=vectorset,
    )
    return await self.writer.RemoveVectorSet(request)  # type: ignore
@@ -113,7 +113,7 @@ async def _get_index_node_metadata(
113
113
  channel = get_traced_grpc_channel(grpc_address, "discovery", variant="_writer")
114
114
  if read_replica:
115
115
  # on a read replica, we need to use the replication service
116
- stub = replication_pb2_grpc.ReplicationServiceStub(channel) # type: ignore
116
+ stub = replication_pb2_grpc.ReplicationServiceStub(channel)
117
117
  else:
118
118
  stub = nodewriter_pb2_grpc.NodeWriterStub(channel) # type: ignore
119
119
  try:
@@ -127,9 +127,7 @@ async def _get_index_node_metadata(
127
127
  or None
128
128
  )
129
129
  if read_replica and primary_id is None:
130
- raise Exception(
131
- "Primary node id not found when it is expected to be a read replica"
132
- )
130
+ raise Exception("Primary node id not found when it is expected to be a read replica")
133
131
 
134
132
  return IndexNodeMetadata(
135
133
  node_id=metadata.node_id,
@@ -141,18 +139,14 @@ async def _get_index_node_metadata(
141
139
  )
142
140
 
143
141
 
144
- @backoff.on_exception(
145
- backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=4
146
- )
147
- async def _get_standalone_index_node_metadata(
148
- settings: Settings, address: str
149
- ) -> IndexNodeMetadata:
142
+ @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=4)
143
+ async def _get_standalone_index_node_metadata(settings: Settings, address: str) -> IndexNodeMetadata:
150
144
  if ":" not in address:
151
145
  grpc_address = f"{address}:{settings.standalone_node_port}"
152
146
  else:
153
147
  grpc_address = address
154
148
  channel = get_traced_grpc_channel(grpc_address, "standalone_proxy")
155
- stub = standalone_pb2_grpc.StandaloneClusterServiceStub(channel) # type: ignore
149
+ stub = standalone_pb2_grpc.StandaloneClusterServiceStub(channel)
156
150
  resp: standalone_pb2.NodeInfoResponse = await stub.NodeInfo(standalone_pb2.NodeInfoRequest()) # type: ignore
157
151
  return IndexNodeMetadata(
158
152
  node_id=resp.id,
@@ -177,9 +171,7 @@ class AbstractClusterDiscovery(abc.ABC):
177
171
  async def finalize(self) -> None:
178
172
  """ """
179
173
 
180
- async def _query_node_metadata(
181
- self, address: str, read_replica: bool = False
182
- ) -> IndexNodeMetadata:
174
+ async def _query_node_metadata(self, address: str, read_replica: bool = False) -> IndexNodeMetadata:
183
175
  if self.settings.standalone_mode:
184
176
  return await _get_standalone_index_node_metadata(self.settings, address)
185
177
  else: