nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the contents of the package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -17,45 +17,11 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Optional
21
20
 
22
- from nucliadb_protos.resources_pb2 import AllFieldIDs, FieldID
23
-
24
- from nucliadb.common import datamanagers
25
21
  from nucliadb.migrator.context import ExecutionContext
26
- from nucliadb.migrator.migrator import logger
27
22
 
28
23
 
29
24
  async def migrate(context: ExecutionContext) -> None: ...
30
25
 
31
26
 
32
- async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
33
- async for resource_id in datamanagers.resources.iterate_resource_ids(kbid=kbid):
34
- async with context.kv_driver.transaction() as txn:
35
- resource = await datamanagers.resources.get_resource(
36
- txn, kbid=kbid, rid=resource_id
37
- )
38
- if resource is None:
39
- logger.warning(
40
- f"kb={kbid} rid={resource_id}: resource not found. Skipping..."
41
- )
42
- continue
43
-
44
- all_fields: Optional[AllFieldIDs] = await resource.get_all_field_ids()
45
- if all_fields is not None:
46
- logger.warning(
47
- f"kb={kbid} rid={resource_id}: already has all fields key. Skipping..."
48
- )
49
- continue
50
-
51
- # Migrate resource
52
- logger.warning(f"kb={kbid} rid={resource_id}: migrating...")
53
- all_fields = AllFieldIDs()
54
- async for (
55
- field_type,
56
- field_id,
57
- ) in resource._deprecated_scan_fields_ids():
58
- fid = FieldID(field_type=field_type, field=field_id)
59
- all_fields.fields.append(fid)
60
- await resource.set_all_field_ids(all_fields)
61
- await txn.commit()
27
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
@@ -26,7 +26,6 @@ rollover will do the upgrade automatically.
26
26
 
27
27
  """
28
28
 
29
- from nucliadb.common.cluster.rollover import rollover_kb_shards
30
29
  from nucliadb.migrator.context import ExecutionContext
31
30
 
32
31
 
@@ -34,4 +33,7 @@ async def migrate(context: ExecutionContext) -> None: ...
34
33
 
35
34
 
36
35
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
37
- await rollover_kb_shards(context, kbid)
36
+ """
37
+ We only need 1 rollover migration defined at a time; otherwise, we will
38
+ possibly run many for a kb when we only ever need to run one
39
+ """
@@ -27,23 +27,23 @@ index data loss. Rollover affected KBs
27
27
 
28
28
  import logging
29
29
 
30
- from nucliadb.common.cluster.rollover import rollover_kb_shards
31
30
  from nucliadb.migrator.context import ExecutionContext
32
31
 
33
32
  logger = logging.getLogger(__name__)
34
33
 
35
- AFFECTED_KBS = [
36
- "1efc5a33-bc5a-490c-8b47-b190beee212d",
37
- "f11d6eb9-da5e-4519-ac3d-e304bfa5c354",
38
- "096d9070-f7be-40c8-a24c-19c89072e3ff",
39
- "848f01bc-341a-4346-b473-6b11b76b26eb",
40
- ]
34
+ # AFFECTED_KBS = [
35
+ # "1efc5a33-bc5a-490c-8b47-b190beee212d",
36
+ # "f11d6eb9-da5e-4519-ac3d-e304bfa5c354",
37
+ # "096d9070-f7be-40c8-a24c-19c89072e3ff",
38
+ # "848f01bc-341a-4346-b473-6b11b76b26eb",
39
+ # ]
41
40
 
42
41
 
43
42
  async def migrate(context: ExecutionContext) -> None: ...
44
43
 
45
44
 
46
45
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
47
- if kbid in AFFECTED_KBS:
48
- logger.info(f"Rolling over affected KB: {kbid}")
49
- await rollover_kb_shards(context, kbid)
46
+ """
47
+ We only need 1 rollover migration defined at a time; otherwise, we will
48
+ possibly run many for a kb when we only ever need to run one
49
+ """
@@ -26,7 +26,6 @@ Tikv doesn't really like scanning a lot of keys, so we need to materialize the l
26
26
 
27
27
  import logging
28
28
 
29
- from nucliadb.common import datamanagers
30
29
  from nucliadb.migrator.context import ExecutionContext
31
30
 
32
31
  logger = logging.getLogger(__name__)
@@ -35,18 +34,4 @@ logger = logging.getLogger(__name__)
35
34
  async def migrate(context: ExecutionContext) -> None: ...
36
35
 
37
36
 
38
- async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
39
- async with context.kv_driver.transaction() as txn:
40
- labelset_list = await datamanagers.labels._get_labelset_ids(txn, kbid=kbid)
41
- if labelset_list is not None:
42
- logger.info("No need for labelset list migration", extra={"kbid": kbid})
43
- return
44
-
45
- labelset_list = await datamanagers.labels._deprecated_scan_labelset_ids(
46
- txn, kbid=kbid
47
- )
48
- await datamanagers.labels._set_labelset_ids(
49
- txn, kbid=kbid, labelsets=labelset_list
50
- )
51
- logger.info("Labelset list migrated", extra={"kbid": kbid})
52
- await txn.commit()
37
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
@@ -23,25 +23,20 @@ Allow targeted rollover of KBs
23
23
  """
24
24
 
25
25
  import logging
26
- import os
27
26
 
28
- from nucliadb.common.cluster.rollover import rollover_kb_shards
29
27
  from nucliadb.migrator.context import ExecutionContext
30
28
 
31
29
  logger = logging.getLogger(__name__)
32
30
 
33
31
 
34
- AFFECTED_KBS = [
35
- kbid.strip()
36
- for kbid in os.environ.get("ROLLOVER_KBS", "").split(",")
37
- if kbid.strip()
38
- ]
32
+ # AFFECTED_KBS = [kbid.strip() for kbid in os.environ.get("ROLLOVER_KBS", "").split(",") if kbid.strip()]
39
33
 
40
34
 
41
35
  async def migrate(context: ExecutionContext) -> None: ...
42
36
 
43
37
 
44
38
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
45
- if kbid in AFFECTED_KBS:
46
- logger.info(f"Rolling over affected KB: {kbid}")
47
- await rollover_kb_shards(context, kbid)
39
+ """
40
+ We only need 1 rollover migration defined at a time; otherwise, we will
41
+ possibly run many for a kb when we only ever need to run one
42
+ """
@@ -23,9 +23,7 @@ Allow targeted rollover of KBs
23
23
  """
24
24
 
25
25
  import logging
26
- import os
27
26
 
28
- from nucliadb.common.cluster.rollover import rollover_kb_shards
29
27
  from nucliadb.migrator.context import ExecutionContext
30
28
 
31
29
  logger = logging.getLogger(__name__)
@@ -35,6 +33,7 @@ async def migrate(context: ExecutionContext) -> None: ...
35
33
 
36
34
 
37
35
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
38
- if os.environ.get("RUNNING_ENVIRONMENT", os.environ.get("ENVIRONMENT")) == "stage":
39
- logger.info(f"Rolling over affected KB: {kbid}")
40
- await rollover_kb_shards(context, kbid)
36
+ """
37
+ We only need 1 rollover migration defined at a time; otherwise, we will
38
+ possibly run many for a kb when we only ever need to run one
39
+ """
@@ -24,25 +24,20 @@ Targeted rollover for a specific KB
24
24
  """
25
25
 
26
26
  import logging
27
- import os
28
27
 
29
- from nucliadb.common.cluster.rollover import rollover_kb_shards
30
28
  from nucliadb.migrator.context import ExecutionContext
31
29
 
32
30
  logger = logging.getLogger(__name__)
33
31
 
34
32
 
35
- AFFECTED_KBS = [
36
- kbid.strip()
37
- for kbid in os.environ.get("ROLLOVER_KBS", "").split(",")
38
- if kbid.strip()
39
- ]
33
+ # AFFECTED_KBS = [kbid.strip() for kbid in os.environ.get("ROLLOVER_KBS", "").split(",") if kbid.strip()]
40
34
 
41
35
 
42
36
  async def migrate(context: ExecutionContext) -> None: ...
43
37
 
44
38
 
45
39
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
46
- if kbid in AFFECTED_KBS:
47
- logger.info(f"Rolling over affected KB: {kbid}")
48
- await rollover_kb_shards(context, kbid)
40
+ """
41
+ We only need 1 rollover migration defined at a time; otherwise, we will
42
+ possibly run many for a kb when we only ever need to run one
43
+ """
@@ -25,10 +25,6 @@ Targeted rollover for a specific KBs which still don't have the latest version o
25
25
 
26
26
  import logging
27
27
 
28
- from nucliadb_protos.noderesources_pb2 import ShardCreated
29
-
30
- from nucliadb.common import datamanagers
31
- from nucliadb.common.cluster.rollover import rollover_kb_shards
32
28
  from nucliadb.migrator.context import ExecutionContext
33
29
 
34
30
  logger = logging.getLogger(__name__)
@@ -41,29 +37,30 @@ async def migrate(context: ExecutionContext) -> None: ...
41
37
 
42
38
 
43
39
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
- try:
45
- if await has_old_paragraphs_index(context, kbid):
46
- logger.info("Rolling over affected KB", extra={"kbid": kbid})
47
- await rollover_kb_shards(context, kbid)
48
- else:
49
- logger.info(
50
- "KB already has the latest version of the paragraphs index, skipping rollover",
51
- extra={"kbid": kbid},
52
- )
53
- except ShardsObjectNotFound:
54
- logger.warning("KB not found, skipping rollover", extra={"kbid": kbid})
40
+ """
41
+ We only need 1 rollover migration defined at a time; otherwise, we will
42
+ possibly run many for a kb when we only ever need to run one
43
+ """
44
+ # try:
45
+ # if await has_old_paragraphs_index(context, kbid):
46
+ # logger.info("Rolling over affected KB", extra={"kbid": kbid})
47
+ # await rollover_kb_index(context, kbid)
48
+ # else:
49
+ # logger.info(
50
+ # "KB already has the latest version of the paragraphs index, skipping rollover",
51
+ # extra={"kbid": kbid},
52
+ # )
53
+ # except ShardsObjectNotFound:
54
+ # logger.warning("KB not found, skipping rollover", extra={"kbid": kbid})
55
55
 
56
56
 
57
- async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
58
- async with context.kv_driver.transaction(read_only=True) as txn:
59
- shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
60
- if not shards_object:
61
- raise ShardsObjectNotFound()
62
- for shard in shards_object.shards:
63
- for replica in shard.replicas:
64
- if (
65
- replica.shard.paragraph_service
66
- != ShardCreated.ParagraphService.PARAGRAPH_V2
67
- ):
68
- return True
69
- return False
57
+ # async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
58
+ # async with context.kv_driver.transaction(read_only=True) as txn:
59
+ # shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
60
+ # if not shards_object:
61
+ # raise ShardsObjectNotFound()
62
+ # for shard in shards_object.shards:
63
+ # for replica in shard.replicas:
64
+ # if replica.shard.paragraph_service != ShardCreated.ParagraphService.PARAGRAPH_V2:
65
+ # return True
66
+ # return False
@@ -42,7 +42,7 @@ async def migrate(context: ExecutionContext) -> None: ...
42
42
 
43
43
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
44
  async with context.kv_driver.transaction() as txn:
45
- shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
45
+ shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
46
46
  if shards is None:
47
47
  logger.error("KB without shards", extra={"kbid": kbid})
48
48
  return
@@ -52,9 +52,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
52
52
  shards.shards[shards.actual].read_only = False
53
53
 
54
54
  # just ensure we're writing it correctly
55
- assert [shard_object.read_only for shard_object in shards.shards].count(
56
- False
57
- ) == 1
55
+ assert [shard_object.read_only for shard_object in shards.shards].count(False) == 1
58
56
 
59
57
  await datamanagers.cluster.update_kb_shards(txn, kbid=kbid, shards=shards)
60
58
  await txn.commit()
@@ -18,17 +18,17 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- """Migration #17
21
+ """Migration #18
22
22
 
23
23
  Due to a bug on backend services, some kbslugs were not properly deleted and got
24
24
  orphan. Let's delete them!
25
25
 
26
26
  """
27
+
27
28
  import logging
28
29
 
29
30
  from nucliadb.common import datamanagers
30
31
  from nucliadb.common.datamanagers.kb import KB_SLUGS_BASE
31
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
32
32
  from nucliadb.migrator.context import ExecutionContext
33
33
 
34
34
  logger = logging.getLogger(__name__)
@@ -36,14 +36,12 @@ logger = logging.getLogger(__name__)
36
36
 
37
37
  async def migrate(context: ExecutionContext) -> None:
38
38
  async with context.kv_driver.transaction() as txn:
39
- async for key in txn.keys(KB_SLUGS_BASE, count=-1):
39
+ async for key in txn.keys(KB_SLUGS_BASE):
40
40
  slug = key.replace(KB_SLUGS_BASE, "")
41
- value = await txn.get(key)
41
+ value = await txn.get(key, for_update=False)
42
42
  if value is None:
43
43
  # KB with slug but without uuid? Seems wrong, let's remove it too
44
- logger.info(
45
- "Removing /kbslugs with empty value", extra={"maindb_key": key}
46
- )
44
+ logger.info("Removing /kbslugs with empty value", extra={"maindb_key": key})
47
45
  await txn.delete(key)
48
46
  continue
49
47
 
@@ -25,10 +25,6 @@ Targeted rollover for a specific KBs which still don't have the latest version o
25
25
 
26
26
  import logging
27
27
 
28
- from nucliadb_protos.noderesources_pb2 import ShardCreated
29
-
30
- from nucliadb.common import datamanagers
31
- from nucliadb.common.cluster.rollover import rollover_kb_shards
32
28
  from nucliadb.migrator.context import ExecutionContext
33
29
 
34
30
  logger = logging.getLogger(__name__)
@@ -41,29 +37,30 @@ async def migrate(context: ExecutionContext) -> None: ...
41
37
 
42
38
 
43
39
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
- try:
45
- if await has_old_paragraphs_index(context, kbid):
46
- logger.info("Rolling over affected KB", extra={"kbid": kbid})
47
- await rollover_kb_shards(context, kbid)
48
- else:
49
- logger.info(
50
- "KB already has the latest version of the paragraphs index, skipping rollover",
51
- extra={"kbid": kbid},
52
- )
53
- except ShardsObjectNotFound:
54
- logger.warning("KB not found, skipping rollover", extra={"kbid": kbid})
40
+ """
41
+ We only need 1 rollover migration defined at a time; otherwise, we will
42
+ possibly run many for a kb when we only ever need to run one
43
+ """
44
+ # try:
45
+ # if await has_old_paragraphs_index(context, kbid):
46
+ # logger.info("Rolling over affected KB", extra={"kbid": kbid})
47
+ # await rollover_kb_index(context, kbid)
48
+ # else:
49
+ # logger.info(
50
+ # "KB already has the latest version of the paragraphs index, skipping rollover",
51
+ # extra={"kbid": kbid},
52
+ # )
53
+ # except ShardsObjectNotFound:
54
+ # logger.warning("KB not found, skipping rollover", extra={"kbid": kbid})
55
55
 
56
56
 
57
- async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
58
- async with context.kv_driver.transaction(read_only=True) as txn:
59
- shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
60
- if not shards_object:
61
- raise ShardsObjectNotFound()
62
- for shard in shards_object.shards:
63
- for replica in shard.replicas:
64
- if (
65
- replica.shard.paragraph_service
66
- != ShardCreated.ParagraphService.PARAGRAPH_V3
67
- ):
68
- return True
69
- return False
57
+ # async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
58
+ # async with context.kv_driver.transaction(read_only=True) as txn:
59
+ # shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
60
+ # if not shards_object:
61
+ # raise ShardsObjectNotFound()
62
+ # for shard in shards_object.shards:
63
+ # for replica in shard.replicas:
64
+ # if replica.shard.paragraph_service != ShardCreated.ParagraphService.PARAGRAPH_V3:
65
+ # return True
66
+ # return False
@@ -29,7 +29,7 @@ create new shards in the remaining nodes.
29
29
  import logging
30
30
 
31
31
  from nucliadb.common import datamanagers
32
- from nucliadb.common.cluster.rollover import rollover_kb_shards
32
+ from nucliadb.common.cluster.rollover import rollover_kb_index
33
33
  from nucliadb.common.cluster.settings import settings as cluster_settings
34
34
  from nucliadb.migrator.context import ExecutionContext
35
35
 
@@ -56,11 +56,11 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
56
56
  return
57
57
 
58
58
  logger.info("Rolling over affected KB", extra={"kbid": kbid})
59
- await rollover_kb_shards(context, kbid, drain_nodes=drain_node_ids)
59
+ await rollover_kb_index(context, kbid, drain_nodes=drain_node_ids)
60
60
 
61
61
 
62
62
  async def kb_has_shards_on_drain_nodes(kbid: str, drain_node_ids: list[str]) -> bool:
63
- async with datamanagers.with_transaction(read_only=True) as txn:
63
+ async with datamanagers.with_ro_transaction() as txn:
64
64
  shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
65
65
  if not shards:
66
66
  logger.warning("Shards object not found", extra={"kbid": kbid})
@@ -17,31 +17,28 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from unittest import mock
21
20
 
22
- import pytest
21
+ """Migration #21
23
22
 
24
- from nucliadb.standalone.run import run, run_async_nucliadb
25
- from nucliadb.standalone.settings import Settings
23
+ With the new vectorsets implementation, we need to store some information on
24
+ maindb. As the key "/kbs/{kbid}/vectorsets" was already used at some point, this
25
+ migration will ensure to overwrite the key and set the new value
26
26
 
27
- STANDALONE_RUN = "nucliadb.standalone.run"
27
+ """
28
28
 
29
+ import logging
29
30
 
30
- @pytest.fixture(scope="function", autouse=True)
31
- def mocked_deps():
32
- with (
33
- mock.patch("uvicorn.Server.run"),
34
- mock.patch("nucliadb.standalone.run.parser", return_value=Settings()),
35
- mock.patch(f"{STANDALONE_RUN}.get_latest_nucliadb", return_value="1.0.0"),
36
- mock.patch("uvicorn.Server.startup"),
37
- mock.patch(f"{STANDALONE_RUN}.run_migrations"),
38
- ):
39
- yield
31
+ from nucliadb.common import datamanagers
32
+ from nucliadb.migrator.context import ExecutionContext
40
33
 
34
+ logger = logging.getLogger(__name__)
41
35
 
42
- def test_run():
43
- run()
44
36
 
37
+ async def migrate(context: ExecutionContext) -> None: ...
45
38
 
46
- async def test_run_async_nucliadb():
47
- await run_async_nucliadb(Settings())
39
+
40
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
41
+ async with context.kv_driver.transaction() as txn:
42
+ logger.info(f"Overwriting vectorsets key", extra={"kbid": kbid})
43
+ await datamanagers.vectorsets.initialize(txn, kbid=kbid)
44
+ await txn.commit()
@@ -17,22 +17,27 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from unittest.mock import Mock
21
20
 
22
- from starlette.routing import Mount
21
+ """Migration #22
23
22
 
24
- from nucliadb.openapi import extract_openapi, is_versioned_route
25
- from nucliadb.search.app import application
23
+ There was a bug while ingesting/indexing that made paragraphs not being properly
24
+ removed in some cases. This rollover migration ensures data is consistently
25
+ indexed.
26
26
 
27
+ """
27
28
 
28
- def get_route(path):
29
- return Mount(path=path, app=Mock())
29
+ import logging
30
30
 
31
+ from nucliadb.migrator.context import ExecutionContext
31
32
 
32
- def test_is_versioned_route():
33
- assert is_versioned_route(get_route(path="/api/v1/search"))
34
- assert not is_versioned_route(get_route(path="/metrics"))
33
+ logger = logging.getLogger(__name__)
35
34
 
36
35
 
37
- def test_extract_openapi():
38
- assert extract_openapi(application, "1", "commitid", "nucliadb_search")
36
+ async def migrate(context: ExecutionContext) -> None: ...
37
+
38
+
39
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
40
+ """
41
+ We only need 1 rollover migration defined at a time; otherwise, we will
42
+ possibly run many for a kb when we only ever need to run one
43
+ """
@@ -0,0 +1,80 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #23
22
+
23
+ Backfill the data into the PG catalog
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common import datamanagers
31
+ from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
+ from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
33
+ from nucliadb.migrator.context import ExecutionContext
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ async def migrate(context: ExecutionContext) -> None: ...
39
+
40
+
41
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
42
+ if not isinstance(context.kv_driver, PGDriver):
43
+ return
44
+
45
+ BATCH_SIZE = 100
46
+ async with context.kv_driver.transaction() as txn:
47
+ txn = cast(PGTransaction, txn)
48
+ continue_sql = ""
49
+ while True:
50
+ async with txn.connection.cursor() as cur:
51
+ # Get list of resources except those already in the catalog
52
+ await cur.execute(
53
+ f"""
54
+ SELECT SPLIT_PART(key, '/', 5)::UUID FROM resources
55
+ LEFT JOIN catalog ON kbid = %s AND SPLIT_PART(key, '/', 5)::UUID = rid
56
+ WHERE key SIMILAR TO %s
57
+ AND rid IS NULL
58
+ {continue_sql}
59
+ ORDER BY key
60
+ LIMIT %s
61
+ """,
62
+ (kbid, f"/kbs/{kbid}/r/[a-f0-9]*", BATCH_SIZE),
63
+ )
64
+ resources_to_index = [r[0] for r in await cur.fetchall()]
65
+ if len(resources_to_index) == 0:
66
+ return
67
+
68
+ # Index each resource
69
+ for rid in resources_to_index:
70
+ rid = str(rid).replace("-", "")
71
+ resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
72
+ if resource is None:
73
+ logger.warning(f"Could not load resource {rid} for kbid {kbid}")
74
+ continue
75
+
76
+ await resource.compute_global_tags(resource.indexer)
77
+ await pgcatalog_update(txn, kbid, resource)
78
+
79
+ await txn.commit()
80
+ continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"