nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -25,10 +25,8 @@ from pydantic_settings import BaseSettings
25
25
 
26
26
 
27
27
  class DriverConfig(Enum):
28
- REDIS = "redis"
29
- TIKV = "tikv"
30
28
  PG = "pg"
31
- LOCAL = "local"
29
+ LOCAL = "local" # Not recommended for production
32
30
  NOT_SET = "notset" # setting not provided
33
31
 
34
32
  @classmethod
@@ -42,19 +40,7 @@ class DriverConfig(Enum):
42
40
 
43
41
 
44
42
  class DriverSettings(BaseSettings):
45
- driver: DriverConfig = Field(
46
- default=DriverConfig.NOT_SET, description="K/V storage driver"
47
- )
48
- driver_redis_url: Optional[str] = Field(
49
- default=None, description="Redis URL. Example: redis://localhost:6379"
50
- )
51
- driver_tikv_url: Optional[list[str]] = Field(
52
- default=None,
53
- description=(
54
- "TiKV PD (Placement Driver) URLs. The URL to the cluster manager of"
55
- "TiKV. Example: '[\"tikv-pd.svc:2379\"]'"
56
- ),
57
- )
43
+ driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
58
44
  driver_local_url: Optional[str] = Field(
59
45
  default=None,
60
46
  description="Local path to store data on file system. Example: /nucliadb/data/main",
@@ -63,13 +49,17 @@ class DriverSettings(BaseSettings):
63
49
  default=None,
64
50
  description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.", # noqa
65
51
  )
52
+ driver_pg_connection_pool_min_size: int = Field(
53
+ default=10,
54
+ description="PostgreSQL min pool size. The minimum number of connections to the PostgreSQL server.",
55
+ )
66
56
  driver_pg_connection_pool_max_size: int = Field(
67
57
  default=20,
68
58
  description="PostgreSQL max pool size. The maximum number of connections to the PostgreSQL server.",
69
59
  )
70
- driver_tikv_connection_pool_size: int = Field(
71
- default=3,
72
- description="TiKV max pool size. The maximum number of connections to the TiKV server.",
60
+ driver_pg_connection_pool_acquire_timeout_ms: int = Field(
61
+ default=1000,
62
+ description="PostgreSQL pool acquire timeout in ms. The maximum time to wait until a connection becomes available.",
73
63
  )
74
64
 
75
65
 
@@ -87,7 +77,7 @@ class Settings(DriverSettings):
87
77
  total_replicas: int = 1 # number of ingest processor replicas in the cluster
88
78
  nuclia_partitions: int = 50
89
79
 
90
- max_receive_message_length: int = 4
80
+ max_receive_message_length: int = 500 # In MB
91
81
 
92
82
  # Search query timeouts
93
83
  relation_search_timeout: float = 10.0
nucliadb/ingest/utils.py CHANGED
@@ -19,9 +19,8 @@
19
19
  #
20
20
  from typing import Optional
21
21
 
22
- from nucliadb_protos.writer_pb2_grpc import WriterStub
23
-
24
22
  from nucliadb.common.maindb.utils import setup_driver
23
+ from nucliadb_protos.writer_pb2_grpc import WriterStub
25
24
  from nucliadb_utils.grpc import get_traced_grpc_channel
26
25
  from nucliadb_utils.settings import nucliadb_settings
27
26
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
@@ -37,11 +36,9 @@ async def start_ingest(service_name: Optional[str] = None):
37
36
  if nucliadb_settings.nucliadb_ingest is not None:
38
37
  # Its distributed lets create a GRPC client
39
38
  # We want Jaeger telemetry enabled
40
- channel = get_traced_grpc_channel(
41
- nucliadb_settings.nucliadb_ingest, service_name or "ingest"
42
- )
39
+ channel = get_traced_grpc_channel(nucliadb_settings.nucliadb_ingest, service_name or "ingest")
43
40
  set_utility(Utility.CHANNEL, channel)
44
- ingest = WriterStub(channel) # type: ignore
41
+ ingest = WriterStub(channel)
45
42
  set_utility(Utility.INGEST, ingest)
46
43
  else:
47
44
  # Its not distributed create a ingest
@@ -20,16 +20,21 @@
20
20
  import contextlib
21
21
  import json
22
22
  import logging
23
+ import os
24
+ from abc import ABC, abstractmethod
23
25
  from collections.abc import AsyncIterator
24
- from enum import Enum
26
+ from enum import Enum, IntEnum
25
27
  from typing import Any, Optional, Union
26
28
 
27
29
  import backoff
28
30
  import httpx
29
31
  from fastapi import Request, Response
30
32
  from fastapi.responses import StreamingResponse
33
+ from lru import LRU
31
34
  from pydantic import BaseModel, Field, model_validator
35
+ from typing_extensions import Self
32
36
 
37
+ from nucliadb_protos import knowledgebox_pb2, utils_pb2
33
38
  from nucliadb_telemetry import errors
34
39
  from nucliadb_utils.settings import is_onprem_nucliadb, nuclia_settings
35
40
 
@@ -50,7 +55,32 @@ WHITELISTED_HEADERS = {
50
55
 
51
56
  class LearningService(Enum):
52
57
  CONFIG = "config"
53
- COLLECTOR = "collector-api"
58
+
59
+
60
+ class SimilarityFunction(IntEnum):
61
+ # Keep this in sync with learning config repo
62
+ # It's an IntEnum to match the protobuf definition
63
+ DOT = 0
64
+ COSINE = 1
65
+
66
+
67
+ class SemanticConfig(BaseModel):
68
+ # Keep this in sync with learning config repo
69
+ similarity: SimilarityFunction
70
+ size: int
71
+ threshold: float
72
+ matryoshka_dims: list[int] = []
73
+
74
+ def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
75
+ semantic_model = knowledgebox_pb2.SemanticModelMetadata()
76
+ LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
77
+ SimilarityFunction.COSINE: utils_pb2.VectorSimilarity.COSINE,
78
+ SimilarityFunction.DOT: utils_pb2.VectorSimilarity.DOT,
79
+ }
80
+ semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[self.similarity]
81
+ semantic_model.vector_dimension = self.size
82
+ semantic_model.matryoshka_dimensions.extend(self.matryoshka_dims)
83
+ return semantic_model
54
84
 
55
85
 
56
86
  # Subset of learning configuration of nucliadb's interest. Look at
@@ -69,52 +99,104 @@ class LearningConfiguration(BaseModel):
69
99
  default=None, alias="semantic_matryoshka_dims"
70
100
  )
71
101
 
72
- @model_validator(mode="after")
102
+ semantic_models: list[str] = Field(default_factory=list)
103
+
104
+ # This is where the config for each semantic model (aka vectorsets) is returned
105
+ semantic_model_configs: dict[str, SemanticConfig] = Field(default_factory=dict)
106
+
107
+ @model_validator(mode="before")
73
108
  @classmethod
74
- def validate_matryoshka_and_vector_dimension_consistency(cls, values):
75
- vector_size = values.semantic_vector_size
76
- matryoshka_dimensions = values.semantic_matryoshka_dimensions or []
109
+ def maintain_bw_compatibility_with_single_model_configs(cls, data: Any) -> Any:
110
+ if isinstance(data, dict):
111
+ if not data.get("semantic_model", None) and len(data.get("semantic_models", [])) > 0:
112
+ data["semantic_model"] = data["semantic_models"][0]
113
+ return data
114
+
115
+ @model_validator(mode="after")
116
+ def validate_matryoshka_and_vector_dimension_consistency(self) -> Self:
117
+ vector_size = self.semantic_vector_size
118
+ matryoshka_dimensions = self.semantic_matryoshka_dimensions or []
77
119
  if (
78
120
  len(matryoshka_dimensions) > 0
79
121
  and vector_size is not None
80
122
  and vector_size not in matryoshka_dimensions
81
123
  ):
82
- raise ValueError(
83
- "Semantic vector size is inconsistent with matryoshka dimensions"
84
- )
85
- return values
124
+ raise ValueError("Semantic vector size is inconsistent with matryoshka dimensions")
125
+ return self
126
+
127
+ def into_semantic_models_metadata(
128
+ self,
129
+ ) -> dict[str, knowledgebox_pb2.SemanticModelMetadata]:
130
+ result = {}
131
+ for model_name, config in self.semantic_model_configs.items():
132
+ result[model_name] = config.into_semantic_model_metadata()
133
+ return result
134
+
135
+ def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
136
+ semantic_model = knowledgebox_pb2.SemanticModelMetadata()
137
+
138
+ LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
139
+ "cosine": utils_pb2.VectorSimilarity.COSINE,
140
+ "dot": utils_pb2.VectorSimilarity.DOT,
141
+ }
142
+ semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[
143
+ self.semantic_vector_similarity.lower()
144
+ ]
145
+
146
+ if self.semantic_vector_size is not None:
147
+ semantic_model.vector_dimension = self.semantic_vector_size
148
+ else:
149
+ logger.warning("Vector dimension not set!")
150
+
151
+ if self.semantic_matryoshka_dimensions is not None:
152
+ semantic_model.matryoshka_dimensions.extend(self.semantic_matryoshka_dimensions)
153
+
154
+ return semantic_model
155
+
156
+
157
+ class ProxiedLearningConfigError(Exception):
158
+ def __init__(self, status_code: int, content: bytes, content_type: str):
159
+ self.status_code = status_code
160
+ self.content = content
161
+ self.content_type = content_type
162
+
163
+
164
+ def raise_for_status(response: httpx.Response) -> None:
165
+ try:
166
+ response.raise_for_status()
167
+ except httpx.HTTPStatusError as err:
168
+ content_type = err.response.headers.get("Content-Type", "application/json")
169
+ raise ProxiedLearningConfigError(
170
+ status_code=err.response.status_code,
171
+ content=err.response.content,
172
+ content_type=content_type,
173
+ )
86
174
 
87
175
 
88
176
  async def get_configuration(
89
177
  kbid: str,
90
178
  ) -> Optional[LearningConfiguration]:
91
- async with learning_config_client() as client:
92
- resp = await client.get(f"config/{kbid}")
93
- try:
94
- resp.raise_for_status()
95
- except httpx.HTTPStatusError as err:
96
- if err.response.status_code == 404:
97
- return None
98
- raise
99
- return LearningConfiguration.parse_obj(resp.json())
179
+ return await learning_config_service().get_configuration(kbid)
100
180
 
101
181
 
102
182
  async def set_configuration(
103
183
  kbid: str,
104
184
  config: dict[str, Any],
105
185
  ) -> LearningConfiguration:
106
- async with learning_config_client() as client:
107
- resp = await client.post(f"config/{kbid}", json=config)
108
- resp.raise_for_status()
109
- return LearningConfiguration.parse_obj(resp.json())
186
+ return await learning_config_service().set_configuration(kbid, config)
187
+
188
+
189
+ async def update_configuration(
190
+ kbid: str,
191
+ config: dict[str, Any],
192
+ ) -> None:
193
+ return await learning_config_service().update_configuration(kbid, config)
110
194
 
111
195
 
112
196
  async def delete_configuration(
113
197
  kbid: str,
114
198
  ) -> None:
115
- async with learning_config_client() as client:
116
- resp = await client.delete(f"config/{kbid}")
117
- resp.raise_for_status()
199
+ return await learning_config_service().delete_configuration(kbid)
118
200
 
119
201
 
120
202
  async def learning_config_proxy(
@@ -132,21 +214,6 @@ async def learning_config_proxy(
132
214
  )
133
215
 
134
216
 
135
- async def learning_collector_proxy(
136
- request: Request,
137
- method: str,
138
- url: str,
139
- extra_headers: Optional[dict[str, str]] = None,
140
- ) -> Union[Response, StreamingResponse]:
141
- return await proxy(
142
- service=LearningService.COLLECTOR,
143
- request=request,
144
- method=method,
145
- url=url,
146
- extra_headers=extra_headers,
147
- )
148
-
149
-
150
217
def is_white_listed_header(header: str) -> bool:
    """Tell whether the lowercased ``header`` name is in WHITELISTED_HEADERS."""
    normalized = header.lower()
    return normalized in WHITELISTED_HEADERS
152
219
 
@@ -238,13 +305,9 @@ async def proxy(
238
305
 
239
306
def get_base_url(service: LearningService) -> str:
    """
    Build the base URL used to reach the learning ``service``.

    On-prem deployments go through the public URL of the configured zone
    (``.../api/v1``); otherwise the internal service URL template is filled
    with ``service.value`` (``.../api/v1/internal``).
    """
    if not is_onprem_nucliadb():
        internal_url = nuclia_settings.learning_internal_svc_base_url.format(service=service.value)
        return f"{internal_url}/api/v1/internal"

    public_url = nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone)
    return f"{public_url}/api/v1"
249
312
 
250
313
 
@@ -273,9 +336,7 @@ async def service_client(
273
336
  # This is a workaround to be able to run integration tests that start nucliadb with docker.
274
337
  # The learning APIs are not available in the docker setup, so we use a dummy client.
275
338
  client = DummyClient(base_url=base_url, headers=headers)
276
- logger.warning(
277
- "Using dummy client. If you see this in production, something is wrong."
278
- )
339
+ logger.warning("Using dummy client. If you see this in production, something is wrong.")
279
340
  else:
280
341
  client = httpx.AsyncClient(base_url=base_url, headers=headers) # type: ignore
281
342
  try:
@@ -324,14 +385,31 @@ class DummyClient(httpx.AsyncClient):
324
385
  return self._handle_request("DELETE", *args, **kwargs)
325
386
 
326
387
  def get_config(self, *args: Any, **kwargs: Any):
388
+ size = 768 if os.environ.get("TEST_SENTENCE_ENCODER") == "multilingual-2023-02-21" else 512
327
389
  lconfig = LearningConfiguration(
328
390
  semantic_model="multilingual",
329
391
  semantic_vector_similarity="cosine",
330
- semantic_vector_size=None,
392
+ semantic_vector_size=size,
331
393
  semantic_threshold=None,
332
394
  semantic_matryoshka_dims=[],
395
+ semantic_model_configs={
396
+ "multilingual": SemanticConfig(
397
+ similarity=SimilarityFunction.COSINE,
398
+ size=size,
399
+ threshold=0,
400
+ matryoshka_dims=[],
401
+ )
402
+ },
333
403
  )
334
- return self._response(content=lconfig.dict())
404
+ return self._response(content=lconfig.model_dump())
405
+
406
+ def post_config(self, *args: Any, **kwargs: Any):
407
+ # simulate post that returns the created config
408
+ return self.get_config(*args, **kwargs)
409
+
410
+ def patch_config(self, *args: Any, **kwargs: Any):
411
+ # simulate patch that returns the updated config
412
+ return self.get_config(*args, **kwargs)
335
413
 
336
414
  async def request( # type: ignore
337
415
  self,
@@ -341,9 +419,7 @@ class DummyClient(httpx.AsyncClient):
341
419
  content=None,
342
420
  headers=None,
343
421
  ) -> httpx.Response:
344
- return self._handle_request(
345
- method, url, params=params, content=content, headers=headers
346
- )
422
+ return self._handle_request(method, url, params=params, content=content, headers=headers)
347
423
 
348
424
  def _handle_request(self, *args: Any, **kwargs: Any) -> httpx.Response:
349
425
  """
@@ -357,3 +433,114 @@ class DummyClient(httpx.AsyncClient):
357
433
  return getattr(self, method)(*args, **kwargs)
358
434
  else:
359
435
  return self._response()
436
+
437
+
438
class LearningConfigService(ABC):
    """Abstract interface for reading and writing the learning configuration of a knowledge box."""

    @abstractmethod
    async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
        """Return the learning configuration of ``kbid``, or None."""

    @abstractmethod
    async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
        """Store ``config`` for ``kbid`` and return the resulting configuration."""

    @abstractmethod
    async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
        """Apply ``config`` as an update to the configuration of ``kbid``."""

    @abstractmethod
    async def delete_configuration(self, kbid: str) -> None:
        """Delete the configuration of ``kbid``."""
450
+
451
+
452
class ProxiedLearningConfig(LearningConfigService):
    """
    LearningConfigService that forwards every operation over HTTP to the
    ``config/{kbid}`` endpoint of the learning config service.
    """

    @contextlib.asynccontextmanager
    async def _client(self) -> AsyncIterator[httpx.AsyncClient]:
        """Yield an httpx.AsyncClient set up with the config service base URL and auth headers."""
        http_client = httpx.AsyncClient(
            base_url=get_base_url(LearningService.CONFIG),
            headers=get_auth_headers(),
        )
        async with http_client as client:
            yield client

    async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
        """GET the configuration of ``kbid``; a 404 answer is reported as None."""
        async with self._client() as client:
            response = await client.get(f"config/{kbid}")
            try:
                raise_for_status(response)
            except ProxiedLearningConfigError as err:
                # Any error other than "not found" is propagated to the caller.
                if err.status_code != 404:
                    raise
                return None
            return LearningConfiguration.model_validate(response.json())

    async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
        """POST ``config`` for ``kbid`` and return the configuration parsed from the answer."""
        async with self._client() as client:
            response = await client.post(f"config/{kbid}", json=config)
            raise_for_status(response)
            return LearningConfiguration.model_validate(response.json())

    async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
        """PATCH the configuration of ``kbid`` with ``config``."""
        async with self._client() as client:
            response = await client.patch(f"config/{kbid}", json=config)
            raise_for_status(response)

    async def delete_configuration(self, kbid: str) -> None:
        """DELETE the configuration of ``kbid``."""
        async with self._client() as client:
            response = await client.delete(f"config/{kbid}")
            raise_for_status(response)
488
+
489
+
490
# Module-level store of learning configurations keyed by kbid, backed by an LRU of size 50.
_IN_MEMORY_CONFIGS: dict[str, LearningConfiguration] = LRU(50)  # type: ignore
492
+
493
+
494
class InMemoryLearningConfig(LearningConfigService):
    """
    LearningConfigService that keeps configurations in the module-level
    ``_IN_MEMORY_CONFIGS`` store instead of calling a remote service.
    """

    def __init__(self):
        # The methods below read and write _IN_MEMORY_CONFIGS, not this attribute.
        self.in_memory_configs = {}

    async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
        """Return the stored configuration of ``kbid``, or None when there is none."""
        return _IN_MEMORY_CONFIGS.get(kbid, None)

    async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
        """
        Store and return the configuration of ``kbid``.

        A non-empty ``config`` is validated into a LearningConfiguration. An
        empty one produces a default configuration whose model name comes from
        the TEST_SENTENCE_ENCODER environment variable ("multilingual" when
        unset), with vector size 768 for "multilingual-2023-02-21" and 512
        otherwise.
        """
        if config:
            stored = LearningConfiguration.model_validate(config)
        else:
            model_name = os.environ.get("TEST_SENTENCE_ENCODER", "multilingual")
            if model_name == "multilingual-2023-02-21":
                vector_size = 768
            else:
                vector_size = 512
            model_config = SemanticConfig(
                similarity=SimilarityFunction.COSINE,
                size=vector_size,
                threshold=0,
                matryoshka_dims=[],
            )
            stored = LearningConfiguration(
                semantic_model=model_name,
                semantic_vector_similarity="cosine",
                semantic_vector_size=vector_size,
                semantic_threshold=None,
                semantic_matryoshka_dims=[],
                semantic_models=[model_name],
                semantic_model_configs={model_name: model_config},
            )

        _IN_MEMORY_CONFIGS[kbid] = stored
        return stored

    async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
        """
        Replace the stored configuration of ``kbid`` with a copy updated from ``config``.

        Raises:
            ValueError: if no configuration is stored for ``kbid``.
        """
        if kbid not in _IN_MEMORY_CONFIGS:
            raise ValueError(f"Configuration for kbid {kbid} not found")
        current = _IN_MEMORY_CONFIGS[kbid]
        _IN_MEMORY_CONFIGS[kbid] = current.model_copy(update=config)

    async def delete_configuration(self, kbid: str) -> None:
        """Drop the stored configuration of ``kbid``; a missing entry is ignored."""
        _IN_MEMORY_CONFIGS.pop(kbid, None)
540
+
541
+
542
def learning_config_service() -> LearningConfigService:
    """
    Build the learning config service to use.

    Returns a new InMemoryLearningConfig when
    ``nuclia_settings.dummy_learning_services`` is truthy, and a new
    ProxiedLearningConfig otherwise.
    """
    if not nuclia_settings.dummy_learning_services:
        return ProxiedLearningConfig()
    return InMemoryLearningConfig()
@@ -20,12 +20,14 @@
20
20
  from __future__ import annotations
21
21
 
22
22
  import asyncio
23
- from typing import AsyncGenerator, Callable
23
+ from typing import AsyncGenerator, Callable, Tuple, cast
24
24
 
25
25
  from nucliadb import logger
26
26
  from nucliadb.common import datamanagers
27
27
  from nucliadb.common.cluster import manager as cluster_manager
28
28
  from nucliadb.common.context import ApplicationContext
29
+ from nucliadb.common.maindb.pg import PGDriver
30
+ from nucliadb.common.maindb.utils import get_driver
29
31
  from nucliadb.migrator.datamanager import MigrationsDataManager
30
32
  from nucliadb_telemetry import metrics
31
33
  from nucliadb_telemetry.logs import setup_logging
@@ -34,9 +36,9 @@ from nucliadb_utils.fastapi.run import serve_metrics
34
36
 
35
37
# Gauge labelled by node.
SHARD_COUNT = metrics.Gauge("nucliadb_node_shard_count", labels={"node": ""})

# Gauge labelled by migration type and version.
MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "version": ""})

# Unlabelled gauge for the pending resources count.
PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
40
42
 
41
43
 
42
44
  async def update_node_metrics(context: ApplicationContext):
@@ -57,7 +59,7 @@ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
57
59
  """
58
60
  Return a list of all KB ids.
59
61
  """
60
- async with context.kv_driver.transaction() as txn:
62
+ async with context.kv_driver.transaction(read_only=True) as txn:
61
63
  async for kbid, _ in datamanagers.kb.get_kbs(txn):
62
64
  yield kbid
63
65
 
@@ -72,9 +74,7 @@ async def update_migration_metrics(context: ApplicationContext):
72
74
  mdm = MigrationsDataManager(context.kv_driver)
73
75
  global_info = await mdm.get_global_info()
74
76
  if global_info is not None:
75
- MIGRATION_COUNT.set(
76
- 1, labels=dict(type="global", version=str(global_info.current_version))
77
- )
77
+ MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
78
78
 
79
79
  version_count: dict[str, int] = {}
80
80
  async for kbid in iter_kbids(context):
@@ -88,9 +88,25 @@ async def update_migration_metrics(context: ApplicationContext):
88
88
  MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
89
89
 
90
90
 
91
- async def run_exporter_task(
92
- context: ApplicationContext, exporter_task: Callable, interval: int
93
- ):
91
async def update_resource_metrics(context: ApplicationContext):
    """
    Report the number of pending resources older than some estimated processing time.

    Counts catalog rows labelled /n/s/PENDING whose modification date
    (creation date when there is none) is between one month and six hours
    ago, and publishes the count on PENDING_RESOURCE_COUNT. Does nothing when
    the driver is not a PGDriver.
    """
    pg_driver = get_driver()
    if not isinstance(pg_driver, PGDriver):
        return

    pending_query = (
        "SELECT COUNT(*) FROM catalog "
        "WHERE labels @> '{/n/s/PENDING}' "
        "AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
    )
    async with pg_driver._get_connection() as connection:
        async with connection.cursor() as cursor:
            await cursor.execute(pending_query)
            row = cast(Tuple[int], await cursor.fetchone())
            pending = row[0]
    PENDING_RESOURCE_COUNT.set(pending)
107
+
108
+
109
+ async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
94
110
  """
95
111
  Run coroutine infinitely, catching exceptions and logging them.
96
112
  It will wait for the interval before running again.
@@ -100,9 +116,7 @@ async def run_exporter_task(
100
116
  try:
101
117
  await exporter_task(context)
102
118
  except Exception:
103
- logger.error(
104
- f"Error on exporter task {exporter_task.__name__}", exc_info=True
105
- )
119
+ logger.error(f"Error on exporter task {exporter_task.__name__}", exc_info=True)
106
120
  await asyncio.sleep(interval)
107
121
  except asyncio.CancelledError:
108
122
  pass
@@ -114,12 +128,9 @@ async def run_exporter(context: ApplicationContext):
114
128
  for export_task, interval in [
115
129
  (update_node_metrics, 10),
116
130
  (update_migration_metrics, 60 * 3),
131
+ (update_resource_metrics, 60 * 5),
117
132
  ]:
118
- tasks.append(
119
- asyncio.create_task(
120
- run_exporter_task(context, export_task, interval=interval)
121
- )
122
- )
133
+ tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
123
134
  try:
124
135
  while True:
125
136
  await asyncio.sleep(10)
@@ -39,9 +39,7 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
39
39
  exposed_headers.append(PROCESS_TIME_HEADER)
40
40
  response.headers[ACCESS_CONTROL_EXPOSE_HEADER] = ",".join(exposed_headers)
41
41
 
42
- async def dispatch(
43
- self, request: Request, call_next: RequestResponseEndpoint
44
- ) -> Response:
42
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
45
43
  response = None
46
44
  start = time.perf_counter()
47
45
  try:
@@ -53,9 +53,7 @@ def validate():
53
53
  versions = set()
54
54
  for migration in migrations:
55
55
  if migration.version in versions:
56
- raise MigrationValidationError(
57
- f"Migration {migration.version} is duplicated"
58
- )
56
+ raise MigrationValidationError(f"Migration {migration.version} is duplicated")
59
57
  versions.add(migration.version)
60
58
 
61
59
 
@@ -47,17 +47,19 @@ class MigrationsDataManager:
47
47
  self.driver = driver
48
48
 
49
49
  async def schedule_all_kbs(self, target_version: int) -> None:
50
+ # Get all kb ids
51
+ async with self.driver.transaction(read_only=True) as txn:
52
+ kbids = [kbid async for kbid, _ in datamanagers.kb.get_kbs(txn)]
53
+ # Schedule the migrations
50
54
  async with self.driver.transaction() as txn:
51
- async for kbid, _ in datamanagers.kb.get_kbs(txn):
52
- await txn.set(
53
- MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode()
54
- )
55
+ for kbid in kbids:
56
+ await txn.set(MIGRATIONS_KEY.format(kbid=kbid), str(target_version).encode())
55
57
  await txn.commit()
56
58
 
57
- async def get_kb_migrations(self, limit: int = 100) -> list[str]:
59
+ async def get_kb_migrations(self) -> list[str]:
58
60
  keys = []
59
61
  async with self.driver.transaction() as txn:
60
- async for key in txn.keys(MIGRATIONS_CONTAINER_KEY, count=limit):
62
+ async for key in txn.keys(MIGRATIONS_CONTAINER_KEY):
61
63
  keys.append(key.split("/")[-1])
62
64
 
63
65
  return keys
@@ -68,7 +70,7 @@ class MigrationsDataManager:
68
70
  await txn.commit()
69
71
 
70
72
  async def get_kb_info(self, kbid: str) -> Optional[KnowledgeBoxInfo]:
71
- async with self.driver.transaction() as txn:
73
+ async with self.driver.transaction(read_only=True) as txn:
72
74
  kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
73
75
  if kb_config is None:
74
76
  return None
@@ -76,7 +78,7 @@ class MigrationsDataManager:
76
78
 
77
79
  async def update_kb_info(self, *, kbid: str, current_version: int) -> None:
78
80
  async with self.driver.transaction() as txn:
79
- kb_config = await datamanagers.kb.get_config(txn, kbid=kbid)
81
+ kb_config = await datamanagers.kb.get_config(txn, kbid=kbid, for_update=True)
80
82
  if kb_config is None:
81
83
  raise Exception(f"KB {kbid} does not exist")
82
84
  kb_config.migration_version = current_version
@@ -84,15 +86,13 @@ class MigrationsDataManager:
84
86
  await txn.commit()
85
87
 
86
88
  async def get_global_info(self) -> GlobalInfo:
87
- async with self.driver.transaction() as txn:
89
+ async with self.driver.transaction(read_only=True) as txn:
88
90
  raw_pb = await txn.get(MIGRATION_INFO_KEY)
89
91
  if raw_pb is None:
90
92
  return GlobalInfo(current_version=0, target_version=None)
91
93
  pb = migrations_pb2.MigrationInfo()
92
94
  pb.ParseFromString(raw_pb)
93
- return GlobalInfo(
94
- current_version=pb.current_version, target_version=pb.target_version
95
- )
95
+ return GlobalInfo(current_version=pb.current_version, target_version=pb.target_version)
96
96
 
97
97
  async def update_global_info(
98
98
  self,
@@ -101,7 +101,7 @@ class MigrationsDataManager:
101
101
  target_version: Union[int, None, _Unset] = _UNSET,
102
102
  ) -> None:
103
103
  async with self.driver.transaction() as txn:
104
- raw_pb = await txn.get(MIGRATION_INFO_KEY)
104
+ raw_pb = await txn.get(MIGRATION_INFO_KEY, for_update=True)
105
105
  pb = migrations_pb2.MigrationInfo()
106
106
  if raw_pb is not None:
107
107
  pb.ParseFromString(raw_pb)