nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -20,14 +20,13 @@
20
20
  from enum import Enum
21
21
  from typing import Optional
22
22
 
23
- from pydantic import BaseSettings, Field
23
+ from pydantic import Field
24
+ from pydantic_settings import BaseSettings
24
25
 
25
26
 
26
- class DriverConfig(str, Enum):
27
- REDIS = "redis"
28
- TIKV = "tikv"
27
+ class DriverConfig(Enum):
29
28
  PG = "pg"
30
- LOCAL = "local"
29
+ LOCAL = "local" # Not recommended for production
31
30
  NOT_SET = "notset" # setting not provided
32
31
 
33
32
  @classmethod
@@ -41,19 +40,7 @@ class DriverConfig(str, Enum):
41
40
 
42
41
 
43
42
  class DriverSettings(BaseSettings):
44
- driver: DriverConfig = Field(
45
- default=DriverConfig.NOT_SET, description="K/V storage driver"
46
- )
47
- driver_redis_url: Optional[str] = Field(
48
- default=None, description="Redis URL. Example: redis://localhost:6379"
49
- )
50
- driver_tikv_url: Optional[list[str]] = Field(
51
- default=None,
52
- description=(
53
- "TiKV PD (Placement Driver) URLs. The URL to the cluster manager of"
54
- "TiKV. Example: '[\"tikv-pd.svc:2379\"]'"
55
- ),
56
- )
43
+ driver: DriverConfig = Field(default=DriverConfig.PG, description="K/V storage driver")
57
44
  driver_local_url: Optional[str] = Field(
58
45
  default=None,
59
46
  description="Local path to store data on file system. Example: /nucliadb/data/main",
@@ -62,13 +49,17 @@ class DriverSettings(BaseSettings):
62
49
  default=None,
63
50
  description="PostgreSQL DSN. The connection string to the PG server. Example: postgres://username:password@postgres:5432/nucliadb.", # noqa
64
51
  )
52
+ driver_pg_connection_pool_min_size: int = Field(
53
+ default=10,
54
+ description="PostgreSQL min pool size. The minimum number of connections to the PostgreSQL server.",
55
+ )
65
56
  driver_pg_connection_pool_max_size: int = Field(
66
57
  default=20,
67
58
  description="PostgreSQL max pool size. The maximum number of connections to the PostgreSQL server.",
68
59
  )
69
- driver_tikv_connection_pool_size: int = Field(
70
- default=3,
71
- description="TiKV max pool size. The maximum number of connections to the TiKV server.",
60
+ driver_pg_connection_pool_acquire_timeout_ms: int = Field(
61
+ default=1000,
62
+ description="PostgreSQL pool acquire timeout in ms. The maximum time to wait until a connection becomes available.",
72
63
  )
73
64
 
74
65
 
@@ -86,7 +77,7 @@ class Settings(DriverSettings):
86
77
  total_replicas: int = 1 # number of ingest processor replicas in the cluster
87
78
  nuclia_partitions: int = 50
88
79
 
89
- max_receive_message_length: int = 4
80
+ max_receive_message_length: int = 500 # In MB
90
81
 
91
82
  # Search query timeouts
92
83
  relation_search_timeout: float = 10.0
nucliadb/ingest/utils.py CHANGED
@@ -19,9 +19,8 @@
19
19
  #
20
20
  from typing import Optional
21
21
 
22
- from nucliadb_protos.writer_pb2_grpc import WriterStub
23
-
24
22
  from nucliadb.common.maindb.utils import setup_driver
23
+ from nucliadb_protos.writer_pb2_grpc import WriterStub
25
24
  from nucliadb_utils.grpc import get_traced_grpc_channel
26
25
  from nucliadb_utils.settings import nucliadb_settings
27
26
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
@@ -37,11 +36,9 @@ async def start_ingest(service_name: Optional[str] = None):
37
36
  if nucliadb_settings.nucliadb_ingest is not None:
38
37
  # Its distributed lets create a GRPC client
39
38
  # We want Jaeger telemetry enabled
40
- channel = get_traced_grpc_channel(
41
- nucliadb_settings.nucliadb_ingest, service_name or "ingest"
42
- )
39
+ channel = get_traced_grpc_channel(nucliadb_settings.nucliadb_ingest, service_name or "ingest")
43
40
  set_utility(Utility.CHANNEL, channel)
44
- ingest = WriterStub(channel) # type: ignore
41
+ ingest = WriterStub(channel)
45
42
  set_utility(Utility.INGEST, ingest)
46
43
  else:
47
44
  # Its not distributed create a ingest
@@ -20,16 +20,21 @@
20
20
  import contextlib
21
21
  import json
22
22
  import logging
23
+ import os
24
+ from abc import ABC, abstractmethod
23
25
  from collections.abc import AsyncIterator
24
- from enum import Enum
26
+ from enum import Enum, IntEnum
25
27
  from typing import Any, Optional, Union
26
28
 
27
29
  import backoff
28
30
  import httpx
29
31
  from fastapi import Request, Response
30
32
  from fastapi.responses import StreamingResponse
31
- from pydantic import BaseModel
33
+ from lru import LRU
34
+ from pydantic import BaseModel, Field, model_validator
35
+ from typing_extensions import Self
32
36
 
37
+ from nucliadb_protos import knowledgebox_pb2, utils_pb2
33
38
  from nucliadb_telemetry import errors
34
39
  from nucliadb_utils.settings import is_onprem_nucliadb, nuclia_settings
35
40
 
@@ -48,48 +53,150 @@ WHITELISTED_HEADERS = {
48
53
  }
49
54
 
50
55
 
51
- class LearningService(str, Enum):
56
+ class LearningService(Enum):
52
57
  CONFIG = "config"
53
- COLLECTOR = "collector-api"
54
58
 
55
59
 
60
+ class SimilarityFunction(IntEnum):
61
+ # Keep this in sync with learning config repo
62
+ # It's an IntEnum to match the protobuf definition
63
+ DOT = 0
64
+ COSINE = 1
65
+
66
+
67
+ class SemanticConfig(BaseModel):
68
+ # Keep this in sync with learning config repo
69
+ similarity: SimilarityFunction
70
+ size: int
71
+ threshold: float
72
+ matryoshka_dims: list[int] = []
73
+
74
+ def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
75
+ semantic_model = knowledgebox_pb2.SemanticModelMetadata()
76
+ LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
77
+ SimilarityFunction.COSINE: utils_pb2.VectorSimilarity.COSINE,
78
+ SimilarityFunction.DOT: utils_pb2.VectorSimilarity.DOT,
79
+ }
80
+ semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[self.similarity]
81
+ semantic_model.vector_dimension = self.size
82
+ semantic_model.matryoshka_dimensions.extend(self.matryoshka_dims)
83
+ return semantic_model
84
+
85
+
86
+ # Subset of learning configuration of nucliadb's interest. Look at
87
+ # learning_config models for more fields
56
88
  class LearningConfiguration(BaseModel):
57
89
  semantic_model: str
90
+ # aka similarity function
58
91
  semantic_vector_similarity: str
59
- semantic_vector_size: Optional[int]
60
- semantic_threshold: Optional[float]
92
+ # aka vector_dimension
93
+ semantic_vector_size: Optional[int] = None
94
+ # aka min_score
95
+ semantic_threshold: Optional[float] = None
96
+ # List of possible subdivisions of the matryoshka embeddings (if the model
97
+ # supports it)
98
+ semantic_matryoshka_dimensions: Optional[list[int]] = Field(
99
+ default=None, alias="semantic_matryoshka_dims"
100
+ )
101
+
102
+ semantic_models: list[str] = Field(default_factory=list)
103
+
104
+ # This is where the config for each semantic model (aka vectorsets) is returned
105
+ semantic_model_configs: dict[str, SemanticConfig] = Field(default_factory=dict)
106
+
107
+ @model_validator(mode="before")
108
+ @classmethod
109
+ def maintain_bw_compatibility_with_single_model_configs(cls, data: Any) -> Any:
110
+ if isinstance(data, dict):
111
+ if not data.get("semantic_model", None) and len(data.get("semantic_models", [])) > 0:
112
+ data["semantic_model"] = data["semantic_models"][0]
113
+ return data
114
+
115
+ @model_validator(mode="after")
116
+ def validate_matryoshka_and_vector_dimension_consistency(self) -> Self:
117
+ vector_size = self.semantic_vector_size
118
+ matryoshka_dimensions = self.semantic_matryoshka_dimensions or []
119
+ if (
120
+ len(matryoshka_dimensions) > 0
121
+ and vector_size is not None
122
+ and vector_size not in matryoshka_dimensions
123
+ ):
124
+ raise ValueError("Semantic vector size is inconsistent with matryoshka dimensions")
125
+ return self
126
+
127
+ def into_semantic_models_metadata(
128
+ self,
129
+ ) -> dict[str, knowledgebox_pb2.SemanticModelMetadata]:
130
+ result = {}
131
+ for model_name, config in self.semantic_model_configs.items():
132
+ result[model_name] = config.into_semantic_model_metadata()
133
+ return result
134
+
135
+ def into_semantic_model_metadata(self) -> knowledgebox_pb2.SemanticModelMetadata:
136
+ semantic_model = knowledgebox_pb2.SemanticModelMetadata()
137
+
138
+ LEARNING_SIMILARITY_FUNCTION_TO_PROTO = {
139
+ "cosine": utils_pb2.VectorSimilarity.COSINE,
140
+ "dot": utils_pb2.VectorSimilarity.DOT,
141
+ }
142
+ semantic_model.similarity_function = LEARNING_SIMILARITY_FUNCTION_TO_PROTO[
143
+ self.semantic_vector_similarity.lower()
144
+ ]
145
+
146
+ if self.semantic_vector_size is not None:
147
+ semantic_model.vector_dimension = self.semantic_vector_size
148
+ else:
149
+ logger.warning("Vector dimension not set!")
150
+
151
+ if self.semantic_matryoshka_dimensions is not None:
152
+ semantic_model.matryoshka_dimensions.extend(self.semantic_matryoshka_dimensions)
153
+
154
+ return semantic_model
155
+
156
+
157
+ class ProxiedLearningConfigError(Exception):
158
+ def __init__(self, status_code: int, content: bytes, content_type: str):
159
+ self.status_code = status_code
160
+ self.content = content
161
+ self.content_type = content_type
162
+
163
+
164
+ def raise_for_status(response: httpx.Response) -> None:
165
+ try:
166
+ response.raise_for_status()
167
+ except httpx.HTTPStatusError as err:
168
+ content_type = err.response.headers.get("Content-Type", "application/json")
169
+ raise ProxiedLearningConfigError(
170
+ status_code=err.response.status_code,
171
+ content=err.response.content,
172
+ content_type=content_type,
173
+ )
61
174
 
62
175
 
63
176
  async def get_configuration(
64
177
  kbid: str,
65
178
  ) -> Optional[LearningConfiguration]:
66
- async with learning_config_client() as client:
67
- resp = await client.get(f"config/{kbid}")
68
- try:
69
- resp.raise_for_status()
70
- except httpx.HTTPStatusError as err:
71
- if err.response.status_code == 404:
72
- return None
73
- raise
74
- return LearningConfiguration.parse_obj(resp.json())
179
+ return await learning_config_service().get_configuration(kbid)
75
180
 
76
181
 
77
182
  async def set_configuration(
78
183
  kbid: str,
79
184
  config: dict[str, Any],
80
185
  ) -> LearningConfiguration:
81
- async with learning_config_client() as client:
82
- resp = await client.post(f"config/{kbid}", json=config)
83
- resp.raise_for_status()
84
- return LearningConfiguration.parse_obj(resp.json())
186
+ return await learning_config_service().set_configuration(kbid, config)
187
+
188
+
189
+ async def update_configuration(
190
+ kbid: str,
191
+ config: dict[str, Any],
192
+ ) -> None:
193
+ return await learning_config_service().update_configuration(kbid, config)
85
194
 
86
195
 
87
196
  async def delete_configuration(
88
197
  kbid: str,
89
198
  ) -> None:
90
- async with learning_config_client() as client:
91
- resp = await client.delete(f"config/{kbid}")
92
- resp.raise_for_status()
199
+ return await learning_config_service().delete_configuration(kbid)
93
200
 
94
201
 
95
202
  async def learning_config_proxy(
@@ -107,21 +214,6 @@ async def learning_config_proxy(
107
214
  )
108
215
 
109
216
 
110
- async def learning_collector_proxy(
111
- request: Request,
112
- method: str,
113
- url: str,
114
- extra_headers: Optional[dict[str, str]] = None,
115
- ) -> Union[Response, StreamingResponse]:
116
- return await proxy(
117
- service=LearningService.COLLECTOR,
118
- request=request,
119
- method=method,
120
- url=url,
121
- extra_headers=extra_headers,
122
- )
123
-
124
-
125
217
  def is_white_listed_header(header: str) -> bool:
126
218
  return header.lower() in WHITELISTED_HEADERS
127
219
 
@@ -213,13 +305,9 @@ async def proxy(
213
305
 
214
306
  def get_base_url(service: LearningService) -> str:
215
307
  if is_onprem_nucliadb():
216
- nuclia_public_url = nuclia_settings.nuclia_public_url.format(
217
- zone=nuclia_settings.nuclia_zone
218
- )
308
+ nuclia_public_url = nuclia_settings.nuclia_public_url.format(zone=nuclia_settings.nuclia_zone)
219
309
  return f"{nuclia_public_url}/api/v1"
220
- learning_svc_base_url = nuclia_settings.learning_internal_svc_base_url.format(
221
- service=service.value
222
- )
310
+ learning_svc_base_url = nuclia_settings.learning_internal_svc_base_url.format(service=service.value)
223
311
  return f"{learning_svc_base_url}/api/v1/internal"
224
312
 
225
313
 
@@ -248,9 +336,7 @@ async def service_client(
248
336
  # This is a workaround to be able to run integration tests that start nucliadb with docker.
249
337
  # The learning APIs are not available in the docker setup, so we use a dummy client.
250
338
  client = DummyClient(base_url=base_url, headers=headers)
251
- logger.warning(
252
- "Using dummy client. If you see this in production, something is wrong."
253
- )
339
+ logger.warning("Using dummy client. If you see this in production, something is wrong.")
254
340
  else:
255
341
  client = httpx.AsyncClient(base_url=base_url, headers=headers) # type: ignore
256
342
  try:
@@ -299,13 +385,31 @@ class DummyClient(httpx.AsyncClient):
299
385
  return self._handle_request("DELETE", *args, **kwargs)
300
386
 
301
387
  def get_config(self, *args: Any, **kwargs: Any):
388
+ size = 768 if os.environ.get("TEST_SENTENCE_ENCODER") == "multilingual-2023-02-21" else 512
302
389
  lconfig = LearningConfiguration(
303
390
  semantic_model="multilingual",
304
391
  semantic_vector_similarity="cosine",
305
- semantic_vector_size=None,
392
+ semantic_vector_size=size,
306
393
  semantic_threshold=None,
394
+ semantic_matryoshka_dims=[],
395
+ semantic_model_configs={
396
+ "multilingual": SemanticConfig(
397
+ similarity=SimilarityFunction.COSINE,
398
+ size=size,
399
+ threshold=0,
400
+ matryoshka_dims=[],
401
+ )
402
+ },
307
403
  )
308
- return self._response(content=lconfig.dict())
404
+ return self._response(content=lconfig.model_dump())
405
+
406
+ def post_config(self, *args: Any, **kwargs: Any):
407
+ # simulate post that returns the created config
408
+ return self.get_config(*args, **kwargs)
409
+
410
+ def patch_config(self, *args: Any, **kwargs: Any):
411
+ # simulate patch that returns the updated config
412
+ return self.get_config(*args, **kwargs)
309
413
 
310
414
  async def request( # type: ignore
311
415
  self,
@@ -315,9 +419,7 @@ class DummyClient(httpx.AsyncClient):
315
419
  content=None,
316
420
  headers=None,
317
421
  ) -> httpx.Response:
318
- return self._handle_request(
319
- method, url, params=params, content=content, headers=headers
320
- )
422
+ return self._handle_request(method, url, params=params, content=content, headers=headers)
321
423
 
322
424
  def _handle_request(self, *args: Any, **kwargs: Any) -> httpx.Response:
323
425
  """
@@ -331,3 +433,114 @@ class DummyClient(httpx.AsyncClient):
331
433
  return getattr(self, method)(*args, **kwargs)
332
434
  else:
333
435
  return self._response()
436
+
437
+
438
+ class LearningConfigService(ABC):
439
+ @abstractmethod
440
+ async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]: ...
441
+
442
+ @abstractmethod
443
+ async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration: ...
444
+
445
+ @abstractmethod
446
+ async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None: ...
447
+
448
+ @abstractmethod
449
+ async def delete_configuration(self, kbid: str) -> None: ...
450
+
451
+
452
+ class ProxiedLearningConfig(LearningConfigService):
453
+ async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
454
+ async with self._client() as client:
455
+ resp = await client.get(f"config/{kbid}")
456
+ try:
457
+ raise_for_status(resp)
458
+ except ProxiedLearningConfigError as err:
459
+ if err.status_code == 404:
460
+ return None
461
+ raise
462
+ return LearningConfiguration.model_validate(resp.json())
463
+
464
+ async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
465
+ async with self._client() as client:
466
+ resp = await client.post(f"config/{kbid}", json=config)
467
+ raise_for_status(resp)
468
+ return LearningConfiguration.model_validate(resp.json())
469
+
470
+ async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
471
+ async with self._client() as client:
472
+ resp = await client.patch(f"config/{kbid}", json=config)
473
+ raise_for_status(resp)
474
+ return
475
+
476
+ async def delete_configuration(self, kbid: str) -> None:
477
+ async with self._client() as client:
478
+ resp = await client.delete(f"config/{kbid}")
479
+ raise_for_status(resp)
480
+
481
+ @contextlib.asynccontextmanager
482
+ async def _client(self) -> AsyncIterator[httpx.AsyncClient]:
483
+ async with httpx.AsyncClient(
484
+ base_url=get_base_url(LearningService.CONFIG),
485
+ headers=get_auth_headers(),
486
+ ) as client:
487
+ yield client
488
+
489
+
490
+ _IN_MEMORY_CONFIGS: dict[str, LearningConfiguration]
491
+ _IN_MEMORY_CONFIGS = LRU(50) # type: ignore
492
+
493
+
494
+ class InMemoryLearningConfig(LearningConfigService):
495
+ def __init__(self):
496
+ self.in_memory_configs = {}
497
+
498
+ async def get_configuration(self, kbid: str) -> Optional[LearningConfiguration]:
499
+ return _IN_MEMORY_CONFIGS.get(kbid, None)
500
+
501
+ async def set_configuration(self, kbid: str, config: dict[str, Any]) -> LearningConfiguration:
502
+ if not config:
503
+ # generate a default config
504
+ default_model = os.environ.get("TEST_SENTENCE_ENCODER", "multilingual")
505
+ size = 768 if default_model == "multilingual-2023-02-21" else 512
506
+ # XXX for some reason, we override the model name and set this one
507
+ # default_model = "multilingual"
508
+ learning_config = LearningConfiguration(
509
+ semantic_model=default_model,
510
+ semantic_vector_similarity="cosine",
511
+ semantic_vector_size=size,
512
+ semantic_threshold=None,
513
+ semantic_matryoshka_dims=[],
514
+ semantic_models=[default_model],
515
+ semantic_model_configs={
516
+ default_model: SemanticConfig(
517
+ similarity=SimilarityFunction.COSINE,
518
+ size=size,
519
+ threshold=0,
520
+ matryoshka_dims=[],
521
+ )
522
+ },
523
+ )
524
+
525
+ else:
526
+ learning_config = LearningConfiguration.model_validate(config)
527
+
528
+ _IN_MEMORY_CONFIGS[kbid] = learning_config
529
+ return learning_config
530
+
531
+ async def update_configuration(self, kbid: str, config: dict[str, Any]) -> None:
532
+ if kbid not in _IN_MEMORY_CONFIGS:
533
+ raise ValueError(f"Configuration for kbid {kbid} not found")
534
+ learning_config = _IN_MEMORY_CONFIGS[kbid]
535
+ learning_config = learning_config.model_copy(update=config)
536
+ _IN_MEMORY_CONFIGS[kbid] = learning_config
537
+
538
+ async def delete_configuration(self, kbid: str) -> None:
539
+ _IN_MEMORY_CONFIGS.pop(kbid, None)
540
+
541
+
542
+ def learning_config_service() -> LearningConfigService:
543
+ if nuclia_settings.dummy_learning_services:
544
+ return InMemoryLearningConfig()
545
+ else:
546
+ return ProxiedLearningConfig()
@@ -20,12 +20,14 @@
20
20
  from __future__ import annotations
21
21
 
22
22
  import asyncio
23
- from typing import AsyncGenerator, Callable
23
+ from typing import AsyncGenerator, Callable, Tuple, cast
24
24
 
25
25
  from nucliadb import logger
26
26
  from nucliadb.common import datamanagers
27
27
  from nucliadb.common.cluster import manager as cluster_manager
28
28
  from nucliadb.common.context import ApplicationContext
29
+ from nucliadb.common.maindb.pg import PGDriver
30
+ from nucliadb.common.maindb.utils import get_driver
29
31
  from nucliadb.migrator.datamanager import MigrationsDataManager
30
32
  from nucliadb_telemetry import metrics
31
33
  from nucliadb_telemetry.logs import setup_logging
@@ -34,9 +36,9 @@ from nucliadb_utils.fastapi.run import serve_metrics
34
36
 
35
37
  SHARD_COUNT = metrics.Gauge("nucliadb_node_shard_count", labels={"node": ""})
36
38
 
37
- MIGRATION_COUNT = metrics.Gauge(
38
- "nucliadb_migration", labels={"type": "", "version": ""}
39
- )
39
+ MIGRATION_COUNT = metrics.Gauge("nucliadb_migration", labels={"type": "", "version": ""})
40
+
41
+ PENDING_RESOURCE_COUNT = metrics.Gauge("nucliadb_pending_resources_count")
40
42
 
41
43
 
42
44
  async def update_node_metrics(context: ApplicationContext):
@@ -57,7 +59,7 @@ async def iter_kbids(context: ApplicationContext) -> AsyncGenerator[str, None]:
57
59
  """
58
60
  Return a list of all KB ids.
59
61
  """
60
- async with context.kv_driver.transaction() as txn:
62
+ async with context.kv_driver.transaction(read_only=True) as txn:
61
63
  async for kbid, _ in datamanagers.kb.get_kbs(txn):
62
64
  yield kbid
63
65
 
@@ -72,9 +74,7 @@ async def update_migration_metrics(context: ApplicationContext):
72
74
  mdm = MigrationsDataManager(context.kv_driver)
73
75
  global_info = await mdm.get_global_info()
74
76
  if global_info is not None:
75
- MIGRATION_COUNT.set(
76
- 1, labels=dict(type="global", version=str(global_info.current_version))
77
- )
77
+ MIGRATION_COUNT.set(1, labels=dict(type="global", version=str(global_info.current_version)))
78
78
 
79
79
  version_count: dict[str, int] = {}
80
80
  async for kbid in iter_kbids(context):
@@ -88,9 +88,25 @@ async def update_migration_metrics(context: ApplicationContext):
88
88
  MIGRATION_COUNT.set(count, labels=dict(type="kb", version=version))
89
89
 
90
90
 
91
- async def run_exporter_task(
92
- context: ApplicationContext, exporter_task: Callable, interval: int
93
- ):
91
+ async def update_resource_metrics(context: ApplicationContext):
92
+ """
93
+ Report the number of pending resources older than some estimated processing time
94
+ """
95
+ driver = get_driver()
96
+ if not isinstance(driver, PGDriver):
97
+ return
98
+
99
+ async with driver._get_connection() as conn, conn.cursor() as cur:
100
+ await cur.execute(
101
+ "SELECT COUNT(*) FROM catalog "
102
+ "WHERE labels @> '{/n/s/PENDING}' "
103
+ "AND COALESCE(modified_at, created_at) BETWEEN NOW() - INTERVAL '1 month' AND NOW() - INTERVAL '6 hours'"
104
+ )
105
+ count = cast(Tuple[int], await cur.fetchone())[0]
106
+ PENDING_RESOURCE_COUNT.set(count)
107
+
108
+
109
+ async def run_exporter_task(context: ApplicationContext, exporter_task: Callable, interval: int):
94
110
  """
95
111
  Run coroutine infinitely, catching exceptions and logging them.
96
112
  It will wait for the interval before running again.
@@ -100,9 +116,7 @@ async def run_exporter_task(
100
116
  try:
101
117
  await exporter_task(context)
102
118
  except Exception:
103
- logger.error(
104
- f"Error on exporter task {exporter_task.__name__}", exc_info=True
105
- )
119
+ logger.error(f"Error on exporter task {exporter_task.__name__}", exc_info=True)
106
120
  await asyncio.sleep(interval)
107
121
  except asyncio.CancelledError:
108
122
  pass
@@ -114,12 +128,9 @@ async def run_exporter(context: ApplicationContext):
114
128
  for export_task, interval in [
115
129
  (update_node_metrics, 10),
116
130
  (update_migration_metrics, 60 * 3),
131
+ (update_resource_metrics, 60 * 5),
117
132
  ]:
118
- tasks.append(
119
- asyncio.create_task(
120
- run_exporter_task(context, export_task, interval=interval)
121
- )
122
- )
133
+ tasks.append(asyncio.create_task(run_exporter_task(context, export_task, interval=interval)))
123
134
  try:
124
135
  while True:
125
136
  await asyncio.sleep(10)
@@ -39,9 +39,7 @@ class ProcessTimeHeaderMiddleware(BaseHTTPMiddleware):
39
39
  exposed_headers.append(PROCESS_TIME_HEADER)
40
40
  response.headers[ACCESS_CONTROL_EXPOSE_HEADER] = ",".join(exposed_headers)
41
41
 
42
- async def dispatch(
43
- self, request: Request, call_next: RequestResponseEndpoint
44
- ) -> Response:
42
+ async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
45
43
  response = None
46
44
  start = time.perf_counter()
47
45
  try:
@@ -53,9 +53,7 @@ def validate():
53
53
  versions = set()
54
54
  for migration in migrations:
55
55
  if migration.version in versions:
56
- raise MigrationValidationError(
57
- f"Migration {migration.version} is duplicated"
58
- )
56
+ raise MigrationValidationError(f"Migration {migration.version} is duplicated")
59
57
  versions.add(migration.version)
60
58
 
61
59