nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -25,8 +25,9 @@ from typing import Optional, Union
25
25
  import backoff
26
26
  import nats
27
27
  import nats.js.api
28
+ import nats.js.errors
28
29
  from nats.aio.client import Msg
29
- from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
30
+ from nats.js import JetStreamContext
30
31
 
31
32
  from nucliadb.common.cluster.exceptions import ShardsNotFound
32
33
  from nucliadb.common.maindb.driver import Driver
@@ -34,16 +35,18 @@ from nucliadb.common.maindb.exceptions import ConflictError
34
35
  from nucliadb.ingest import logger
35
36
  from nucliadb.ingest.orm.exceptions import DeadletteredError, SequenceOrderViolation
36
37
  from nucliadb.ingest.orm.processor import Processor, sequence_manager
38
+ from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
37
39
  from nucliadb_telemetry import context, errors, metrics
38
40
  from nucliadb_utils import const
39
41
  from nucliadb_utils.cache.pubsub import PubSubDriver
40
42
  from nucliadb_utils.nats import MessageProgressUpdater, NatsConnectionManager
41
43
  from nucliadb_utils.settings import nats_consumer_settings
42
44
  from nucliadb_utils.storages.storage import Storage
45
+ from nucliadb_utils.utilities import has_feature
43
46
 
44
47
  consumer_observer = metrics.Observer(
45
48
  "message_processor",
46
- labels={"source": ""},
49
+ labels={"source": "", "partition": ""},
47
50
  buckets=[
48
51
  0.01,
49
52
  0.025,
@@ -83,40 +86,62 @@ class IngestConsumer:
83
86
 
84
87
  self.lock = lock or asyncio.Lock()
85
88
  self.processor = Processor(driver, storage, pubsub, partition)
89
+ self.subscription: Optional[JetStreamContext.PullSubscription] = None
90
+
91
+ async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
92
+ context = {}
93
+ if kbid:
94
+ context["kbid"] = kbid
95
+ if has_feature(const.Features.NATS_SYNC_ACK, default=False, context=context):
96
+ await msg.ack_sync(timeout=10)
97
+ else:
98
+ await msg.ack()
86
99
 
87
100
  async def initialize(self):
88
101
  await self.setup_nats_subscription()
89
102
  self.initialized = True
90
103
 
104
+ async def finalize(self):
105
+ if self.initialized:
106
+ await self.teardown_nats_subscription()
107
+ self.initialized = False
108
+
109
+ async def teardown_nats_subscription(self):
110
+ if self.subscription is not None:
111
+ try:
112
+ await self.nats_connection_manager.unsubscribe(self.subscription)
113
+ except nats.errors.ConnectionClosedError:
114
+ logger.warning("Connection closed while unsubscribing")
115
+ pass
116
+ self.subscription = None
117
+
91
118
  async def setup_nats_subscription(self):
92
119
  last_seqid = await sequence_manager.get_last_seqid(self.driver, self.partition)
93
120
  if last_seqid is None:
94
121
  last_seqid = 1
95
122
  subject = const.Streams.INGEST.subject.format(partition=self.partition)
96
- await self.nats_connection_manager.subscribe(
97
- subject=subject,
98
- queue=const.Streams.INGEST.group.format(partition=self.partition),
123
+ durable_name = const.Streams.INGEST.group.format(partition=self.partition)
124
+ self.subscription = await self.nats_connection_manager.pull_subscribe(
99
125
  stream=const.Streams.INGEST.name,
100
- flow_control=True,
126
+ subject=subject,
127
+ durable=durable_name,
101
128
  cb=self.subscription_worker,
102
129
  subscription_lost_cb=self.setup_nats_subscription,
103
130
  config=nats.js.api.ConsumerConfig(
131
+ durable_name=durable_name,
104
132
  deliver_policy=nats.js.api.DeliverPolicy.BY_START_SEQUENCE,
105
133
  opt_start_seq=last_seqid,
106
134
  ack_policy=nats.js.api.AckPolicy.EXPLICIT,
107
- max_ack_pending=nats_consumer_settings.nats_max_ack_pending,
135
+ max_ack_pending=1,
108
136
  max_deliver=nats_consumer_settings.nats_max_deliver,
109
137
  ack_wait=nats_consumer_settings.nats_ack_wait,
110
- idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
111
138
  ),
112
139
  )
113
140
  logger.info(
114
- f"Subscribed to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
141
+ f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
115
142
  )
116
143
 
117
- @backoff.on_exception(
118
- backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
119
- )
144
+ @backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
120
145
  async def _process(self, pb: BrokerMessage, seqid: int):
121
146
  await self.processor.process(pb, seqid, self.partition)
122
147
 
@@ -142,18 +167,29 @@ class IngestConsumer:
142
167
  logger.warning("Could not delete blob reference", exc_info=True)
143
168
 
144
169
  async def subscription_worker(self, msg: Msg):
170
+ kbid: Optional[str] = None
145
171
  subject = msg.subject
146
172
  reply = msg.reply
147
173
  seqid = int(reply.split(".")[5])
148
174
  message_source = "<msg source not set>"
175
+ num_delivered = msg.metadata.num_delivered
176
+ if num_delivered > 1:
177
+ logger.warning(
178
+ "Message has been redelivered",
179
+ extra={
180
+ "seqid": seqid,
181
+ "subject": subject,
182
+ "reply": reply,
183
+ "num_delivered": num_delivered,
184
+ },
185
+ )
149
186
  start = time.monotonic()
150
187
 
151
- async with MessageProgressUpdater(
152
- msg, nats_consumer_settings.nats_ack_wait * 0.66
153
- ), self.lock:
154
- logger.info(
155
- f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}"
156
- )
188
+ async with (
189
+ MessageProgressUpdater(msg, nats_consumer_settings.nats_ack_wait * 0.66),
190
+ self.lock,
191
+ ):
192
+ logger.info(f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}")
157
193
  try:
158
194
  pb = await self.get_broker_message(msg)
159
195
  if pb.source == pb.MessageSource.PROCESSOR:
@@ -169,36 +205,39 @@ class IngestConsumer:
169
205
  f"Received from {message_source} on {pb.kbid}/{pb.uuid} seq {seqid} partition {self.partition} at {time}" # noqa
170
206
  )
171
207
  context.add_context({"kbid": pb.kbid, "rid": pb.uuid})
172
-
208
+ kbid = pb.kbid
173
209
  try:
174
- with consumer_observer(
175
- {
176
- "source": "writer"
177
- if pb.source == pb.MessageSource.WRITER
178
- else "processor"
179
- }
180
- ):
210
+ source = "writer" if pb.source == pb.MessageSource.WRITER else "processor"
211
+ with consumer_observer({"source": source, "partition": self.partition}):
181
212
  await self._process(pb, seqid)
182
213
  except SequenceOrderViolation as err:
183
- log_func = logger.error
184
- if seqid == err.last_seqid: # pragma: no cover
185
- # Occasional retries of the last processed message may happen
186
- log_func = logger.warning
187
- log_func(
188
- f"Old txn: DISCARD (nucliadb seqid: {seqid}, partition: {self.partition}). Current seqid: {err.last_seqid}" # noqa
214
+ logger.log(
215
+ level=logging.ERROR if seqid < err.last_seqid else logging.WARNING,
216
+ msg="Old txn. Discarding message",
217
+ extra={
218
+ "stored_seqid": err.last_seqid,
219
+ "message_seqid": seqid,
220
+ "partition": self.partition,
221
+ "kbid": pb.kbid,
222
+ "msg_delivered_count": msg.metadata.num_delivered,
223
+ },
189
224
  )
190
225
  else:
191
226
  message_type_name = pb.MessageType.Name(pb.type)
192
227
  time_to_process = time.monotonic() - start
193
- log_level = (
194
- logging.INFO if time_to_process < 10 else logging.WARNING
195
- )
228
+ log_level = logging.INFO if time_to_process < 10 else logging.WARNING
196
229
  logger.log(
197
230
  log_level,
198
- f"Successfully processed {message_type_name} message from \
199
- {message_source}. kb: {pb.kbid}, resource: {pb.uuid}, \
200
- nucliadb seqid: {seqid}, partition: {self.partition} as {audit_time}, \
201
- total time: {time_to_process:.2f}s",
231
+ f"Successfully processed {message_type_name} message",
232
+ extra={
233
+ "kbid": pb.kbid,
234
+ "rid": pb.uuid,
235
+ "message_source": message_source,
236
+ "nucliadb_seqid": seqid,
237
+ "partition": self.partition,
238
+ "total_time": time_to_process,
239
+ "audit_time": audit_time,
240
+ },
202
241
  )
203
242
  except DeadletteredError as e:
204
243
  # Messages that have been sent to deadletter at some point
@@ -209,7 +248,8 @@ class IngestConsumer:
209
248
  f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
210
249
  f"Check sentry for more details: {str(e)}"
211
250
  )
212
- await msg.ack()
251
+ await self.ack_message(msg, kbid)
252
+ logger.info("Message acked because of deadletter", extra={"seqid": seqid})
213
253
  except (ShardsNotFound,) as e:
214
254
  # Any messages that for some unexpected inconsistency have failed and won't be tried again
215
255
  # as we cannot do anything about it
@@ -220,7 +260,8 @@ class IngestConsumer:
220
260
  f"This message has been dropped and won't be retried again"
221
261
  f"Check sentry for more details: {str(e)}"
222
262
  )
223
- await msg.ack()
263
+ await self.ack_message(msg, kbid)
264
+ logger.info("Message acked because of drop", extra={"seqid": seqid})
224
265
  except Exception as e:
225
266
  # Unhandled exceptions that need to be retried after a small delay
226
267
  errors.capture_exception(e)
@@ -230,10 +271,12 @@ class IngestConsumer:
230
271
  f"Check sentry for more details: {str(e)}"
231
272
  )
232
273
  await msg.nak()
274
+ logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
233
275
  raise e
234
276
  else:
235
277
  # Successful processing
236
- await msg.ack()
278
+ await self.ack_message(msg, kbid)
279
+ logger.info("Message acked because of success", extra={"seqid": seqid})
237
280
  await self.clean_broker_message(msg)
238
281
 
239
282
 
@@ -250,28 +293,29 @@ class IngestProcessedConsumer(IngestConsumer):
250
293
 
251
294
  async def setup_nats_subscription(self):
252
295
  subject = const.Streams.INGEST_PROCESSED.subject
253
- await self.nats_connection_manager.subscribe(
254
- subject=subject,
255
- queue=const.Streams.INGEST_PROCESSED.group,
296
+ durable_name = const.Streams.INGEST_PROCESSED.group
297
+ self.subscription = await self.nats_connection_manager.pull_subscribe(
256
298
  stream=const.Streams.INGEST_PROCESSED.name,
257
- flow_control=True,
299
+ subject=subject,
300
+ durable=durable_name,
258
301
  cb=self.subscription_worker,
259
302
  subscription_lost_cb=self.setup_nats_subscription,
260
303
  config=nats.js.api.ConsumerConfig(
304
+ durable_name=durable_name,
261
305
  ack_policy=nats.js.api.AckPolicy.EXPLICIT,
262
- max_ack_pending=100, # custom ack pending here
306
+ deliver_policy=nats.js.api.DeliverPolicy.ALL,
307
+ # We set it to 20 because we don't care about order here and we want to be able to HPA based
308
+ # on the number of pending messages in the queue.
309
+ max_ack_pending=20,
263
310
  max_deliver=nats_consumer_settings.nats_max_deliver,
264
311
  ack_wait=nats_consumer_settings.nats_ack_wait,
265
- idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
266
312
  ),
267
313
  )
268
314
  logger.info(
269
- f"Subscribed to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
315
+ f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
270
316
  )
271
317
 
272
- @backoff.on_exception(
273
- backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
274
- )
318
+ @backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
275
319
  async def _process(self, pb: BrokerMessage, seqid: int):
276
320
  """
277
321
  We are setting `transaction_check` to False here because we can not mix
@@ -29,6 +29,7 @@ from nucliadb_protos import writer_pb2
29
29
  from nucliadb_utils import const
30
30
  from nucliadb_utils.cache.pubsub import PubSubDriver
31
31
  from nucliadb_utils.storages.storage import Storage
32
+ from nucliadb_utils.utilities import get_audit
32
33
 
33
34
  from .utils import DelayedTaskHandler
34
35
 
@@ -82,25 +83,22 @@ class MaterializerHandler:
82
83
  notification.ParseFromString(data)
83
84
 
84
85
  if (
85
- notification.action
86
- != writer_pb2.Notification.Action.COMMIT # only on commits
86
+ notification.action != writer_pb2.Notification.Action.COMMIT # only on commits
87
87
  or notification.write_type
88
88
  == writer_pb2.Notification.WriteType.MODIFIED # only on new resources and deletes
89
89
  ):
90
90
  return
91
91
 
92
- self.task_handler.schedule(
93
- notification.kbid, partial(self.process, notification.kbid)
94
- )
92
+ self.task_handler.schedule(notification.kbid, partial(self.process, notification.kbid))
95
93
 
96
94
  async def process(self, kbid: str) -> None:
97
95
  logger.info(f"Materializing knowledgebox", extra={"kbid": kbid})
98
- async with datamanagers.with_transaction(read_only=True) as txn:
99
- value = await datamanagers.resources.calculate_number_of_resources(
100
- txn, kbid=kbid
101
- )
96
+ async with datamanagers.with_ro_transaction() as txn:
97
+ value = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
102
98
  async with datamanagers.with_transaction() as txn:
103
- await datamanagers.resources.set_number_of_resources(
104
- txn, kbid=kbid, value=value
105
- )
99
+ await datamanagers.resources.set_number_of_resources(txn, kbid=kbid, value=value)
106
100
  await txn.commit()
101
+
102
+ audit = get_audit()
103
+ if audit:
104
+ audit.report_resources(kbid=kbid, resources=value)
@@ -21,10 +21,7 @@ import asyncio
21
21
  import base64
22
22
  from typing import Optional
23
23
 
24
- import nats
25
- import nats.errors
26
24
  from aiohttp.client_exceptions import ClientConnectorError
27
- from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
28
25
 
29
26
  from nucliadb.common import datamanagers
30
27
  from nucliadb.common.http_clients.processing import ProcessingHTTPClient, get_nua_api_id
@@ -32,11 +29,13 @@ from nucliadb.common.maindb.driver import Driver
32
29
  from nucliadb.ingest import logger, logger_activity
33
30
  from nucliadb.ingest.orm.exceptions import ReallyStopPulling
34
31
  from nucliadb.ingest.orm.processor import Processor
32
+ from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
35
33
  from nucliadb_telemetry import errors
36
34
  from nucliadb_utils import const
37
35
  from nucliadb_utils.cache.pubsub import PubSubDriver
38
36
  from nucliadb_utils.settings import nuclia_settings
39
37
  from nucliadb_utils.storages.storage import Storage
38
+ from nucliadb_utils.transaction import MaxTransactionSizeExceededError
40
39
  from nucliadb_utils.utilities import get_storage, get_transaction_utility
41
40
 
42
41
 
@@ -78,9 +77,7 @@ class PullWorker:
78
77
  data = base64.b64decode(payload)
79
78
  pb.ParseFromString(data)
80
79
 
81
- logger.debug(
82
- f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}"
83
- )
80
+ logger.debug(f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}")
84
81
 
85
82
  if not self.local_subscriber:
86
83
  transaction_utility = get_transaction_utility()
@@ -93,11 +90,9 @@ class PullWorker:
93
90
  # send to separate processor
94
91
  target_subject=const.Streams.INGEST_PROCESSED.subject,
95
92
  )
96
- except nats.errors.MaxPayloadError:
93
+ except MaxTransactionSizeExceededError:
97
94
  storage = await get_storage()
98
- stored_key = await storage.set_stream_message(
99
- kbid=pb.kbid, rid=pb.uuid, data=data
100
- )
95
+ stored_key = await storage.set_stream_message(kbid=pb.kbid, rid=pb.uuid, data=data)
101
96
  referenced_pb = BrokerMessageBlobReference(
102
97
  uuid=pb.uuid, kbid=pb.kbid, storage_key=stored_key
103
98
  )
@@ -141,9 +136,7 @@ class PullWorker:
141
136
  try:
142
137
  pull_type_id = get_nua_api_id()
143
138
  except Exception as exc:
144
- logger.exception(
145
- "Could not read NUA API Key. Can not start pull worker"
146
- )
139
+ logger.exception("Could not read NUA API Key. Can not start pull worker")
147
140
  raise ReallyStopPulling() from exc
148
141
  else:
149
142
  pull_type_id = "main"
@@ -152,7 +145,7 @@ class PullWorker:
152
145
  logger.info(f"Collecting from NucliaDB Cloud {self.partition} partition")
153
146
  while True:
154
147
  try:
155
- async with datamanagers.with_transaction() as txn:
148
+ async with datamanagers.with_ro_transaction() as txn:
156
149
  cursor = await datamanagers.processing.get_pull_offset(
157
150
  txn, pull_type_id=pull_type_id, partition=self.partition
158
151
  )
@@ -176,9 +169,7 @@ class PullWorker:
176
169
  await self.handle_message(payload)
177
170
  except Exception as e:
178
171
  errors.capture_exception(e)
179
- logger.exception(
180
- "Error while pulling and processing message/s"
181
- )
172
+ logger.exception("Error while pulling and processing message/s")
182
173
  raise e
183
174
  async with datamanagers.with_transaction() as txn:
184
175
  await datamanagers.processing.set_pull_offset(
@@ -189,9 +180,7 @@ class PullWorker:
189
180
  )
190
181
  await txn.commit()
191
182
  elif data.status == "empty":
192
- logger_activity.debug(
193
- f"No messages waiting in partition #{self.partition}"
194
- )
183
+ logger_activity.debug(f"No messages waiting in partition #{self.partition}")
195
184
  await asyncio.sleep(self.pull_time_empty_backoff)
196
185
  else:
197
186
  logger.info(f"Proxy pull answered with error: {data}")
@@ -202,9 +191,7 @@ class PullWorker:
202
191
  KeyboardInterrupt,
203
192
  SystemExit,
204
193
  ):
205
- logger.info(
206
- f"Pull task for partition #{self.partition} was canceled, exiting"
207
- )
194
+ logger.info(f"Pull task for partition #{self.partition} was canceled, exiting")
208
195
  raise ReallyStopPulling()
209
196
 
210
197
  except ClientConnectorError:
@@ -214,14 +201,12 @@ class PullWorker:
214
201
  )
215
202
  await asyncio.sleep(self.pull_time_error_backoff)
216
203
 
217
- except nats.errors.MaxPayloadError as e:
204
+ except MaxTransactionSizeExceededError as e:
218
205
  if data is not None:
219
206
  payload_length = 0
220
207
  if data.payload:
221
208
  payload_length = len(base64.b64decode(data.payload))
222
- logger.error(
223
- f"Message too big for transaction: {payload_length}"
224
- )
209
+ logger.error(f"Message too big for transaction: {payload_length}")
225
210
  raise e
226
211
  except Exception:
227
212
  logger.exception("Unhandled error pulling messages from processing")
@@ -45,9 +45,7 @@ from .shard_creator import ShardCreatorHandler
45
45
  def _handle_task_result(task: asyncio.Task) -> None:
46
46
  e = task.exception()
47
47
  if e:
48
- logger.exception(
49
- "Loop stopped by exception. This should not happen. Exiting.", exc_info=e
50
- )
48
+ logger.exception("Loop stopped by exception. This should not happen. Exiting.", exc_info=e)
51
49
  sys.exit(1)
52
50
 
53
51
 
@@ -87,9 +85,7 @@ async def start_ingest_consumers(
87
85
  if transaction_settings.transaction_local:
88
86
  raise ConfigurationError("Can not start ingest consumers in local mode")
89
87
 
90
- while len(
91
- manager.get_index_nodes()
92
- ) == 0 and running_settings.running_environment not in (
88
+ while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
93
89
  "local",
94
90
  "test",
95
91
  ):
@@ -101,9 +97,9 @@ async def start_ingest_consumers(
101
97
  storage = await get_storage(service_name=service_name or SERVICE_NAME)
102
98
  nats_connection_manager = get_nats_manager()
103
99
 
104
- max_concurrent_processing = asyncio.Semaphore(
105
- settings.max_concurrent_ingest_processing
106
- )
100
+ max_concurrent_processing = asyncio.Semaphore(settings.max_concurrent_ingest_processing)
101
+
102
+ consumer_finalizers = []
107
103
 
108
104
  for partition in settings.partitions:
109
105
  consumer = IngestConsumer(
@@ -115,8 +111,15 @@ async def start_ingest_consumers(
115
111
  lock=max_concurrent_processing,
116
112
  )
117
113
  await consumer.initialize()
114
+ consumer_finalizers.append(consumer.finalize)
118
115
 
119
- return nats_connection_manager.finalize
116
+ async def _finalize():
117
+ # Finalize all the consumers and the nats connection manager
118
+ for consumer_finalize in consumer_finalizers:
119
+ await consumer_finalize()
120
+ await nats_connection_manager.finalize()
121
+
122
+ return _finalize
120
123
 
121
124
 
122
125
  async def start_ingest_processed_consumer(
@@ -132,9 +135,7 @@ async def start_ingest_processed_consumer(
132
135
  if transaction_settings.transaction_local:
133
136
  raise ConfigurationError("Can not start ingest consumers in local mode")
134
137
 
135
- while len(
136
- manager.get_index_nodes()
137
- ) == 0 and running_settings.running_environment not in (
138
+ while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
138
139
  "local",
139
140
  "test",
140
141
  ):
@@ -159,22 +160,22 @@ async def start_ingest_processed_consumer(
159
160
 
160
161
 
161
162
  async def start_auditor() -> Callable[[], Awaitable[None]]:
162
- driver = await setup_driver()
163
163
  audit = get_audit()
164
164
  assert audit is not None
165
+
165
166
  pubsub = await get_pubsub()
166
167
  assert pubsub is not None, "Pubsub is not configured"
167
168
  storage = await get_storage(service_name=SERVICE_NAME)
168
- index_auditor = IndexAuditHandler(driver=driver, audit=audit, pubsub=pubsub)
169
- resource_writes_auditor = ResourceWritesAuditHandler(
170
- driver=driver, storage=storage, audit=audit, pubsub=pubsub
171
- )
169
+ index_auditor = IndexAuditHandler(audit=audit, pubsub=pubsub)
170
+ resource_writes_auditor = ResourceWritesAuditHandler(storage=storage, audit=audit, pubsub=pubsub)
172
171
 
173
172
  await index_auditor.initialize()
174
173
  await resource_writes_auditor.initialize()
175
174
 
176
175
  return partial(
177
- asyncio.gather, index_auditor.finalize(), resource_writes_auditor.finalize() # type: ignore
176
+ asyncio.gather,
177
+ index_auditor.finalize(),
178
+ resource_writes_auditor.finalize(), # type: ignore
178
179
  )
179
180
 
180
181
 
@@ -22,7 +22,7 @@ import logging
22
22
  import uuid
23
23
  from functools import partial
24
24
 
25
- from nucliadb.common import datamanagers, locking
25
+ from nucliadb.common import locking
26
26
  from nucliadb.common.cluster.manager import choose_node
27
27
  from nucliadb.common.cluster.utils import get_shard_manager
28
28
  from nucliadb.common.maindb.driver import Driver
@@ -82,21 +82,19 @@ class ShardCreatorHandler:
82
82
  metrics.total_messages.inc({"type": "shard_creator", "action": "ignored"})
83
83
  return
84
84
 
85
- self.task_handler.schedule(
86
- notification.kbid, partial(self.process_kb, notification.kbid)
87
- )
85
+ self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
88
86
  metrics.total_messages.inc({"type": "shard_creator", "action": "scheduled"})
89
87
 
90
88
  @metrics.handler_histo.wrap({"type": "shard_creator"})
91
89
  async def process_kb(self, kbid: str) -> None:
92
90
  logger.info({"message": "Processing notification for kbid", "kbid": kbid})
93
91
  async with self.driver.transaction(read_only=True) as txn:
94
- kb_shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
95
92
  current_shard = await self.shard_manager.get_current_active_shard(txn, kbid)
96
93
 
97
- if kb_shards is None or current_shard is None:
94
+ if current_shard is None:
98
95
  logger.error(
99
- "Processing a notification for a nonexistent", extra={"kbid": kbid}
96
+ "Processing a notification for KB with no current shard",
97
+ extra={"kbid": kbid},
100
98
  )
101
99
  return
102
100
 
@@ -105,13 +103,8 @@ class ShardCreatorHandler:
105
103
  async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
106
104
  # remember, a lock will do at least 1+ reads and 1 write.
107
105
  # with heavy writes, this adds some simple k/v pressure
108
- node, shard_id = choose_node(current_shard)
106
+ node, shard_id = choose_node(current_shard, use_nidx=True)
109
107
  shard: nodereader_pb2.Shard = await node.reader.GetShard(
110
108
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
111
109
  )
112
- await self.shard_manager.maybe_create_new_shard(
113
- kbid,
114
- shard.paragraphs,
115
- shard.fields,
116
- kb_shards.release_channel,
117
- )
110
+ await self.shard_manager.maybe_create_new_shard(kbid, shard.paragraphs)
@@ -48,9 +48,7 @@ class DelayedTaskHandler:
48
48
  for task in list(self.outstanding_tasks.values()):
49
49
  await task
50
50
 
51
- def schedule(
52
- self, key: str, handler: Callable[[], Coroutine[None, None, None]]
53
- ) -> None:
51
+ def schedule(self, key: str, handler: Callable[[], Coroutine[None, None, None]]) -> None:
54
52
  if key in self.to_process:
55
53
  # already waiting to process this key, ignore
56
54
  return