nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/app.py CHANGED
@@ -18,10 +18,9 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
+ import importlib.metadata
21
22
  from typing import Awaitable, Callable
22
23
 
23
- import pkg_resources
24
-
25
24
  from nucliadb import health
26
25
  from nucliadb.common.cluster.discovery.utils import (
27
26
  setup_cluster_discovery,
@@ -30,10 +29,12 @@ from nucliadb.common.cluster.discovery.utils import (
30
29
  from nucliadb.common.cluster.settings import settings as cluster_settings
31
30
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
32
31
  from nucliadb.common.context import ApplicationContext
32
+ from nucliadb.common.nidx import start_nidx_utility
33
33
  from nucliadb.export_import.tasks import get_exports_consumer, get_imports_consumer
34
34
  from nucliadb.ingest import SERVICE_NAME
35
35
  from nucliadb.ingest.consumer import service as consumer_service
36
36
  from nucliadb.ingest.partitions import assign_partitions
37
+ from nucliadb.ingest.processing import start_processing_engine, stop_processing_engine
37
38
  from nucliadb.ingest.service import start_grpc
38
39
  from nucliadb.ingest.settings import settings
39
40
  from nucliadb_telemetry import errors
@@ -46,10 +47,12 @@ from nucliadb_utils.utilities import (
46
47
  start_audit_utility,
47
48
  start_indexing_utility,
48
49
  start_nats_manager,
50
+ start_partitioning_utility,
49
51
  start_transaction_utility,
50
52
  stop_audit_utility,
51
53
  stop_indexing_utility,
52
54
  stop_nats_manager,
55
+ stop_partitioning_utility,
53
56
  stop_transaction_utility,
54
57
  )
55
58
 
@@ -59,15 +62,17 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
59
62
 
60
63
  await setup_cluster()
61
64
  await start_transaction_utility(SERVICE_NAME)
62
- if (
63
- not cluster_settings.standalone_mode
64
- and indexing_settings.index_jetstream_servers is not None
65
- ):
65
+ if not cluster_settings.standalone_mode and indexing_settings.index_jetstream_servers is not None:
66
66
  await start_indexing_utility(SERVICE_NAME)
67
67
 
68
+ start_partitioning_utility()
69
+
70
+ await start_nidx_utility()
71
+
68
72
  await start_audit_utility(SERVICE_NAME)
69
73
 
70
74
  finalizers = [
75
+ stop_partitioning_utility,
71
76
  stop_transaction_utility,
72
77
  stop_indexing_utility,
73
78
  stop_audit_utility,
@@ -123,8 +128,7 @@ async def main_consumer(): # pragma: no cover
123
128
  ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
124
129
 
125
130
  await run_until_exit(
126
- [grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown]
127
- + finalizers
131
+ [grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown] + finalizers
128
132
  )
129
133
 
130
134
 
@@ -138,12 +142,13 @@ async def main_orm_grpc(): # pragma: no cover
138
142
  async def main_ingest_processed_consumer(): # pragma: no cover
139
143
  finalizers = await initialize()
140
144
 
145
+ await start_processing_engine()
141
146
  metrics_server = await serve_metrics()
142
147
  grpc_health_finalizer = await health.start_grpc_health_service(settings.grpc_port)
143
148
  consumer = await consumer_service.start_ingest_processed_consumer(SERVICE_NAME)
144
149
 
145
150
  await run_until_exit(
146
- [grpc_health_finalizer, consumer, metrics_server.shutdown] + finalizers
151
+ [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
147
152
  )
148
153
 
149
154
 
@@ -181,10 +186,9 @@ async def main_subscriber_workers(): # pragma: no cover
181
186
 
182
187
  def setup_configuration(): # pragma: no cover
183
188
  setup_logging()
184
-
185
189
  assign_partitions(settings)
186
190
 
187
- errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
191
+ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
188
192
 
189
193
  if asyncio._get_running_loop() is not None:
190
194
  raise RuntimeError("cannot be called from a running event loop")
@@ -23,9 +23,11 @@ import logging
23
23
  import uuid
24
24
  from functools import partial
25
25
 
26
+ from nucliadb.common import datamanagers
26
27
  from nucliadb.common.cluster.exceptions import ShardsNotFound
27
28
  from nucliadb.common.cluster.manager import choose_node
28
29
  from nucliadb.common.cluster.utils import get_shard_manager
30
+ from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
29
31
  from nucliadb_protos import audit_pb2, nodereader_pb2, noderesources_pb2, writer_pb2
30
32
  from nucliadb_utils import const
31
33
  from nucliadb_utils.audit.audit import AuditStorage
@@ -91,16 +93,14 @@ class IndexAuditHandler:
91
93
  metrics.total_messages.inc({"action": "ignored", "type": "audit_counter"})
92
94
  return
93
95
 
94
- self.task_handler.schedule(
95
- notification.kbid, partial(self.process_kb, notification.kbid)
96
- )
96
+ self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
97
97
  metrics.total_messages.inc({"action": "scheduled", "type": "audit_counter"})
98
98
 
99
99
  @metrics.handler_histo.wrap({"type": "audit_counter"})
100
100
  async def process_kb(self, kbid: str) -> None:
101
101
  try:
102
- shard_groups: list[writer_pb2.ShardObject] = (
103
- await self.shard_manager.get_shards_by_kbid(kbid)
102
+ shard_groups: list[writer_pb2.ShardObject] = await self.shard_manager.get_shards_by_kbid(
103
+ kbid
104
104
  )
105
105
  except ShardsNotFound:
106
106
  logger.warning(f"No shards found for kbid {kbid}, skipping")
@@ -112,7 +112,8 @@ class IndexAuditHandler:
112
112
  total_paragraphs = 0
113
113
 
114
114
  for shard_obj in shard_groups:
115
- node, shard_id = choose_node(shard_obj)
115
+ # TODO: Uses node for auditing, don't want to suddenly change metrics
116
+ node, shard_id = choose_node(shard_obj, use_nidx=False)
116
117
  shard: nodereader_pb2.Shard = await node.reader.GetShard(
117
118
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
118
119
  )
@@ -120,12 +121,18 @@ class IndexAuditHandler:
120
121
  total_fields += shard.fields
121
122
  total_paragraphs += shard.paragraphs
122
123
 
123
- await self.audit.report(
124
+ async with datamanagers.with_ro_transaction() as txn:
125
+ num_vectorsets = (
126
+ len([vs async for vs in datamanagers.vectorsets.iter(txn=txn, kbid=kbid)]) or 1
127
+ )
128
+
129
+ self.audit.report_storage(
124
130
  kbid=kbid,
125
- audit_type=audit_pb2.AuditRequest.AuditType.INDEXED,
126
- kb_counter=audit_pb2.AuditKBCounter(
127
- fields=total_fields, paragraphs=total_paragraphs
128
- ),
131
+ paragraphs=total_paragraphs,
132
+ fields=total_fields,
133
+ bytes=total_paragraphs # This is an estimation of bytes stored in a KB
134
+ * AVG_PARAGRAPH_SIZE_BYTES
135
+ * num_vectorsets,
129
136
  )
130
137
 
131
138
 
@@ -170,21 +177,16 @@ class ResourceWritesAuditHandler:
170
177
  return
171
178
 
172
179
  message_audit: writer_pb2.Audit = notification.message_audit
173
- if (
174
- message_audit.message_source
175
- == writer_pb2.BrokerMessage.MessageSource.PROCESSOR
176
- ):
180
+ if message_audit.message_source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
177
181
  metrics.total_messages.inc({"action": "ignored", "type": "audit_fields"})
178
182
  return
179
183
 
180
- logger.info(
181
- {"message": "Processing field audit for kbid", "kbid": notification.kbid}
182
- )
184
+ logger.info({"message": "Processing field audit for kbid", "kbid": notification.kbid})
183
185
 
184
186
  metrics.total_messages.inc({"action": "scheduled", "type": "audit_fields"})
185
187
  with metrics.handler_histo({"type": "audit_fields"}):
186
188
  when = message_audit.when if message_audit.HasField("when") else None
187
- await self.audit.report(
189
+ self.audit.report_and_send(
188
190
  kbid=message_audit.kbid,
189
191
  when=when,
190
192
  user=message_audit.user,
@@ -25,8 +25,9 @@ from typing import Optional, Union
25
25
  import backoff
26
26
  import nats
27
27
  import nats.js.api
28
+ import nats.js.errors
28
29
  from nats.aio.client import Msg
29
- from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
30
+ from nats.js import JetStreamContext
30
31
 
31
32
  from nucliadb.common.cluster.exceptions import ShardsNotFound
32
33
  from nucliadb.common.maindb.driver import Driver
@@ -34,16 +35,18 @@ from nucliadb.common.maindb.exceptions import ConflictError
34
35
  from nucliadb.ingest import logger
35
36
  from nucliadb.ingest.orm.exceptions import DeadletteredError, SequenceOrderViolation
36
37
  from nucliadb.ingest.orm.processor import Processor, sequence_manager
38
+ from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
37
39
  from nucliadb_telemetry import context, errors, metrics
38
40
  from nucliadb_utils import const
39
41
  from nucliadb_utils.cache.pubsub import PubSubDriver
40
42
  from nucliadb_utils.nats import MessageProgressUpdater, NatsConnectionManager
41
43
  from nucliadb_utils.settings import nats_consumer_settings
42
44
  from nucliadb_utils.storages.storage import Storage
45
+ from nucliadb_utils.utilities import has_feature
43
46
 
44
47
  consumer_observer = metrics.Observer(
45
48
  "message_processor",
46
- labels={"source": ""},
49
+ labels={"source": "", "partition": ""},
47
50
  buckets=[
48
51
  0.01,
49
52
  0.025,
@@ -83,40 +86,62 @@ class IngestConsumer:
83
86
 
84
87
  self.lock = lock or asyncio.Lock()
85
88
  self.processor = Processor(driver, storage, pubsub, partition)
89
+ self.subscription: Optional[JetStreamContext.PullSubscription] = None
90
+
91
+ async def ack_message(self, msg: Msg, kbid: Optional[str] = None):
92
+ context = {}
93
+ if kbid:
94
+ context["kbid"] = kbid
95
+ if has_feature(const.Features.NATS_SYNC_ACK, default=False, context=context):
96
+ await msg.ack_sync(timeout=10)
97
+ else:
98
+ await msg.ack()
86
99
 
87
100
  async def initialize(self):
88
101
  await self.setup_nats_subscription()
89
102
  self.initialized = True
90
103
 
104
+ async def finalize(self):
105
+ if self.initialized:
106
+ await self.teardown_nats_subscription()
107
+ self.initialized = False
108
+
109
+ async def teardown_nats_subscription(self):
110
+ if self.subscription is not None:
111
+ try:
112
+ await self.nats_connection_manager.unsubscribe(self.subscription)
113
+ except nats.errors.ConnectionClosedError:
114
+ logger.warning("Connection closed while unsubscribing")
115
+ pass
116
+ self.subscription = None
117
+
91
118
  async def setup_nats_subscription(self):
92
119
  last_seqid = await sequence_manager.get_last_seqid(self.driver, self.partition)
93
120
  if last_seqid is None:
94
121
  last_seqid = 1
95
122
  subject = const.Streams.INGEST.subject.format(partition=self.partition)
96
- await self.nats_connection_manager.subscribe(
97
- subject=subject,
98
- queue=const.Streams.INGEST.group.format(partition=self.partition),
123
+ durable_name = const.Streams.INGEST.group.format(partition=self.partition)
124
+ self.subscription = await self.nats_connection_manager.pull_subscribe(
99
125
  stream=const.Streams.INGEST.name,
100
- flow_control=True,
126
+ subject=subject,
127
+ durable=durable_name,
101
128
  cb=self.subscription_worker,
102
129
  subscription_lost_cb=self.setup_nats_subscription,
103
130
  config=nats.js.api.ConsumerConfig(
131
+ durable_name=durable_name,
104
132
  deliver_policy=nats.js.api.DeliverPolicy.BY_START_SEQUENCE,
105
133
  opt_start_seq=last_seqid,
106
134
  ack_policy=nats.js.api.AckPolicy.EXPLICIT,
107
- max_ack_pending=nats_consumer_settings.nats_max_ack_pending,
135
+ max_ack_pending=1,
108
136
  max_deliver=nats_consumer_settings.nats_max_deliver,
109
137
  ack_wait=nats_consumer_settings.nats_ack_wait,
110
- idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
111
138
  ),
112
139
  )
113
140
  logger.info(
114
- f"Subscribed to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
141
+ f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST.name} from {last_seqid}"
115
142
  )
116
143
 
117
- @backoff.on_exception(
118
- backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
119
- )
144
+ @backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
120
145
  async def _process(self, pb: BrokerMessage, seqid: int):
121
146
  await self.processor.process(pb, seqid, self.partition)
122
147
 
@@ -142,19 +167,29 @@ class IngestConsumer:
142
167
  logger.warning("Could not delete blob reference", exc_info=True)
143
168
 
144
169
  async def subscription_worker(self, msg: Msg):
170
+ kbid: Optional[str] = None
145
171
  subject = msg.subject
146
172
  reply = msg.reply
147
173
  seqid = int(reply.split(".")[5])
148
174
  message_source = "<msg source not set>"
175
+ num_delivered = msg.metadata.num_delivered
176
+ if num_delivered > 1:
177
+ logger.warning(
178
+ "Message has been redelivered",
179
+ extra={
180
+ "seqid": seqid,
181
+ "subject": subject,
182
+ "reply": reply,
183
+ "num_delivered": num_delivered,
184
+ },
185
+ )
149
186
  start = time.monotonic()
150
187
 
151
188
  async with (
152
189
  MessageProgressUpdater(msg, nats_consumer_settings.nats_ack_wait * 0.66),
153
190
  self.lock,
154
191
  ):
155
- logger.info(
156
- f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}"
157
- )
192
+ logger.info(f"Message processing: subject:{subject}, seqid: {seqid}, reply: {reply}")
158
193
  try:
159
194
  pb = await self.get_broker_message(msg)
160
195
  if pb.source == pb.MessageSource.PROCESSOR:
@@ -170,32 +205,27 @@ class IngestConsumer:
170
205
  f"Received from {message_source} on {pb.kbid}/{pb.uuid} seq {seqid} partition {self.partition} at {time}" # noqa
171
206
  )
172
207
  context.add_context({"kbid": pb.kbid, "rid": pb.uuid})
173
-
208
+ kbid = pb.kbid
174
209
  try:
175
- with consumer_observer(
176
- {
177
- "source": (
178
- "writer"
179
- if pb.source == pb.MessageSource.WRITER
180
- else "processor"
181
- )
182
- }
183
- ):
210
+ source = "writer" if pb.source == pb.MessageSource.WRITER else "processor"
211
+ with consumer_observer({"source": source, "partition": self.partition}):
184
212
  await self._process(pb, seqid)
185
213
  except SequenceOrderViolation as err:
186
- log_func = logger.error
187
- if seqid == err.last_seqid: # pragma: no cover
188
- # Occasional retries of the last processed message may happen
189
- log_func = logger.warning
190
- log_func(
191
- f"Old txn: DISCARD (nucliadb seqid: {seqid}, partition: {self.partition}). Current seqid: {err.last_seqid}" # noqa
214
+ logger.log(
215
+ level=logging.ERROR if seqid < err.last_seqid else logging.WARNING,
216
+ msg="Old txn. Discarding message",
217
+ extra={
218
+ "stored_seqid": err.last_seqid,
219
+ "message_seqid": seqid,
220
+ "partition": self.partition,
221
+ "kbid": pb.kbid,
222
+ "msg_delivered_count": msg.metadata.num_delivered,
223
+ },
192
224
  )
193
225
  else:
194
226
  message_type_name = pb.MessageType.Name(pb.type)
195
227
  time_to_process = time.monotonic() - start
196
- log_level = (
197
- logging.INFO if time_to_process < 10 else logging.WARNING
198
- )
228
+ log_level = logging.INFO if time_to_process < 10 else logging.WARNING
199
229
  logger.log(
200
230
  log_level,
201
231
  f"Successfully processed {message_type_name} message",
@@ -218,7 +248,8 @@ class IngestConsumer:
218
248
  f"A copy of the message has been stored on {self.processor.storage.deadletter_bucket}. "
219
249
  f"Check sentry for more details: {str(e)}"
220
250
  )
221
- await msg.ack()
251
+ await self.ack_message(msg, kbid)
252
+ logger.info("Message acked because of deadletter", extra={"seqid": seqid})
222
253
  except (ShardsNotFound,) as e:
223
254
  # Any messages that for some unexpected inconsistency have failed and won't be tried again
224
255
  # as we cannot do anything about it
@@ -229,7 +260,8 @@ class IngestConsumer:
229
260
  f"This message has been dropped and won't be retried again"
230
261
  f"Check sentry for more details: {str(e)}"
231
262
  )
232
- await msg.ack()
263
+ await self.ack_message(msg, kbid)
264
+ logger.info("Message acked because of drop", extra={"seqid": seqid})
233
265
  except Exception as e:
234
266
  # Unhandled exceptions that need to be retried after a small delay
235
267
  errors.capture_exception(e)
@@ -239,10 +271,12 @@ class IngestConsumer:
239
271
  f"Check sentry for more details: {str(e)}"
240
272
  )
241
273
  await msg.nak()
274
+ logger.info("Message nacked because of unhandled error", extra={"seqid": seqid})
242
275
  raise e
243
276
  else:
244
277
  # Successful processing
245
- await msg.ack()
278
+ await self.ack_message(msg, kbid)
279
+ logger.info("Message acked because of success", extra={"seqid": seqid})
246
280
  await self.clean_broker_message(msg)
247
281
 
248
282
 
@@ -259,28 +293,29 @@ class IngestProcessedConsumer(IngestConsumer):
259
293
 
260
294
  async def setup_nats_subscription(self):
261
295
  subject = const.Streams.INGEST_PROCESSED.subject
262
- await self.nats_connection_manager.subscribe(
263
- subject=subject,
264
- queue=const.Streams.INGEST_PROCESSED.group,
296
+ durable_name = const.Streams.INGEST_PROCESSED.group
297
+ self.subscription = await self.nats_connection_manager.pull_subscribe(
265
298
  stream=const.Streams.INGEST_PROCESSED.name,
266
- flow_control=True,
299
+ subject=subject,
300
+ durable=durable_name,
267
301
  cb=self.subscription_worker,
268
302
  subscription_lost_cb=self.setup_nats_subscription,
269
303
  config=nats.js.api.ConsumerConfig(
304
+ durable_name=durable_name,
270
305
  ack_policy=nats.js.api.AckPolicy.EXPLICIT,
271
- max_ack_pending=100, # custom ack pending here
306
+ deliver_policy=nats.js.api.DeliverPolicy.ALL,
307
+ # We set it to 20 because we don't care about order here and we want to be able to HPA based
308
+ # on the number of pending messages in the queue.
309
+ max_ack_pending=20,
272
310
  max_deliver=nats_consumer_settings.nats_max_deliver,
273
311
  ack_wait=nats_consumer_settings.nats_ack_wait,
274
- idle_heartbeat=nats_consumer_settings.nats_idle_heartbeat,
275
312
  ),
276
313
  )
277
314
  logger.info(
278
- f"Subscribed to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
315
+ f"Subscribed pull consumer to {subject} on stream {const.Streams.INGEST_PROCESSED.name}"
279
316
  )
280
317
 
281
- @backoff.on_exception(
282
- backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4
283
- )
318
+ @backoff.on_exception(backoff.expo, (ConflictError,), jitter=backoff.random_jitter, max_tries=4)
284
319
  async def _process(self, pb: BrokerMessage, seqid: int):
285
320
  """
286
321
  We are setting `transaction_check` to False here because we can not mix
@@ -83,27 +83,20 @@ class MaterializerHandler:
83
83
  notification.ParseFromString(data)
84
84
 
85
85
  if (
86
- notification.action
87
- != writer_pb2.Notification.Action.COMMIT # only on commits
86
+ notification.action != writer_pb2.Notification.Action.COMMIT # only on commits
88
87
  or notification.write_type
89
88
  == writer_pb2.Notification.WriteType.MODIFIED # only on new resources and deletes
90
89
  ):
91
90
  return
92
91
 
93
- self.task_handler.schedule(
94
- notification.kbid, partial(self.process, notification.kbid)
95
- )
92
+ self.task_handler.schedule(notification.kbid, partial(self.process, notification.kbid))
96
93
 
97
94
  async def process(self, kbid: str) -> None:
98
95
  logger.info(f"Materializing knowledgebox", extra={"kbid": kbid})
99
- async with datamanagers.with_transaction(read_only=True) as txn:
100
- value = await datamanagers.resources.calculate_number_of_resources(
101
- txn, kbid=kbid
102
- )
96
+ async with datamanagers.with_ro_transaction() as txn:
97
+ value = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
103
98
  async with datamanagers.with_transaction() as txn:
104
- await datamanagers.resources.set_number_of_resources(
105
- txn, kbid=kbid, value=value
106
- )
99
+ await datamanagers.resources.set_number_of_resources(txn, kbid=kbid, value=value)
107
100
  await txn.commit()
108
101
 
109
102
  audit = get_audit()
@@ -21,10 +21,7 @@ import asyncio
21
21
  import base64
22
22
  from typing import Optional
23
23
 
24
- import nats
25
- import nats.errors
26
24
  from aiohttp.client_exceptions import ClientConnectorError
27
- from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
28
25
 
29
26
  from nucliadb.common import datamanagers
30
27
  from nucliadb.common.http_clients.processing import ProcessingHTTPClient, get_nua_api_id
@@ -32,11 +29,13 @@ from nucliadb.common.maindb.driver import Driver
32
29
  from nucliadb.ingest import logger, logger_activity
33
30
  from nucliadb.ingest.orm.exceptions import ReallyStopPulling
34
31
  from nucliadb.ingest.orm.processor import Processor
32
+ from nucliadb_protos.writer_pb2 import BrokerMessage, BrokerMessageBlobReference
35
33
  from nucliadb_telemetry import errors
36
34
  from nucliadb_utils import const
37
35
  from nucliadb_utils.cache.pubsub import PubSubDriver
38
36
  from nucliadb_utils.settings import nuclia_settings
39
37
  from nucliadb_utils.storages.storage import Storage
38
+ from nucliadb_utils.transaction import MaxTransactionSizeExceededError
40
39
  from nucliadb_utils.utilities import get_storage, get_transaction_utility
41
40
 
42
41
 
@@ -78,9 +77,7 @@ class PullWorker:
78
77
  data = base64.b64decode(payload)
79
78
  pb.ParseFromString(data)
80
79
 
81
- logger.debug(
82
- f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}"
83
- )
80
+ logger.debug(f"Resource: {pb.uuid} KB: {pb.kbid} ProcessingID: {pb.processing_id}")
84
81
 
85
82
  if not self.local_subscriber:
86
83
  transaction_utility = get_transaction_utility()
@@ -93,11 +90,9 @@ class PullWorker:
93
90
  # send to separate processor
94
91
  target_subject=const.Streams.INGEST_PROCESSED.subject,
95
92
  )
96
- except nats.errors.MaxPayloadError:
93
+ except MaxTransactionSizeExceededError:
97
94
  storage = await get_storage()
98
- stored_key = await storage.set_stream_message(
99
- kbid=pb.kbid, rid=pb.uuid, data=data
100
- )
95
+ stored_key = await storage.set_stream_message(kbid=pb.kbid, rid=pb.uuid, data=data)
101
96
  referenced_pb = BrokerMessageBlobReference(
102
97
  uuid=pb.uuid, kbid=pb.kbid, storage_key=stored_key
103
98
  )
@@ -141,9 +136,7 @@ class PullWorker:
141
136
  try:
142
137
  pull_type_id = get_nua_api_id()
143
138
  except Exception as exc:
144
- logger.exception(
145
- "Could not read NUA API Key. Can not start pull worker"
146
- )
139
+ logger.exception("Could not read NUA API Key. Can not start pull worker")
147
140
  raise ReallyStopPulling() from exc
148
141
  else:
149
142
  pull_type_id = "main"
@@ -152,7 +145,7 @@ class PullWorker:
152
145
  logger.info(f"Collecting from NucliaDB Cloud {self.partition} partition")
153
146
  while True:
154
147
  try:
155
- async with datamanagers.with_transaction() as txn:
148
+ async with datamanagers.with_ro_transaction() as txn:
156
149
  cursor = await datamanagers.processing.get_pull_offset(
157
150
  txn, pull_type_id=pull_type_id, partition=self.partition
158
151
  )
@@ -176,9 +169,7 @@ class PullWorker:
176
169
  await self.handle_message(payload)
177
170
  except Exception as e:
178
171
  errors.capture_exception(e)
179
- logger.exception(
180
- "Error while pulling and processing message/s"
181
- )
172
+ logger.exception("Error while pulling and processing message/s")
182
173
  raise e
183
174
  async with datamanagers.with_transaction() as txn:
184
175
  await datamanagers.processing.set_pull_offset(
@@ -189,9 +180,7 @@ class PullWorker:
189
180
  )
190
181
  await txn.commit()
191
182
  elif data.status == "empty":
192
- logger_activity.debug(
193
- f"No messages waiting in partition #{self.partition}"
194
- )
183
+ logger_activity.debug(f"No messages waiting in partition #{self.partition}")
195
184
  await asyncio.sleep(self.pull_time_empty_backoff)
196
185
  else:
197
186
  logger.info(f"Proxy pull answered with error: {data}")
@@ -202,9 +191,7 @@ class PullWorker:
202
191
  KeyboardInterrupt,
203
192
  SystemExit,
204
193
  ):
205
- logger.info(
206
- f"Pull task for partition #{self.partition} was canceled, exiting"
207
- )
194
+ logger.info(f"Pull task for partition #{self.partition} was canceled, exiting")
208
195
  raise ReallyStopPulling()
209
196
 
210
197
  except ClientConnectorError:
@@ -214,14 +201,12 @@ class PullWorker:
214
201
  )
215
202
  await asyncio.sleep(self.pull_time_error_backoff)
216
203
 
217
- except nats.errors.MaxPayloadError as e:
204
+ except MaxTransactionSizeExceededError as e:
218
205
  if data is not None:
219
206
  payload_length = 0
220
207
  if data.payload:
221
208
  payload_length = len(base64.b64decode(data.payload))
222
- logger.error(
223
- f"Message too big for transaction: {payload_length}"
224
- )
209
+ logger.error(f"Message too big for transaction: {payload_length}")
225
210
  raise e
226
211
  except Exception:
227
212
  logger.exception("Unhandled error pulling messages from processing")