nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,752 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import asyncio
21
+ import logging
22
+ from typing import Optional
23
+
24
+ import aiohttp.client_exceptions
25
+ import nats.errors
26
+ import nats.js.errors
27
+
28
+ from nucliadb.common import datamanagers, locking
29
+ from nucliadb.common.cluster.settings import settings as cluster_settings
30
+ from nucliadb.common.cluster.utils import get_shard_manager
31
+ from nucliadb.common.external_index_providers.base import ExternalIndexManager
32
+ from nucliadb.common.external_index_providers.manager import get_external_index_manager
33
+ from nucliadb.common.maindb.driver import Driver, Transaction
34
+ from nucliadb.common.maindb.exceptions import ConflictError, MaindbServerError
35
+ from nucliadb.ingest.orm.exceptions import (
36
+ DeadletteredError,
37
+ InvalidBrokerMessage,
38
+ ResourceNotIndexable,
39
+ SequenceOrderViolation,
40
+ )
41
+ from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
42
+ from nucliadb.ingest.orm.metrics import processor_observer
43
+ from nucliadb.ingest.orm.processor import sequence_manager
44
+ from nucliadb.ingest.orm.processor.auditing import collect_audit_fields
45
+ from nucliadb.ingest.orm.processor.data_augmentation import (
46
+ get_generated_fields,
47
+ send_generated_fields_to_process,
48
+ )
49
+ from nucliadb.ingest.orm.resource import Resource
50
+ from nucliadb_protos import (
51
+ knowledgebox_pb2,
52
+ noderesources_pb2,
53
+ nodewriter_pb2,
54
+ resources_pb2,
55
+ writer_pb2,
56
+ )
57
+ from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
58
+ from nucliadb_telemetry import errors
59
+ from nucliadb_utils import const
60
+ from nucliadb_utils.cache.pubsub import PubSubDriver
61
+ from nucliadb_utils.storages.storage import Storage
62
+ from nucliadb_utils.utilities import get_storage, has_feature
63
+
64
+ from .pgcatalog import pgcatalog_delete, pgcatalog_update
65
+
66
# Module-level logger; all processor log output goes through the
# "ingest-processor" logger name.
logger = logging.getLogger("ingest-processor")

# Lookup table from the `MessageSource` enum value carried by a broker message
# to the matching `NotificationSource` enum value. Only WRITER and PROCESSOR
# have an entry.
MESSAGE_TO_NOTIFICATION_SOURCE = {
    writer_pb2.BrokerMessage.MessageSource.WRITER: writer_pb2.NotificationSource.WRITER,
    writer_pb2.BrokerMessage.MessageSource.PROCESSOR: writer_pb2.NotificationSource.PROCESSOR,
}
72
+
73
+
74
def validate_indexable_resource(resource: noderesources_pb2.Resource) -> None:
    """
    Refuse to index a resource that holds more paragraphs than the cluster allows.

    The check is deliberately done here, immediately before indexing, instead of
    deep inside the processing logic: that keeps the processing code easier to
    follow and to test, at the price of detecting the problem a little later
    than strictly necessary. This is an edge case in any event.

    Raises:
        ResourceNotIndexable: when the number of paragraphs, summed over every
            field of the resource, is greater than
            `cluster_settings.max_resource_paragraphs`.
    """
    # Counting is cheap: there are few fields per resource and each term is
    # just the length of that field's paragraphs container.
    num_paragraphs = sum(
        len(field_paragraphs.paragraphs) for field_paragraphs in resource.paragraphs.values()
    )

    if num_paragraphs <= cluster_settings.max_resource_paragraphs:
        return

    raise ResourceNotIndexable(
        "Resource has too many paragraphs. "
        f"Supported: {cluster_settings.max_resource_paragraphs} , Number: {num_paragraphs}"
    )
96
+
97
+
98
+ class Processor:
99
+ """
100
+ This class is responsible for processing messages from the broker
101
+ and attempts to manage sequencing correctly with a txn id implementation.
102
+
103
+ The "txn" in this implementation is oriented around the sequence id of
104
+ messages coming through the message broker.
105
+
106
+ Not all writes are going to have a transaction id. For example, writes
107
+ coming from processor can be coming through a different channel
108
+ and can not use the txn id
109
+ """
110
+
111
+ messages: dict[str, list[writer_pb2.BrokerMessage]]
112
+
113
def __init__(
    self,
    driver: Driver,
    storage: Storage,
    pubsub: Optional[PubSubDriver] = None,
    partition: Optional[str] = None,
):
    """
    Store the collaborators the processor works with.

    Args:
        driver: main database driver, used to open transactions.
        storage: storage handle passed on to the knowledge box objects.
        pubsub: optional pub/sub driver.
        partition: optional partition this processor is bound to.
    """
    # Collaborators supplied by the caller.
    self.driver = driver
    self.storage = storage
    self.pubsub = pubsub
    self.partition = partition

    # Starts empty; see the class-level annotation for the expected shape.
    self.messages = {}

    # Shard manager obtained from the cluster utilities.
    self.index_node_shard_manager = get_shard_manager()
126
+
127
async def process(
    self,
    message: writer_pb2.BrokerMessage,
    seqid: int,
    partition: Optional[str] = None,
    transaction_check: bool = True,
) -> None:
    """
    Dispatch a broker message to the handler that matches its message type.

    A partition configured on the processor takes precedence over the
    `partition` argument. When `transaction_check` is enabled, the message is
    rejected unless its `seqid` is greater than the last sequence id recorded
    for the partition. Messages whose type matches none of the handled types
    are ignored.

    Raises:
        AttributeError: if neither the processor nor the caller provides a partition.
        SequenceOrderViolation: if `seqid` is not greater than the last recorded one.
    """
    if self.partition is not None:
        partition = self.partition
    if partition is None:
        raise AttributeError("Can't process message from unknown partition")

    if transaction_check:
        # Transactional mode: the message must not violate the sequence
        # order of the current partition.
        last_seqid = await sequence_manager.get_last_seqid(self.driver, partition)
        if not (last_seqid is None or seqid > last_seqid):
            raise SequenceOrderViolation(last_seqid)

    message_types = writer_pb2.BrokerMessage.MessageType
    kind = message.type

    if kind == message_types.DELETE:
        await self.delete_resource(message, seqid, partition, transaction_check)
        return

    if kind == message_types.AUTOCOMMIT:
        await self.txn([message], seqid, partition, transaction_check)
        return

    if kind == message_types.MULTI:
        # XXX Not supported right now
        # MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
        # This concept is probably not tenable with current architecture because
        # of how nats works and how we would need to manage rollbacks.
        # XXX Should this be removed?
        await self.multi(message, seqid)
        return

    if kind == message_types.COMMIT:
        await self.commit(message, seqid, partition)
        return

    if kind == message_types.ROLLBACK:
        await self.rollback(message, seqid, partition)
161
+
162
async def get_resource_uuid(self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage) -> str:
    """
    Return the id of the resource a broker message refers to.

    The id carried by the message is used whenever it is set. Otherwise, if the
    message carries a slug, the id is looked up in the knowledge box by slug.

    Args:
        kb: knowledge box used for the slug lookup.
        message: broker message carrying `uuid` and/or `slug`.

    Returns:
        The resolved resource id. When it cannot be resolved, the message's own
        `uuid` value is returned unchanged (as before this fix).
    """
    # Protobuf scalar string fields are never None -- an unset field reads as
    # "" -- so the previous `message.uuid is None` test could never select the
    # slug lookup. Test for emptiness instead.
    if message.uuid:
        return message.uuid

    if message.slug:
        resolved = await kb.get_resource_uuid_by_slug(message.slug)
        if resolved:
            return resolved

    # Nothing to resolve from: keep returning what the message carried.
    return message.uuid
168
+
169
+ # Handle a DELETE broker message. In textual order the method: opens a transaction,
+ # resolves the resource id, acquires a distributed lock keyed by kbid and resource id,
+ # looks up the resource's shard id, removes the resource from the pg catalog and from
+ # either the external index (when the KB has an external index manager) or the index
+ # node shard, deletes it from the knowledge box, and finally records the seqid (only
+ # when `transaction_check` is set), commits and sends a DELETED commit notification.
+ # Raises AttributeError("Shard not available") when the shard id exists but the shard
+ # cannot be loaded; re-raises any error from the KB deletion after aborting the
+ # transaction and sending an abort notification.
+ # NOTE(review): this rendering of the file has lost its indentation, so the exact
+ # extent of the lock body and of the `if txn.open:` guard cannot be read from here --
+ # confirm against the repository before restructuring.
+ @processor_observer.wrap({"type": "delete_resource"})
170
+ async def delete_resource(
171
+ self,
172
+ message: writer_pb2.BrokerMessage,
173
+ seqid: int,
174
+ partition: str,
175
+ transaction_check: bool = True,
176
+ ) -> None:
177
+ async with self.driver.transaction() as txn:
178
+ try:
179
+ kb = KnowledgeBox(txn, self.storage, message.kbid)
180
+
181
+ uuid = await self.get_resource_uuid(kb, message)
182
+ async with locking.distributed_lock(
183
+ locking.RESOURCE_INDEX_LOCK.format(kbid=message.kbid, resource_id=uuid)
184
+ ):
185
+ # we need to have a lock at indexing time because we don't know if
186
+ # a resource was in the process of being moved when a delete occurred
187
+ shard_id = await datamanagers.resources.get_resource_shard_id(
188
+ txn, kbid=message.kbid, rid=uuid
189
+ )
190
+ if shard_id is None:
191
+ logger.warning(f"Resource {uuid} does not exist")
192
+ else:
193
+ shard = await kb.get_resource_shard(shard_id)
194
+ if shard is None:
195
+ raise AttributeError("Shard not available")
196
+ await pgcatalog_delete(txn, message.kbid, uuid)
197
+ external_index_manager = await get_external_index_manager(kbid=message.kbid)
198
+ if external_index_manager is not None:
199
+ await self.external_index_delete_resource(external_index_manager, uuid)
200
+ else:
+ # NOTE(review): from here on `message.uuid` is used, while the lock, the shard
+ # lookup and the catalog/external-index deletes use the resolved `uuid` --
+ # confirm the two are always the same value.
201
+ await self.index_node_shard_manager.delete_resource(
202
+ shard, message.uuid, seqid, partition, message.kbid
203
+ )
204
+ try:
205
+ await kb.delete_resource(message.uuid)
206
+ except Exception as exc:
+ # KB deletion failed: abort the transaction and tell listeners before re-raising.
207
+ await txn.abort()
208
+ await self.notify_abort(
209
+ partition=partition,
210
+ seqid=seqid,
211
+ multi=message.multiid,
212
+ kbid=message.kbid,
213
+ rid=message.uuid,
214
+ source=message.source,
215
+ )
216
+ raise exc
217
+ finally:
+ # The transaction is still open unless the abort path above ran.
218
+ if txn.open:
219
+ if transaction_check:
220
+ await sequence_manager.set_last_seqid(txn, partition, seqid)
221
+ await txn.commit()
222
+ await self.notify_commit(
223
+ partition=partition,
224
+ seqid=seqid,
225
+ multi=message.multiid,
226
+ message=message,
227
+ write_type=writer_pb2.Notification.WriteType.DELETED,
228
+ )
229
+
230
@processor_observer.wrap({"type": "commit_slug"})
async def commit_slug(self, resource: Resource) -> None:
    """Persist the resource slug in its own short-lived transaction.

    The transaction previously attached to `resource` is put back once the
    slug has been written, whether or not the write succeeded.
    """
    # Slug may have conflicts as its not partitioned properly,
    # so we commit it in a different transaction to make it as short as possible
    original_txn = resource.txn
    try:
        async with self.driver.transaction() as slug_txn:
            resource.txn = slug_txn
            await resource.set_slug()
            await slug_txn.commit()
    finally:
        resource.txn = original_txn
242
+
243
@processor_observer.wrap({"type": "txn"})
async def txn(
    self,
    messages: list[writer_pb2.BrokerMessage],
    seqid: int,
    partition: str,
    transaction_check: bool = True,
) -> None:
    """Apply a batch of broker messages targeting one resource within a single transaction.

    Flow:
      * An empty batch is a no-op.
      * If the knowledge box no longer exists the batch is skipped; when
        `transaction_check` is set the sequence id is still recorded.
      * Writer messages create the resource when it does not exist yet;
        processor messages for a missing resource are ignored.
      * If the resource ends up modified it is written to the catalog, indexed,
        committed and a commit notification is published. Otherwise the
        transaction is aborted and an abort notification is published.

    Args:
        messages: broker messages to apply; the first one provides the
            knowledge box id, the multi id and the resource id.
        seqid: sequence id of the batch.
        partition: partition the batch was received on.
        transaction_check: when True, `seqid` is stored as the last processed
            sequence id of the partition.

    Raises:
        DeadletteredError: an unexpected error happened and the batch was
            dead-lettered (only when `seqid` is not -1).
        Exception: the original unexpected error when `seqid` is -1, or any of
            the timeout/cancellation/client/conflict/maindb/NATS errors listed
            in the first `except` clause, which are re-raised as they are.
    """
    if len(messages) == 0:
        return None

    kbid = messages[0].kbid
    if not await datamanagers.atomic.kb.exists_kb(kbid=kbid):
        logger.info(f"KB {kbid} is deleted: skiping txn")
        if transaction_check:
            async with datamanagers.with_rw_transaction() as txn:
                await sequence_manager.set_last_seqid(txn, partition, seqid)
                await txn.commit()
        return None

    # Everything the except/finally clauses and the epilogue read is bound here,
    # before any fallible call, so an early failure (e.g. while resolving the
    # resource id) is reported as itself instead of as a NameError.
    message = messages[0]
    multi = message.multiid
    uuid = message.uuid
    resource: Optional[Resource] = None
    handled_exception = None
    created = False

    async with self.driver.transaction() as txn:
        try:
            kb = KnowledgeBox(txn, self.storage, kbid)
            uuid = await self.get_resource_uuid(kb, messages[0])

            for message in messages:
                if resource is not None:
                    assert resource.uuid == message.uuid

                if message.source == writer_pb2.BrokerMessage.MessageSource.WRITER:
                    resource = await kb.get(uuid)
                    if resource is None:
                        # It's a new resource
                        resource = await kb.add_resource(uuid, message.slug, message.basic)
                        created = True
                    else:
                        # It's an update from writer for an existing resource
                        ...

                elif message.source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
                    resource = await kb.get(uuid)
                    if resource is None:
                        logger.info(
                            f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
                        )
                        continue
                    else:
                        # It's an update from processor for an existing resource
                        ...

                    generated_fields = await get_generated_fields(message, resource)
                    if generated_fields.is_not_empty():
                        await send_generated_fields_to_process(
                            kbid, resource, generated_fields, message
                        )
                        # TODO: remove this when processor sends the field set
                        for generated_text in generated_fields.texts:
                            message.texts[
                                generated_text
                            ].generated_by.data_augmentation.SetInParent()

                else:
                    raise InvalidBrokerMessage(f"Unknown broker message source: {message.source}")

                # apply changes from the broker message to the resource
                await self.apply_resource(message, resource, update=(not created))

            # index message

            if resource:
                await resource.compute_global_text()
                await resource.compute_global_tags(resource.indexer)
                await resource.compute_security(resource.indexer)
                if message.reindex:
                    # when reindexing, let's just generate full new index message
                    resource.replace_indexer(await resource.generate_index_message(reindex=True))

            if resource and resource.modified:
                await pgcatalog_update(txn, kbid, resource)
                await self.index_resource(  # noqa
                    resource=resource,
                    txn=txn,
                    uuid=uuid,
                    kbid=kbid,
                    seqid=seqid,
                    partition=partition,
                    kb=kb,
                    source=messages_source(messages),
                )
                if transaction_check:
                    await sequence_manager.set_last_seqid(txn, partition, seqid)
                await txn.commit()

                if created:
                    await self.commit_slug(resource)

                await self.notify_commit(
                    partition=partition,
                    seqid=seqid,
                    multi=multi,
                    message=message,
                    write_type=(
                        writer_pb2.Notification.WriteType.CREATED
                        if created
                        else writer_pb2.Notification.WriteType.MODIFIED
                    ),
                )
            elif resource and resource.modified is False:
                await txn.abort()
                await self.notify_abort(
                    partition=partition,
                    seqid=seqid,
                    multi=multi,
                    kbid=kbid,
                    rid=uuid,
                    source=message.source,
                )
                logger.warning("This message did not modify the resource")
        except (
            asyncio.TimeoutError,
            asyncio.CancelledError,
            aiohttp.client_exceptions.ClientError,
            ConflictError,
            MaindbServerError,
            nats.errors.NoRespondersError,
            nats.js.errors.NoStreamResponseError,
        ):  # pragma: no cover
            # Unhandled exceptions here that should bubble and hard fail
            # XXX We swallow too many exceptions here!
            await self.notify_abort(
                partition=partition,
                seqid=seqid,
                multi=multi,
                kbid=kbid,
                rid=uuid,
                source=message.source,
            )
            raise
        except Exception as exc:
            # As we are in the middle of a transaction, we cannot let the exception raise directly
            # as we need to do some cleanup. The exception will be reraised at the end of the function
            # and then handled by the top caller, so errors can be handled in the same place.
            await self.deadletter(messages, partition, seqid)
            await self.notify_abort(
                partition=partition,
                seqid=seqid,
                multi=multi,
                kbid=kbid,
                rid=uuid,
                source=message.source,
            )
            handled_exception = exc
        finally:
            if resource is not None:
                resource.clean()
            # txn should be already commited or aborted, but in the event of an exception
            # it could be left open. Make sure to close it if it's still open
            if txn.open:
                await txn.abort()

    if handled_exception is not None:
        if seqid == -1:
            raise handled_exception
        else:
            # `resource` can only be set after `kb` was created, so `kb` is bound here.
            if resource is not None:
                await self._mark_resource_error(kb, resource, partition, seqid)
            raise DeadletteredError() from handled_exception

    return None
416
+
417
async def get_or_assign_resource_shard(
    self, txn: Transaction, kb: KnowledgeBox, uuid: str
) -> writer_pb2.ShardObject:
    """Return the shard a resource lives in, assigning one if it has none yet.

    Args:
        txn: transaction used to read and write the resource/shard mapping.
        kb: knowledge box the resource belongs to.
        uuid: resource id.

    Returns:
        The shard already assigned to the resource, or the shard that was just
        assigned to it (the KB's current active shard, or a newly created one
        when there is no active shard).

    Raises:
        AttributeError: if the resource has a shard id but the shard cannot be loaded.
    """
    kbid = kb.kbid
    async with locking.distributed_lock(
        locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=uuid)
    ):
        # we need to have a lock at indexing time because we don't know if
        # a resource was move to another shard while it was being indexed
        shard_id = await datamanagers.resources.get_resource_shard_id(txn, kbid=kbid, rid=uuid)

    shard = None
    if shard_id is not None:
        # Resource already has a shard assigned
        shard = await kb.get_resource_shard(shard_id)
        if shard is None:
            raise AttributeError("Shard not available")
    else:
        # It's a new resource, get KB's current active shard to place new resource on
        shard = await self.index_node_shard_manager.get_current_active_shard(txn, kbid)
        if shard is None:
            # No current shard available, create a new one
            shard = await self.index_node_shard_manager.create_shard_by_kbid(txn, kbid)
        # Remember the assignment so later lookups take the branch above.
        await datamanagers.resources.set_resource_shard_id(
            txn, kbid=kbid, rid=uuid, shard=shard.shard
        )
    return shard
444
+
445
@processor_observer.wrap({"type": "index_resource"})
async def index_resource(
    self,
    resource: Resource,
    txn: Transaction,
    uuid: str,
    kbid: str,
    seqid: int,
    partition: str,
    kb: KnowledgeBox,
    source: nodewriter_pb2.IndexMessageSource.ValueType,
) -> None:
    """Validate the resource index message and send it to the right index.

    The message goes to the external index manager of the knowledge box when
    there is one, and to the index node shard manager otherwise.
    """
    validate_indexable_resource(resource.indexer.brain)
    target_shard = await self.get_or_assign_resource_shard(txn, kb, uuid)
    brain = resource.indexer.brain
    external_manager = await get_external_index_manager(kbid=kbid)

    if external_manager is None:
        # No external provider configured: index on the assigned shard.
        await self.index_node_shard_manager.add_resource(
            target_shard,
            brain,
            seqid,
            partition=partition,
            kb=kbid,
            source=source,
        )
        return

    await self.external_index_add_resource(external_manager, uuid, brain)
472
+
473
async def external_index_delete_resource(
    self, external_index_manager: ExternalIndexManager, resource_uuid: str
):
    """Delete a resource from the external index, unless external indexing is being skipped."""
    if not self.should_skip_external_index(external_index_manager):
        await external_index_manager.delete_resource(resource_uuid=resource_uuid)
        return

    # Skipping is only logged; nothing is sent to the provider.
    log_extra = {
        "kbid": external_index_manager.kbid,
        "rid": resource_uuid,
        "provider": external_index_manager.type.value,
    }
    logger.warning("Skipping external index delete resource", extra=log_extra)
487
+
488
def should_skip_external_index(self, external_index_manager: ExternalIndexManager) -> bool:
    """
    Tell whether operations on the external index must be skipped for this manager.

    This is a safety measure for the case where the external index provider is not working:
    it keeps the ingestion pipeline from blocking, and is a temporary measure until async
    consumers are implemented to index to external indexes.
    """
    flag_context = {
        "kbid": external_index_manager.kbid,
        "provider": external_index_manager.type.value,
    }
    return has_feature(const.Features.SKIP_EXTERNAL_INDEX, context=flag_context, default=False)
501
+
502
async def external_index_add_resource(
    self,
    external_index_manager: ExternalIndexManager,
    resource_uuid: str,
    index_message: PBBrainResource,
):
    """Index a resource in the external index.

    Nothing is done when the index message carries no vectors operation, and
    only a warning is logged when external indexing is being skipped.
    """
    if not has_vectors_operation(index_message):
        return

    if not self.should_skip_external_index(external_index_manager):
        await external_index_manager.index_resource(
            resource_uuid=resource_uuid, resource_data=index_message
        )
        return

    log_extra = {
        "kbid": external_index_manager.kbid,
        "rid": resource_uuid,
        "provider": external_index_manager.type.value,
    }
    logger.warning("Skipping external index for resource", extra=log_extra)
523
+
524
async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
    """Buffer `message` under its multi id until the multi is committed or rolled back."""
    pending = self.messages.setdefault(message.multiid, [])
    pending.append(message)
526
+
527
async def commit(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
    """Process the messages buffered for the multi id of `message`.

    When nothing is buffered for that multi id the commit message itself is
    dead-lettered instead.
    """
    if message.multiid in self.messages:
        await self.txn(self.messages[message.multiid], seqid, partition)
        return

    # Error
    logger.error(f"Closed multi {message.multiid}")
    await self.deadletter([message], partition, seqid)
534
+
535
async def rollback(self, message: writer_pb2.BrokerMessage, seqid: int, partition: str) -> None:
    """Discard the messages buffered for the multi id of `message` and publish an abort.

    Rolling back a multi id that has nothing buffered is not an error: the
    abort notification is still published.
    """
    # Error
    logger.error(f"Closed multi {message.multiid}")
    # pop() with a default instead of `del`, so an unknown multi id does not
    # raise KeyError and skip the abort notification below.
    self.messages.pop(message.multiid, None)
    await self.notify_abort(
        partition=partition,
        seqid=seqid,
        multi=message.multiid,
        kbid=message.kbid,
        rid=message.uuid,
        source=message.source,
    )
547
+
548
async def deadletter(
    self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
) -> None:
    """Store every message of the batch in the dead letter storage, tagged with its position."""
    for position, broker_message in enumerate(messages):
        await self.storage.deadletter(broker_message, position, seqid, partition)
553
+
554
@processor_observer.wrap({"type": "apply_resource"})
async def apply_resource(
    self,
    message: writer_pb2.BrokerMessage,
    resource: Resource,
    update: bool = False,
):
    """Apply broker message to resource object in the database"""
    if update:
        await self.maybe_update_resource_basic(resource, message)

    # Optional sub-messages: each one is stored only when present in the message.
    optional_setters = (
        ("origin", resource.set_origin),
        ("extra", resource.set_extra),
        ("security", resource.set_security),
    )
    for field_name, setter in optional_setters:
        if message.HasField(field_name):
            await setter(getattr(message, field_name))

    await resource.apply_fields(message)
    await resource.apply_extracted(message)
576
+
577
async def maybe_update_resource_basic(
    self, resource: Resource, message: writer_pb2.BrokerMessage
) -> None:
    """Update the resource basic metadata when the message has `basic` set or deletes fields."""
    nothing_to_update = not message.HasField("basic") and len(message.delete_fields) == 0
    if nothing_to_update:
        return

    await resource.set_basic(
        message.basic,
        deleted_fields=message.delete_fields,  # type: ignore
    )
589
+
590
async def get_extended_audit_data(self, message: writer_pb2.BrokerMessage) -> writer_pb2.Audit:
    """Return a copy of the message audit completed with ids, source, field metadata and audit fields."""
    audit = writer_pb2.Audit()
    audit.CopyFrom(message.audit)

    audit.kbid = message.kbid
    audit.uuid = message.uuid
    audit.message_source = message.source

    metadata_fields = [fcmw.field for fcmw in message.field_metadata]
    audit.field_metadata.extend(metadata_fields)

    collected = await collect_audit_fields(self.driver, self.storage, message)
    audit.audit_fields.extend(collected)
    return audit
600
+
601
async def notify_commit(
    self,
    *,
    partition: str,
    seqid: int,
    multi: str,
    message: writer_pb2.BrokerMessage,
    write_type: writer_pb2.Notification.WriteType.ValueType,
):
    """Publish a COMMIT notification for `message` on the resource channel of its knowledge box."""
    audit = await self.get_extended_audit_data(message)
    has_processing_errors = len(message.errors) > 0

    commit_notification = writer_pb2.Notification(
        partition=int(partition),
        seqid=seqid,
        multi=multi,
        uuid=message.uuid,
        kbid=message.kbid,
        action=writer_pb2.Notification.Action.COMMIT,
        write_type=write_type,
        source=MESSAGE_TO_NOTIFICATION_SOURCE[message.source],
        processing_errors=has_processing_errors,
        message_audit=audit,
    )

    channel = const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=message.kbid)
    await self.notify(channel, commit_notification.SerializeToString())
628
+
629
async def notify_abort(
    self,
    *,
    partition: str,
    seqid: int,
    multi: str,
    kbid: str,
    rid: str,
    source: writer_pb2.BrokerMessage.MessageSource.ValueType,
):
    """Publish an ABORT notification for resource `rid` on the resource channel of `kbid`."""
    abort_notification = writer_pb2.Notification(
        partition=int(partition),
        seqid=seqid,
        multi=multi,
        uuid=rid,
        kbid=kbid,
        action=writer_pb2.Notification.ABORT,
        source=MESSAGE_TO_NOTIFICATION_SOURCE[source],
    )

    channel = const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=kbid)
    await self.notify(channel, abort_notification.SerializeToString())
652
+
653
async def notify(self, channel, payload: bytes):
    """Publish `payload` on `channel`; do nothing when no pubsub is configured."""
    if self.pubsub is None:
        return
    await self.pubsub.publish(channel, payload)
656
+
657
async def _mark_resource_error(
    self, kb: KnowledgeBox, resource: Optional[Resource], partition: str, seqid: int
) -> None:
    """
    Unhandled error processing, try to mark resource as error

    The resource status is set to ERROR and stored in a new transaction, then
    the resource is sent to its shard for indexing. This is best effort: any
    exception raised along the way is logged as a warning and not propagated.

    Nothing is done when the resource is missing, has no basic metadata, or
    its shard cannot be found.
    """
    if resource is None or resource.basic is None:
        logger.info(f"Skip when resource does not even have basic metadata: {resource}")
        return
    try:
        async with self.driver.transaction() as txn:
            # Point both the KB and the resource at the new transaction.
            kb.txn = resource.txn = txn

            shard_id = await datamanagers.resources.get_resource_shard_id(
                txn, kbid=kb.kbid, rid=resource.uuid
            )
            shard = None
            if shard_id is not None:
                shard = await kb.get_resource_shard(shard_id)
            if shard is None:
                logger.warning(
                    "Unable to mark resource as error, shard is None. "
                    "This should not happen so you did something special to get here."
                )
                return

            resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
            await resource.set_basic(resource.basic)
            await txn.commit()

        # The stored status is committed at this point; now reflect it in the index.
        resource.indexer.set_processing_status(
            basic=resource.basic, previous_status=resource._previous_status
        )
        await self.index_node_shard_manager.add_resource(
            shard, resource.indexer.brain, seqid, partition=partition, kb=kb.kbid
        )
    except Exception:
        # Best effort only: the caller is already handling the original failure.
        logger.warning("Error while marking resource as error", exc_info=True)
695
+
696
+ # KB tools
697
+ # XXX: Why are these utility functions here?
698
async def get_kb_obj(
    self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
) -> Optional[KnowledgeBox]:
    """Build the KnowledgeBox object for `kbid`, or return None when it cannot be found.

    The id is taken from `kbid.uuid`, falling back to a lookup by `kbid.slug`
    when the uuid is empty.
    """
    resolved_uuid: Optional[str] = kbid.uuid
    if resolved_uuid == "":
        resolved_uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)

    if resolved_uuid is None or not (await datamanagers.kb.exists_kb(txn, kbid=resolved_uuid)):
        return None

    return KnowledgeBox(txn, await get_storage(), resolved_uuid)
714
+
715
+
716
def messages_source(messages: list[writer_pb2.BrokerMessage]):
    """Return the index message source matching the source shared by `messages`.

    Batches that mix sources are reported as an error and treated as coming
    from the processor.
    """
    writer = writer_pb2.BrokerMessage.MessageSource.WRITER
    processor = writer_pb2.BrokerMessage.MessageSource.PROCESSOR

    if all(message.source == writer for message in messages):
        return nodewriter_pb2.IndexMessageSource.WRITER

    if not all(message.source == processor for message in messages):  # pragma: no cover
        msg = "Processor received multiple broker messages with different sources in the same txn!"
        logger.error(msg)
        errors.capture_exception(Exception(msg))

    return nodewriter_pb2.IndexMessageSource.PROCESSOR
733
+
734
+
735
def has_vectors_operation(index_message: PBBrainResource) -> bool:
    """
    Returns True if the index message has any vectors to index or to delete.
    """
    has_deletions = (
        len(index_message.sentences_to_delete) > 0
        or len(index_message.paragraphs_to_delete) > 0
        or any(
            len(deletions.items) > 0
            for deletions in index_message.vector_prefixes_to_delete.values()
        )
    )
    if has_deletions:
        return True

    # Otherwise look for at least one sentence, in the default set or in any vectorset.
    return any(
        len(paragraph.sentences) > 0
        or any(
            len(vectorset_sentences.sentences) > 0
            for vectorset_sentences in paragraph.vectorsets_sentences.values()
        )
        for field_paragraphs in index_message.paragraphs.values()
        for paragraph in field_paragraphs.paragraphs.values()
    )