nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -16,704 +16,6 @@
16
16
  #
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import asyncio
21
- import logging
22
- from typing import Optional
23
-
24
- import aiohttp.client_exceptions
25
-
26
- from nucliadb.common import datamanagers, locking
27
- from nucliadb.common.cluster.settings import settings as cluster_settings
28
- from nucliadb.common.cluster.utils import get_shard_manager
29
- from nucliadb.common.maindb.driver import Driver, Transaction
30
- from nucliadb.common.maindb.exceptions import ConflictError
31
- from nucliadb.ingest.orm.exceptions import (
32
- DeadletteredError,
33
- KnowledgeBoxConflict,
34
- ResourceNotIndexable,
35
- SequenceOrderViolation,
36
- )
37
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
38
- from nucliadb.ingest.orm.metrics import processor_observer
39
- from nucliadb.ingest.orm.processor import sequence_manager
40
- from nucliadb.ingest.orm.processor.auditing import collect_audit_fields
41
- from nucliadb.ingest.orm.resource import Resource
42
- from nucliadb_protos import (
43
- knowledgebox_pb2,
44
- noderesources_pb2,
45
- nodewriter_pb2,
46
- resources_pb2,
47
- utils_pb2,
48
- writer_pb2,
49
- )
50
- from nucliadb_telemetry import errors
51
- from nucliadb_utils import const
52
- from nucliadb_utils.cache.pubsub import PubSubDriver
53
- from nucliadb_utils.storages.storage import Storage
54
- from nucliadb_utils.utilities import get_storage
55
-
56
- logger = logging.getLogger(__name__)
57
-
58
-
59
- MESSAGE_TO_NOTIFICATION_SOURCE = {
60
- writer_pb2.BrokerMessage.MessageSource.WRITER: writer_pb2.NotificationSource.WRITER,
61
- writer_pb2.BrokerMessage.MessageSource.PROCESSOR: writer_pb2.NotificationSource.PROCESSOR,
62
- }
63
-
64
-
65
- def validate_indexable_resource(resource: noderesources_pb2.Resource) -> None:
66
- """
67
- It would be more optimal to move this to another layer but it'd also make the code
68
- more difficult to grok and test because we'd need to move processable check and throw
69
- an exception in the middle of a bunch of processing logic.
70
-
71
- As it is implemented right now, we just do the check if a resource is indexable right
72
- before we actually try to index it and not buried it somewhere else in the code base.
73
-
74
- This is still an edge case.
75
- """
76
- num_paragraphs = 0
77
- for _, fparagraph in resource.paragraphs.items():
78
- # this count should not be very expensive to do since we don't have
79
- # a lot of different fields and we just do a count on a dict
80
- num_paragraphs += len(fparagraph.paragraphs)
81
-
82
- if num_paragraphs > cluster_settings.max_resource_paragraphs:
83
- raise ResourceNotIndexable(
84
- "Resource has too many paragraphs. "
85
- f"Supported: {cluster_settings.max_resource_paragraphs} , Number: {num_paragraphs}"
86
- )
87
-
88
-
89
- class Processor:
90
- """
91
- This class is responsible for processing messages from the broker
92
- and attempts to manage sequencing correctly with a txn id implementation.
93
-
94
- The "txn" in this implementation is oriented around the sequence id of
95
- messages coming through the message broker.
96
-
97
- Not all writes are going to have a transaction id. For example, writes
98
- coming from processor can be coming through a different channel
99
- and can not use the txn id
100
- """
101
-
102
- messages: dict[str, list[writer_pb2.BrokerMessage]]
103
-
104
- def __init__(
105
- self,
106
- driver: Driver,
107
- storage: Storage,
108
- pubsub: Optional[PubSubDriver] = None,
109
- partition: Optional[str] = None,
110
- ):
111
- self.messages = {}
112
- self.driver = driver
113
- self.storage = storage
114
- self.partition = partition
115
- self.pubsub = pubsub
116
- self.shard_manager = get_shard_manager()
117
-
118
- async def process(
119
- self,
120
- message: writer_pb2.BrokerMessage,
121
- seqid: int,
122
- partition: Optional[str] = None,
123
- transaction_check: bool = True,
124
- ) -> None:
125
- partition = partition if self.partition is None else self.partition
126
- if partition is None:
127
- raise AttributeError("Can't process message from unknown partition")
128
-
129
- # When running in transactional mode, we need to check that
130
- # that the current message doesn't violate the sequence order for the
131
- # current partition
132
- if transaction_check:
133
- last_seqid = await sequence_manager.get_last_seqid(self.driver, partition)
134
- if last_seqid is not None and seqid <= last_seqid:
135
- raise SequenceOrderViolation(last_seqid)
136
-
137
- if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
138
- await self.delete_resource(message, seqid, partition, transaction_check)
139
- elif message.type == writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT:
140
- await self.txn([message], seqid, partition, transaction_check)
141
- elif message.type == writer_pb2.BrokerMessage.MessageType.MULTI:
142
- # XXX Not supported right now
143
- # MULTI, COMMIT and ROLLBACK are all not supported in transactional mode right now
144
- # This concept is probably not tenable with current architecture because
145
- # of how nats works and how we would need to manage rollbacks.
146
- # XXX Should this be removed?
147
- await self.multi(message, seqid)
148
- elif message.type == writer_pb2.BrokerMessage.MessageType.COMMIT:
149
- await self.commit(message, seqid, partition)
150
- elif message.type == writer_pb2.BrokerMessage.MessageType.ROLLBACK:
151
- await self.rollback(message, seqid, partition)
152
-
153
- async def get_resource_uuid(
154
- self, kb: KnowledgeBox, message: writer_pb2.BrokerMessage
155
- ) -> str:
156
- if message.uuid is None:
157
- uuid = await kb.get_resource_uuid_by_slug(message.slug)
158
- else:
159
- uuid = message.uuid
160
- return uuid
161
-
162
- @processor_observer.wrap({"type": "delete_resource"})
163
- async def delete_resource(
164
- self,
165
- message: writer_pb2.BrokerMessage,
166
- seqid: int,
167
- partition: str,
168
- transaction_check: bool = True,
169
- ) -> None:
170
- txn = await self.driver.begin()
171
- try:
172
- kb = KnowledgeBox(txn, self.storage, message.kbid)
173
-
174
- uuid = await self.get_resource_uuid(kb, message)
175
- async with locking.distributed_lock(
176
- locking.RESOURCE_INDEX_LOCK.format(kbid=message.kbid, resource_id=uuid)
177
- ):
178
- # we need to have a lock at indexing time because we don't know if
179
- # a resource was in the process of being moved when a delete occurred
180
- shard_id = await datamanagers.resources.get_resource_shard_id(
181
- txn, kbid=message.kbid, rid=uuid
182
- )
183
- if shard_id is None:
184
- logger.warning(f"Resource {uuid} does not exist")
185
- else:
186
- shard = await kb.get_resource_shard(shard_id)
187
- if shard is None:
188
- raise AttributeError("Shard not available")
189
-
190
- await self.shard_manager.delete_resource(
191
- shard, message.uuid, seqid, partition, message.kbid
192
- )
193
- try:
194
- await kb.delete_resource(message.uuid)
195
- except Exception as exc:
196
- await txn.abort()
197
- await self.notify_abort(
198
- partition=partition,
199
- seqid=seqid,
200
- multi=message.multiid,
201
- kbid=message.kbid,
202
- rid=message.uuid,
203
- source=message.source,
204
- )
205
- raise exc
206
- finally:
207
- if txn.open:
208
- if transaction_check:
209
- await sequence_manager.set_last_seqid(txn, partition, seqid)
210
- await txn.commit()
211
- await self.notify_commit(
212
- partition=partition,
213
- seqid=seqid,
214
- multi=message.multiid,
215
- message=message,
216
- write_type=writer_pb2.Notification.WriteType.DELETED,
217
- )
218
-
219
- @processor_observer.wrap({"type": "commit_slug"})
220
- async def commit_slug(self, resource: Resource) -> None:
221
- # Slug may have conflicts as its not partitioned properly,
222
- # so we commit it in a different transaction to make it as short as possible
223
- prev_txn = resource.txn
224
- try:
225
- async with self.driver.transaction() as txn:
226
- resource.txn = txn
227
- await resource.set_slug()
228
- await txn.commit()
229
- finally:
230
- resource.txn = prev_txn
231
-
232
- @processor_observer.wrap({"type": "txn"})
233
- async def txn(
234
- self,
235
- messages: list[writer_pb2.BrokerMessage],
236
- seqid: int,
237
- partition: str,
238
- transaction_check: bool = True,
239
- ) -> None:
240
- if len(messages) == 0:
241
- return None
242
-
243
- txn = await self.driver.begin()
244
- kbid = messages[0].kbid
245
- if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
246
- logger.info(f"KB {kbid} is deleted: skiping txn")
247
- if transaction_check:
248
- await sequence_manager.set_last_seqid(txn, partition, seqid)
249
- await txn.commit()
250
- return None
251
-
252
- try:
253
- multi = messages[0].multiid
254
- kb = KnowledgeBox(txn, self.storage, kbid)
255
- uuid = await self.get_resource_uuid(kb, messages[0])
256
- resource: Optional[Resource] = None
257
- handled_exception = None
258
- created = False
259
-
260
- for message in messages:
261
- if resource is not None:
262
- assert resource.uuid == message.uuid
263
- result = await self.apply_resource(message, kb, resource)
264
-
265
- if result is None:
266
- continue
267
-
268
- resource, _created = result
269
- created = created or _created
270
-
271
- if resource:
272
- await resource.compute_global_text()
273
- await resource.compute_global_tags(resource.indexer)
274
- await resource.compute_security(resource.indexer)
275
- if message.reindex:
276
- # when reindexing, let's just generate full new index message
277
- resource.replace_indexer(await resource.generate_index_message())
278
-
279
- if resource and resource.modified:
280
- await self.index_resource( # noqa
281
- resource=resource,
282
- txn=txn,
283
- uuid=uuid,
284
- kbid=kbid,
285
- seqid=seqid,
286
- partition=partition,
287
- kb=kb,
288
- source=messages_source(messages),
289
- )
290
-
291
- if transaction_check:
292
- await sequence_manager.set_last_seqid(txn, partition, seqid)
293
- await txn.commit()
294
-
295
- if created:
296
- await self.commit_slug(resource)
297
-
298
- await self.notify_commit(
299
- partition=partition,
300
- seqid=seqid,
301
- multi=multi,
302
- message=message,
303
- write_type=(
304
- writer_pb2.Notification.WriteType.CREATED
305
- if created
306
- else writer_pb2.Notification.WriteType.MODIFIED
307
- ),
308
- )
309
- elif resource and resource.modified is False:
310
- await txn.abort()
311
- await self.notify_abort(
312
- partition=partition,
313
- seqid=seqid,
314
- multi=multi,
315
- kbid=kbid,
316
- rid=uuid,
317
- source=message.source,
318
- )
319
- logger.warning("This message did not modify the resource")
320
- except (
321
- asyncio.TimeoutError,
322
- asyncio.CancelledError,
323
- aiohttp.client_exceptions.ClientError,
324
- ConflictError,
325
- ): # pragma: no cover
326
- # Unhandled exceptions here that should bubble and hard fail
327
- # XXX We swallow too many exceptions here!
328
- await self.notify_abort(
329
- partition=partition,
330
- seqid=seqid,
331
- multi=multi,
332
- kbid=kbid,
333
- rid=uuid,
334
- source=message.source,
335
- )
336
- raise
337
- except Exception as exc:
338
- # As we are in the middle of a transaction, we cannot let the exception raise directly
339
- # as we need to do some cleanup. The exception will be reraised at the end of the function
340
- # and then handled by the top caller, so errors can be handled in the same place.
341
- await self.deadletter(messages, partition, seqid)
342
- await self.notify_abort(
343
- partition=partition,
344
- seqid=seqid,
345
- multi=multi,
346
- kbid=kbid,
347
- rid=uuid,
348
- source=message.source,
349
- )
350
- handled_exception = exc
351
- finally:
352
- if resource is not None:
353
- resource.clean()
354
- # txn should be already commited or aborted, but in the event of an exception
355
- # it could be left open. Make sure to close it if it's still open
356
- if txn.open:
357
- await txn.abort()
358
-
359
- if handled_exception is not None:
360
- if seqid == -1:
361
- raise handled_exception
362
- else:
363
- if resource is not None:
364
- await self._mark_resource_error(kb, resource, partition, seqid)
365
- raise DeadletteredError() from handled_exception
366
-
367
- return None
368
-
369
- @processor_observer.wrap({"type": "index_resource"})
370
- async def index_resource(
371
- self,
372
- resource: Resource,
373
- txn: Transaction,
374
- uuid: str,
375
- kbid: str,
376
- seqid: int,
377
- partition: str,
378
- kb: KnowledgeBox,
379
- source: nodewriter_pb2.IndexMessageSource.ValueType,
380
- ) -> None:
381
- validate_indexable_resource(resource.indexer.brain)
382
-
383
- async with locking.distributed_lock(
384
- locking.RESOURCE_INDEX_LOCK.format(kbid=kbid, resource_id=uuid)
385
- ):
386
- # we need to have a lock at indexing time because we don't know if
387
- # a resource was move to another shard while it was being indexed
388
- shard_id = await datamanagers.resources.get_resource_shard_id(
389
- txn, kbid=kbid, rid=uuid
390
- )
391
-
392
- shard = None
393
- if shard_id is not None:
394
- shard = await kb.get_resource_shard(shard_id)
395
-
396
- if shard is None:
397
- # It's a new resource, get current active shard to place
398
- # new resource on
399
- shard = await self.shard_manager.get_current_active_shard(txn, kbid)
400
- if shard is None:
401
- # no shard available, create a new one
402
- shard = await self.shard_manager.create_shard_by_kbid(txn, kbid)
403
- await datamanagers.resources.set_resource_shard_id(
404
- txn, kbid=kbid, rid=uuid, shard=shard.shard
405
- )
406
-
407
- if shard is not None:
408
- index_message = resource.indexer.brain
409
- await self.shard_manager.add_resource(
410
- shard,
411
- index_message,
412
- seqid,
413
- partition=partition,
414
- kb=kbid,
415
- source=source,
416
- )
417
- else:
418
- raise AttributeError("Shard is not available")
419
-
420
- async def multi(self, message: writer_pb2.BrokerMessage, seqid: int) -> None:
421
- self.messages.setdefault(message.multiid, []).append(message)
422
-
423
- async def commit(
424
- self, message: writer_pb2.BrokerMessage, seqid: int, partition: str
425
- ) -> None:
426
- if message.multiid not in self.messages:
427
- # Error
428
- logger.error(f"Closed multi {message.multiid}")
429
- await self.deadletter([message], partition, seqid)
430
- else:
431
- await self.txn(self.messages[message.multiid], seqid, partition)
432
-
433
- async def rollback(
434
- self, message: writer_pb2.BrokerMessage, seqid: int, partition: str
435
- ) -> None:
436
- # Error
437
- logger.error(f"Closed multi {message.multiid}")
438
- del self.messages[message.multiid]
439
- await self.notify_abort(
440
- partition=partition,
441
- seqid=seqid,
442
- multi=message.multiid,
443
- kbid=message.kbid,
444
- rid=message.uuid,
445
- source=message.source,
446
- )
447
-
448
- async def deadletter(
449
- self, messages: list[writer_pb2.BrokerMessage], partition: str, seqid: int
450
- ) -> None:
451
- for seq, message in enumerate(messages):
452
- await self.storage.deadletter(message, seq, seqid, partition)
453
-
454
- @processor_observer.wrap({"type": "apply_resource"})
455
- async def apply_resource(
456
- self,
457
- message: writer_pb2.BrokerMessage,
458
- kb: KnowledgeBox,
459
- resource: Optional[Resource] = None,
460
- ) -> Optional[tuple[Resource, bool]]:
461
- """
462
- Convert a broker message into a resource object, and apply it to the database
463
- """
464
- created = False
465
-
466
- if resource is None:
467
- # Make sure we load the resource in case it already exists on db
468
- if message.uuid is None and message.slug:
469
- uuid = await kb.get_resource_uuid_by_slug(message.slug)
470
- else:
471
- uuid = message.uuid
472
- resource = await kb.get(uuid)
473
-
474
- if resource is None and message.source is message.MessageSource.WRITER:
475
- # It's a new resource
476
- resource = await kb.add_resource(uuid, message.slug, message.basic)
477
- created = True
478
- elif resource is not None:
479
- # It's an update of an existing resource, can come either from writer or
480
- # from processing
481
- await self.maybe_update_resource_basic(resource, message)
482
- elif resource is None and message.source is message.MessageSource.PROCESSOR:
483
- # It's a new resource, and somehow we received the message coming from processing before
484
- # the "fast" one, this shouldn't happen
485
- logger.info(
486
- f"Secondary message for resource {message.uuid} and resource does not exist, ignoring"
487
- )
488
- return None
489
-
490
- if resource is None:
491
- return None
492
-
493
- if message.HasField("origin"):
494
- await resource.set_origin(message.origin)
495
-
496
- if message.HasField("extra"):
497
- await resource.set_extra(message.extra)
498
-
499
- if message.HasField("security"):
500
- await resource.set_security(message.security)
501
-
502
- await resource.apply_fields(message)
503
- await resource.apply_extracted(message)
504
- return (resource, created)
505
-
506
- async def maybe_update_resource_basic(
507
- self, resource: Resource, message: writer_pb2.BrokerMessage
508
- ) -> None:
509
- basic_field_updates = message.HasField("basic")
510
- deleted_fields = len(message.delete_fields) > 0
511
- if not (basic_field_updates or deleted_fields):
512
- return
513
-
514
- await resource.set_basic(
515
- message.basic,
516
- deleted_fields=message.delete_fields, # type: ignore
517
- )
518
-
519
- async def get_extended_audit_data(
520
- self, message: writer_pb2.BrokerMessage
521
- ) -> writer_pb2.Audit:
522
- message_audit = writer_pb2.Audit()
523
- message_audit.CopyFrom(message.audit)
524
- message_audit.kbid = message.kbid
525
- message_audit.uuid = message.uuid
526
- message_audit.message_source = message.source
527
- message_audit.field_metadata.extend(
528
- [fcmw.field for fcmw in message.field_metadata]
529
- )
530
- audit_fields = await collect_audit_fields(self.driver, self.storage, message)
531
- message_audit.audit_fields.extend(audit_fields)
532
- return message_audit
533
-
534
- async def notify_commit(
535
- self,
536
- *,
537
- partition: str,
538
- seqid: int,
539
- multi: str,
540
- message: writer_pb2.BrokerMessage,
541
- write_type: writer_pb2.Notification.WriteType.ValueType,
542
- ):
543
- message_audit = await self.get_extended_audit_data(message)
544
- notification = writer_pb2.Notification(
545
- partition=int(partition),
546
- seqid=seqid,
547
- multi=multi,
548
- uuid=message.uuid,
549
- kbid=message.kbid,
550
- action=writer_pb2.Notification.Action.COMMIT,
551
- write_type=write_type,
552
- source=MESSAGE_TO_NOTIFICATION_SOURCE[message.source],
553
- processing_errors=len(message.errors) > 0,
554
- message_audit=message_audit,
555
- )
556
-
557
- await self.notify(
558
- const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=message.kbid),
559
- notification.SerializeToString(),
560
- )
561
-
562
- async def notify_abort(
563
- self,
564
- *,
565
- partition: str,
566
- seqid: int,
567
- multi: str,
568
- kbid: str,
569
- rid: str,
570
- source: writer_pb2.BrokerMessage.MessageSource.ValueType,
571
- ):
572
- message = writer_pb2.Notification(
573
- partition=int(partition),
574
- seqid=seqid,
575
- multi=multi,
576
- uuid=rid,
577
- kbid=kbid,
578
- action=writer_pb2.Notification.ABORT,
579
- source=MESSAGE_TO_NOTIFICATION_SOURCE[source],
580
- )
581
- await self.notify(
582
- const.PubSubChannels.RESOURCE_NOTIFY.format(kbid=kbid),
583
- message.SerializeToString(),
584
- )
585
-
586
- async def notify(self, channel, payload: bytes):
587
- if self.pubsub is not None:
588
- await self.pubsub.publish(channel, payload)
589
-
590
- async def _mark_resource_error(
591
- self, kb: KnowledgeBox, resource: Optional[Resource], partition: str, seqid: int
592
- ) -> None:
593
- """
594
- Unhandled error processing, try to mark resource as error
595
- """
596
- if resource is None or resource.basic is None:
597
- logger.info(
598
- f"Skip when resource does not even have basic metadata: {resource}"
599
- )
600
- return
601
- try:
602
- async with self.driver.transaction() as txn:
603
- kb.txn = resource.txn = txn
604
-
605
- shard_id = await datamanagers.resources.get_resource_shard_id(
606
- txn, kbid=kb.kbid, rid=resource.uuid
607
- )
608
- shard = None
609
- if shard_id is not None:
610
- shard = await kb.get_resource_shard(shard_id)
611
- if shard is None:
612
- logger.warning(
613
- "Unable to mark resource as error, shard is None. "
614
- "This should not happen so you did something special to get here."
615
- )
616
- return
617
-
618
- resource.basic.metadata.status = resources_pb2.Metadata.Status.ERROR
619
- await resource.set_basic(resource.basic)
620
- await txn.commit()
621
-
622
- resource.indexer.set_processing_status(
623
- basic=resource.basic, previous_status=resource._previous_status
624
- )
625
- await self.shard_manager.add_resource(
626
- shard, resource.indexer.brain, seqid, partition=partition, kb=kb.kbid
627
- )
628
- except Exception:
629
- logger.warning("Error while marking resource as error", exc_info=True)
630
-
631
- # KB tools
632
- # XXX: Why are these utility functions here?
633
- async def get_kb_obj(
634
- self, txn: Transaction, kbid: knowledgebox_pb2.KnowledgeBoxID
635
- ) -> Optional[KnowledgeBox]:
636
- uuid: Optional[str] = kbid.uuid
637
- if uuid == "":
638
- uuid = await datamanagers.kb.get_kb_uuid(txn, slug=kbid.slug)
639
-
640
- if uuid is None:
641
- return None
642
-
643
- if not (await datamanagers.kb.exists_kb(txn, kbid=uuid)):
644
- return None
645
-
646
- storage = await get_storage()
647
- kbobj = KnowledgeBox(txn, storage, uuid)
648
- return kbobj
649
-
650
- @processor_observer.wrap({"type": "create_kb"})
651
- async def create_kb(
652
- self,
653
- slug: str,
654
- config: Optional[knowledgebox_pb2.KnowledgeBoxConfig],
655
- semantic_model: knowledgebox_pb2.SemanticModelMetadata,
656
- forceuuid: Optional[str] = None,
657
- release_channel: utils_pb2.ReleaseChannel.ValueType = utils_pb2.ReleaseChannel.STABLE,
658
- ) -> str:
659
- async with self.driver.transaction() as txn:
660
- try:
661
- uuid, failed = await KnowledgeBox.create(
662
- txn,
663
- slug,
664
- semantic_model,
665
- uuid=forceuuid,
666
- config=config,
667
- release_channel=release_channel,
668
- )
669
- if failed:
670
- raise Exception("Failed to create KB")
671
- await txn.commit()
672
- return uuid
673
- except KnowledgeBoxConflict:
674
- raise
675
- except Exception as e:
676
- errors.capture_exception(e)
677
- raise e
678
-
679
- async def update_kb(
680
- self,
681
- kbid: str,
682
- slug: str,
683
- config: Optional[knowledgebox_pb2.KnowledgeBoxConfig],
684
- ) -> str:
685
- async with self.driver.transaction() as txn:
686
- uuid = await KnowledgeBox.update(txn, kbid, slug, config=config)
687
- await txn.commit()
688
- return uuid
689
-
690
- async def delete_kb(self, kbid: str) -> str:
691
- async with self.driver.transaction() as txn:
692
- uuid = await KnowledgeBox.delete_kb(txn, kbid)
693
- await txn.commit()
694
- return uuid
695
-
696
19
 
697
def messages_source(messages: list[writer_pb2.BrokerMessage]):
    """
    Return the index message source matching the source of ``messages``.

    WRITER is returned when every message comes from the writer (this
    includes an empty list), PROCESSOR when every message comes from the
    processor. Mixed sources are logged and reported as an error, and
    PROCESSOR is returned.
    """
    distinct_sources = {message.source for message in messages}

    if distinct_sources <= {writer_pb2.BrokerMessage.MessageSource.WRITER}:
        return nodewriter_pb2.IndexMessageSource.WRITER

    if distinct_sources <= {writer_pb2.BrokerMessage.MessageSource.PROCESSOR}:
        return nodewriter_pb2.IndexMessageSource.PROCESSOR

    # pragma: nocover
    msg = "Processor received multiple broker messages with different sources in the same txn!"
    logger.error(msg)
    errors.capture_exception(Exception(msg))
    return nodewriter_pb2.IndexMessageSource.PROCESSOR
20
+ # reexports
21
+ from .processor import Processor, validate_indexable_resource # noqa: F401