nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,25 +18,27 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import functools
21
- from io import BytesIO
22
21
  from typing import AsyncGenerator, AsyncIterator, Callable, Optional
23
22
 
24
- import nats.errors
25
23
  from google.protobuf.message import DecodeError as ProtobufDecodeError
26
24
 
25
+ from nucliadb import learning_proxy
27
26
  from nucliadb.common import datamanagers
28
27
  from nucliadb.common.context import ApplicationContext
29
28
  from nucliadb.export_import import logger
30
29
  from nucliadb.export_import.datamanager import ExportImportDataManager
31
30
  from nucliadb.export_import.exceptions import (
32
31
  ExportStreamExhausted,
32
+ IncompatibleExport,
33
33
  WrongExportStreamFormat,
34
34
  )
35
35
  from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
36
+ from nucliadb.ingest.orm.broker_message import generate_broker_message
36
37
  from nucliadb_models.export_import import Status
37
38
  from nucliadb_protos import knowledgebox_pb2 as kb_pb2
38
39
  from nucliadb_protos import resources_pb2, writer_pb2
39
40
  from nucliadb_utils.const import Streams
41
+ from nucliadb_utils.transaction import MaxTransactionSizeExceededError
40
42
 
41
43
  BinaryStream = AsyncGenerator[bytes, None]
42
44
  BinaryStreamGenerator = Callable[[int], BinaryStream]
@@ -50,7 +52,6 @@ PROCESSING_BM_FIELDS = [
50
52
  "field_metadata",
51
53
  "field_vectors",
52
54
  "field_large_metadata",
53
- "user_vectors",
54
55
  ]
55
56
 
56
57
  # Broker message fields that are populated by the nucliadb writer component
@@ -59,9 +60,6 @@ WRITER_BM_FIELDS = [
59
60
  "files",
60
61
  "texts",
61
62
  "conversations",
62
- "layouts",
63
- "keywordsets",
64
- "datetimes",
65
63
  ]
66
64
 
67
65
 
@@ -88,7 +86,7 @@ async def transaction_commit(
88
86
  wait=False,
89
87
  target_subject=Streams.INGEST_PROCESSED.subject,
90
88
  )
91
- except nats.errors.MaxPayloadError:
89
+ except MaxTransactionSizeExceededError:
92
90
  stored_key = await context.blob_storage.set_stream_message(
93
91
  kbid=bm.kbid, rid=bm.uuid, data=bm.SerializeToString()
94
92
  )
@@ -151,23 +149,17 @@ async def set_entities_groups(
151
149
  context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
152
150
  ) -> None:
153
151
  async with datamanagers.with_transaction() as txn:
154
- await datamanagers.entities.set_entities_groups(
155
- txn, kbid=kbid, entities_groups=entities_groups
156
- )
152
+ await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
157
153
  await txn.commit()
158
154
 
159
155
 
160
- async def set_labels(
161
- context: ApplicationContext, kbid: str, labels: kb_pb2.Labels
162
- ) -> None:
156
+ async def set_labels(context: ApplicationContext, kbid: str, labels: kb_pb2.Labels) -> None:
163
157
  async with datamanagers.with_transaction() as txn:
164
158
  await datamanagers.labels.set_labels(txn, kbid=kbid, labels=labels)
165
159
  await txn.commit()
166
160
 
167
161
 
168
- async def iter_kb_resource_uuids(
169
- context: ApplicationContext, kbid: str
170
- ) -> AsyncGenerator[str, None]:
162
+ async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> AsyncGenerator[str, None]:
171
163
  async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
172
164
  yield rid
173
165
 
@@ -175,8 +167,13 @@ async def iter_kb_resource_uuids(
175
167
  async def get_broker_message(
176
168
  context: ApplicationContext, kbid: str, rid: str
177
169
  ) -> Optional[writer_pb2.BrokerMessage]:
178
- async with datamanagers.with_transaction() as txn:
179
- return await datamanagers.resources.get_broker_message(txn, kbid=kbid, rid=rid)
170
+ async with datamanagers.with_ro_transaction() as txn:
171
+ resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
172
+ if resource is None:
173
+ return None
174
+ resource.disable_vectors = False
175
+ resource.txn = txn
176
+ return await generate_broker_message(resource)
180
177
 
181
178
 
182
179
  def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFile]:
@@ -184,6 +181,10 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
184
181
  binaries: list[resources_pb2.CloudFile] = []
185
182
  for file_field in bm.files.values():
186
183
  if file_field.HasField("file"):
184
+ if file_field.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
185
+ # Binaries of externally hosted files are not
186
+ # to be downloaded and stored in the export file
187
+ continue
187
188
  _clone_collect_cf(binaries, file_field.file)
188
189
 
189
190
  for conversation in bm.conversations.values():
@@ -191,11 +192,6 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
191
192
  for attachment in message.content.attachments:
192
193
  _clone_collect_cf(binaries, attachment)
193
194
 
194
- for layout in bm.layouts.values():
195
- for block in layout.body.blocks.values():
196
- if block.HasField("file"):
197
- _clone_collect_cf(binaries, block.file)
198
-
199
195
  for field_extracted_data in bm.file_extracted_data:
200
196
  if field_extracted_data.HasField("file_thumbnail"):
201
197
  _clone_collect_cf(binaries, field_extracted_data.file_thumbnail)
@@ -213,6 +209,8 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
213
209
  _clone_collect_cf(binaries, link_extracted_data.link_preview)
214
210
  if link_extracted_data.HasField("link_image"):
215
211
  _clone_collect_cf(binaries, link_extracted_data.link_image)
212
+ for file_generated in link_extracted_data.file_generated.values():
213
+ _clone_collect_cf(binaries, file_generated)
216
214
 
217
215
  for field_metadata in bm.field_metadata:
218
216
  if field_metadata.metadata.metadata.HasField("thumbnail"):
@@ -225,9 +223,7 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
225
223
  return binaries
226
224
 
227
225
 
228
- def _clone_collect_cf(
229
- binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile
230
- ):
226
+ def _clone_collect_cf(binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile):
231
227
  cf = resources_pb2.CloudFile()
232
228
  cf.CopyFrom(origin)
233
229
  # Mark the cloud file of the broker message being exported as export source
@@ -244,43 +240,21 @@ async def download_binary(
244
240
 
245
241
 
246
242
  async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
247
- async with datamanagers.with_transaction() as txn:
243
+ async with datamanagers.with_ro_transaction() as txn:
248
244
  return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
249
245
 
250
246
 
251
247
  async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
252
- async with datamanagers.with_transaction() as txn:
248
+ async with datamanagers.with_ro_transaction() as txn:
253
249
  return await datamanagers.labels.get_labels(txn, kbid=kbid)
254
250
 
255
251
 
256
- class EndOfStream(Exception):
257
- ...
252
+ class EndOfStream(Exception): ...
258
253
 
259
254
 
260
255
  class ExportStream:
261
256
  """
262
257
  Models a stream of export bytes that can be read from asynchronously.
263
- """
264
-
265
- def __init__(self, export: BytesIO):
266
- self.export = export
267
- self.read_bytes = 0
268
- self._length = len(export.getvalue())
269
-
270
- async def read(self, n_bytes):
271
- """
272
- Reads n_bytes from the export stream.
273
- Raises ExportStreamExhausted if there are no more bytes to read.
274
- """
275
- if self.read_bytes == self._length:
276
- raise ExportStreamExhausted()
277
- chunk = self.export.read(n_bytes)
278
- self.read_bytes += len(chunk)
279
- return chunk
280
-
281
-
282
- class IteratorExportStream(ExportStream):
283
- """
284
258
  Adapts the parent class to be able to read bytes yielded from an async iterator.
285
259
  """
286
260
 
@@ -325,8 +299,8 @@ class ExportStreamReader:
325
299
  yields the deserialized export items ready to be imported.
326
300
  """
327
301
 
328
- def __init__(self, export_stream: ExportStream):
329
- self.stream = export_stream
302
+ def __init__(self, stream: AsyncGenerator[bytes, None]):
303
+ self.stream = ExportStream(stream)
330
304
 
331
305
  @property
332
306
  def read_bytes(self) -> int:
@@ -401,6 +375,26 @@ class ExportStreamReader:
401
375
  raise WrongExportStreamFormat() from ex
402
376
  return labels
403
377
 
378
+ async def maybe_read_learning_config(
379
+ self,
380
+ ) -> tuple[Optional[learning_proxy.LearningConfiguration], bytes]:
381
+ """
382
+ Tries to read a learning config from the beginning of the stream.
383
+ Returns the learning config if found. It also returns any leftover bytes that
384
+ may have been read from the network into memory that need to be yielded and imported.
385
+ """
386
+ # We assume that the learning config is the first item in the export stream.
387
+ try:
388
+ type_bytes = await self.stream.read(3)
389
+ except ExportStreamExhausted:
390
+ return None, self.stream.buffer
391
+ if type_bytes != ExportedItemType.LEARNING_CONFIG.value.encode():
392
+ # Backward compatible code for old exports that don't have a learning config.
393
+ return None, type_bytes + self.stream.buffer
394
+ data = await self.read_item()
395
+ lconfig = learning_proxy.LearningConfiguration.model_validate_json(data)
396
+ return lconfig, self.stream.buffer
397
+
404
398
  async def iter_items(self) -> AsyncGenerator[ExportItem, None]:
405
399
  while True:
406
400
  try:
@@ -411,7 +405,7 @@ class ExportStreamReader:
411
405
  ExportedItemType.ENTITIES: self.read_entities,
412
406
  ExportedItemType.LABELS: self.read_labels,
413
407
  }[item_type]
414
- data = await read_data_func() # type: ignore
408
+ data = await read_data_func()
415
409
  yield item_type, data
416
410
  except ExportStreamExhausted:
417
411
  break
@@ -476,3 +470,49 @@ class TaskRetryHandler:
476
470
  await self.dm.set_metadata(self.type, metadata)
477
471
 
478
472
  return wrapper
473
+
474
+
475
+ async def get_learning_config(
476
+ kbid: str,
477
+ ) -> Optional[learning_proxy.LearningConfiguration]:
478
+ return await learning_proxy.get_configuration(kbid)
479
+
480
+
481
+ def stream_compatible_with_kb(
482
+ kbid: str, stream: AsyncGenerator[bytes, None]
483
+ ) -> AsyncGenerator[bytes, None]:
484
+ """
485
+ Wrapper around an export stream that checks if the export is compatible with the destination knowledge box.
486
+ """
487
+
488
+ async def wrapped() -> AsyncGenerator[bytes, None]:
489
+ # Read a few bytes from the beginning of the stream to check the semantic model.
490
+ # If the semantic model is not compatible, raise an exception.
491
+ # If there are leftover bytes, yield them.
492
+ leftover_bytes = await _check_semantic_model_compatibility(kbid, stream)
493
+ if len(leftover_bytes) > 0:
494
+ yield leftover_bytes
495
+
496
+ # Now yield the rest of the stream
497
+ async for chunk in stream:
498
+ yield chunk
499
+
500
+ return wrapped()
501
+
502
+
503
+ async def _check_semantic_model_compatibility(kbid: str, stream: AsyncGenerator[bytes, None]) -> bytes:
504
+ stream_reader = ExportStreamReader(stream)
505
+ lconfig, leftover_bytes = await stream_reader.maybe_read_learning_config()
506
+ if lconfig is None:
507
+ logger.warning("Learning config not found on the export stream. Export may be incompatible.")
508
+ return leftover_bytes
509
+ kb_lconfig = await get_learning_config(kbid)
510
+ if kb_lconfig is None:
511
+ logger.warning("No learning config found on the knowledge box. Export may be incompatible.")
512
+ return leftover_bytes
513
+ if kb_lconfig.semantic_model == lconfig.semantic_model:
514
+ logger.info(f"Semantic model match: {kb_lconfig.semantic_model}")
515
+ return leftover_bytes
516
+ raise IncompatibleExport(
517
+ f"Cannot import. Semantic model mismatch: {kb_lconfig.semantic_model} != {lconfig.semantic_model}"
518
+ )
nucliadb/health.py CHANGED
@@ -78,9 +78,7 @@ async def grpc_health_check(health_servicer) -> None:
78
78
  for check in _health_checks:
79
79
  if not check():
80
80
  logger.info(f"Health check failed on {check.__name__}")
81
- await health_servicer.set(
82
- "", health_pb2.HealthCheckResponse.NOT_SERVING
83
- )
81
+ await health_servicer.set("", health_pb2.HealthCheckResponse.NOT_SERVING)
84
82
  break
85
83
  else:
86
84
  await health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
nucliadb/ingest/app.py CHANGED
@@ -18,10 +18,9 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
+ import importlib.metadata
21
22
  from typing import Awaitable, Callable
22
23
 
23
- import pkg_resources
24
-
25
24
  from nucliadb import health
26
25
  from nucliadb.common.cluster.discovery.utils import (
27
26
  setup_cluster_discovery,
@@ -30,10 +29,12 @@ from nucliadb.common.cluster.discovery.utils import (
30
29
  from nucliadb.common.cluster.settings import settings as cluster_settings
31
30
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
32
31
  from nucliadb.common.context import ApplicationContext
32
+ from nucliadb.common.nidx import start_nidx_utility
33
33
  from nucliadb.export_import.tasks import get_exports_consumer, get_imports_consumer
34
34
  from nucliadb.ingest import SERVICE_NAME
35
35
  from nucliadb.ingest.consumer import service as consumer_service
36
36
  from nucliadb.ingest.partitions import assign_partitions
37
+ from nucliadb.ingest.processing import start_processing_engine, stop_processing_engine
37
38
  from nucliadb.ingest.service import start_grpc
38
39
  from nucliadb.ingest.settings import settings
39
40
  from nucliadb_telemetry import errors
@@ -46,10 +47,12 @@ from nucliadb_utils.utilities import (
46
47
  start_audit_utility,
47
48
  start_indexing_utility,
48
49
  start_nats_manager,
50
+ start_partitioning_utility,
49
51
  start_transaction_utility,
50
52
  stop_audit_utility,
51
53
  stop_indexing_utility,
52
54
  stop_nats_manager,
55
+ stop_partitioning_utility,
53
56
  stop_transaction_utility,
54
57
  )
55
58
 
@@ -59,15 +62,17 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
59
62
 
60
63
  await setup_cluster()
61
64
  await start_transaction_utility(SERVICE_NAME)
62
- if (
63
- not cluster_settings.standalone_mode
64
- and indexing_settings.index_jetstream_servers is not None
65
- ):
65
+ if not cluster_settings.standalone_mode and indexing_settings.index_jetstream_servers is not None:
66
66
  await start_indexing_utility(SERVICE_NAME)
67
67
 
68
+ start_partitioning_utility()
69
+
70
+ await start_nidx_utility()
71
+
68
72
  await start_audit_utility(SERVICE_NAME)
69
73
 
70
74
  finalizers = [
75
+ stop_partitioning_utility,
71
76
  stop_transaction_utility,
72
77
  stop_indexing_utility,
73
78
  stop_audit_utility,
@@ -123,8 +128,7 @@ async def main_consumer(): # pragma: no cover
123
128
  ingest_consumers = await consumer_service.start_ingest_consumers(SERVICE_NAME)
124
129
 
125
130
  await run_until_exit(
126
- [grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown]
127
- + finalizers
131
+ [grpc_health_finalizer, pull_workers, ingest_consumers, metrics_server.shutdown] + finalizers
128
132
  )
129
133
 
130
134
 
@@ -138,12 +142,13 @@ async def main_orm_grpc(): # pragma: no cover
138
142
  async def main_ingest_processed_consumer(): # pragma: no cover
139
143
  finalizers = await initialize()
140
144
 
145
+ await start_processing_engine()
141
146
  metrics_server = await serve_metrics()
142
147
  grpc_health_finalizer = await health.start_grpc_health_service(settings.grpc_port)
143
148
  consumer = await consumer_service.start_ingest_processed_consumer(SERVICE_NAME)
144
149
 
145
150
  await run_until_exit(
146
- [grpc_health_finalizer, consumer, metrics_server.shutdown] + finalizers
151
+ [grpc_health_finalizer, consumer, metrics_server.shutdown, stop_processing_engine] + finalizers
147
152
  )
148
153
 
149
154
 
@@ -181,10 +186,9 @@ async def main_subscriber_workers(): # pragma: no cover
181
186
 
182
187
  def setup_configuration(): # pragma: no cover
183
188
  setup_logging()
184
-
185
189
  assign_partitions(settings)
186
190
 
187
- errors.setup_error_handling(pkg_resources.get_distribution("nucliadb").version)
191
+ errors.setup_error_handling(importlib.metadata.distribution("nucliadb").version)
188
192
 
189
193
  if asyncio._get_running_loop() is not None:
190
194
  raise RuntimeError("cannot be called from a running event loop")
@@ -23,14 +23,11 @@ import logging
23
23
  import uuid
24
24
  from functools import partial
25
25
 
26
- from nucliadb_protos.resources_pb2 import FieldType
27
-
26
+ from nucliadb.common import datamanagers
28
27
  from nucliadb.common.cluster.exceptions import ShardsNotFound
29
28
  from nucliadb.common.cluster.manager import choose_node
30
29
  from nucliadb.common.cluster.utils import get_shard_manager
31
- from nucliadb.common.maindb.driver import Driver
32
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
33
- from nucliadb.ingest.orm.resource import Resource
30
+ from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
34
31
  from nucliadb_protos import audit_pb2, nodereader_pb2, noderesources_pb2, writer_pb2
35
32
  from nucliadb_utils import const
36
33
  from nucliadb_utils.audit.audit import AuditStorage
@@ -62,12 +59,10 @@ class IndexAuditHandler:
62
59
  def __init__(
63
60
  self,
64
61
  *,
65
- driver: Driver,
66
62
  audit: AuditStorage,
67
63
  pubsub: PubSubDriver,
68
64
  check_delay: float = 5.0,
69
65
  ):
70
- self.driver = driver
71
66
  self.audit = audit
72
67
  self.pubsub = pubsub
73
68
  self.shard_manager = get_shard_manager()
@@ -98,17 +93,15 @@ class IndexAuditHandler:
98
93
  metrics.total_messages.inc({"action": "ignored", "type": "audit_counter"})
99
94
  return
100
95
 
101
- self.task_handler.schedule(
102
- notification.kbid, partial(self.process_kb, notification.kbid)
103
- )
96
+ self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
104
97
  metrics.total_messages.inc({"action": "scheduled", "type": "audit_counter"})
105
98
 
106
99
  @metrics.handler_histo.wrap({"type": "audit_counter"})
107
100
  async def process_kb(self, kbid: str) -> None:
108
101
  try:
109
- shard_groups: list[
110
- writer_pb2.ShardObject
111
- ] = await self.shard_manager.get_shards_by_kbid(kbid)
102
+ shard_groups: list[writer_pb2.ShardObject] = await self.shard_manager.get_shards_by_kbid(
103
+ kbid
104
+ )
112
105
  except ShardsNotFound:
113
106
  logger.warning(f"No shards found for kbid {kbid}, skipping")
114
107
  return
@@ -119,7 +112,8 @@ class IndexAuditHandler:
119
112
  total_paragraphs = 0
120
113
 
121
114
  for shard_obj in shard_groups:
122
- node, shard_id = choose_node(shard_obj)
115
+ # TODO: Uses node for auditing, don't want to suddenly change metrics
116
+ node, shard_id = choose_node(shard_obj, use_nidx=False)
123
117
  shard: nodereader_pb2.Shard = await node.reader.GetShard(
124
118
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
125
119
  )
@@ -127,12 +121,18 @@ class IndexAuditHandler:
127
121
  total_fields += shard.fields
128
122
  total_paragraphs += shard.paragraphs
129
123
 
130
- await self.audit.report(
124
+ async with datamanagers.with_ro_transaction() as txn:
125
+ num_vectorsets = (
126
+ len([vs async for vs in datamanagers.vectorsets.iter(txn=txn, kbid=kbid)]) or 1
127
+ )
128
+
129
+ self.audit.report_storage(
131
130
  kbid=kbid,
132
- audit_type=audit_pb2.AuditRequest.AuditType.INDEXED,
133
- kb_counter=audit_pb2.AuditKBCounter(
134
- fields=total_fields, paragraphs=total_paragraphs
135
- ),
131
+ paragraphs=total_paragraphs,
132
+ fields=total_fields,
133
+ bytes=total_paragraphs # This is an estimation of bytes stored in a KB
134
+ * AVG_PARAGRAPH_SIZE_BYTES
135
+ * num_vectorsets,
136
136
  )
137
137
 
138
138
 
@@ -147,12 +147,10 @@ class ResourceWritesAuditHandler:
147
147
  def __init__(
148
148
  self,
149
149
  *,
150
- driver: Driver,
151
150
  storage: Storage,
152
151
  audit: AuditStorage,
153
152
  pubsub: PubSubDriver,
154
153
  ):
155
- self.driver = driver
156
154
  self.storage = storage
157
155
  self.audit = audit
158
156
  self.pubsub = pubsub
@@ -169,117 +167,6 @@ class ResourceWritesAuditHandler:
169
167
  async def finalize(self) -> None:
170
168
  await self.pubsub.unsubscribe(self.subscription_id)
171
169
 
172
- def iterate_auditable_fields(
173
- self,
174
- resource_keys: list[tuple[FieldType.ValueType, str]],
175
- message: writer_pb2.BrokerMessage,
176
- ):
177
- """
178
- Generator that emits the combined list of field ids from both
179
- the existing resource and message that needs to be considered
180
- in the audit of fields.
181
- """
182
- yielded = set()
183
-
184
- # Include all fields present in the message we are processing
185
- for field_id in message.files.keys():
186
- key = (field_id, writer_pb2.FieldType.FILE)
187
- yield key
188
- yielded.add(key)
189
-
190
- for field_id in message.conversations.keys():
191
- key = (field_id, writer_pb2.FieldType.CONVERSATION)
192
- yield key
193
- yielded.add(key)
194
-
195
- for field_id in message.layouts.keys():
196
- key = (field_id, writer_pb2.FieldType.LAYOUT)
197
- yield key
198
- yielded.add(key)
199
-
200
- for field_id in message.texts.keys():
201
- key = (field_id, writer_pb2.FieldType.TEXT)
202
- yield key
203
- yielded.add(key)
204
-
205
- for field_id in message.keywordsets.keys():
206
- key = (field_id, writer_pb2.FieldType.KEYWORDSET)
207
- yield key
208
- yielded.add(key)
209
-
210
- for field_id in message.datetimes.keys():
211
- key = (field_id, writer_pb2.FieldType.DATETIME)
212
- yield key
213
- yielded.add(key)
214
-
215
- for field_id in message.links.keys():
216
- key = (field_id, writer_pb2.FieldType.LINK)
217
- yield key
218
- yielded.add(key)
219
-
220
- for field_type, field_id in resource_keys:
221
- if field_type is writer_pb2.FieldType.GENERIC:
222
- continue
223
-
224
- if not (
225
- field_id in message.files
226
- or message.type is writer_pb2.BrokerMessage.MessageType.DELETE
227
- ):
228
- continue
229
-
230
- # Avoid duplicates
231
- if (field_type, field_id) in yielded:
232
- continue
233
-
234
- yield (field_id, field_type)
235
-
236
- async def collect_audit_fields(
237
- self, message: writer_pb2.BrokerMessage
238
- ) -> list[audit_pb2.AuditField]:
239
- if message.type == writer_pb2.BrokerMessage.MessageType.DELETE:
240
- # If we are fully deleting a resource we won't iterate the delete_fields (if any).
241
- # Make no sense as we already collected all resource fields as deleted
242
- return []
243
-
244
- audit_storage_fields: list[audit_pb2.AuditField] = []
245
- async with self.driver.transaction() as txn:
246
- kb = KnowledgeBox(txn, self.storage, message.kbid)
247
- resource = Resource(txn, self.storage, kb, message.uuid)
248
- field_keys = await resource.get_fields_ids()
249
-
250
- for field_id, field_type in self.iterate_auditable_fields(
251
- field_keys, message
252
- ):
253
- auditfield = audit_pb2.AuditField()
254
- auditfield.field_type = field_type
255
- auditfield.field_id = field_id
256
- if field_type is writer_pb2.FieldType.FILE:
257
- auditfield.filename = message.files[field_id].file.filename
258
- # The field did exist, so we are overwriting it, with a modified file
259
- # in case of a file
260
- auditfield.action = audit_pb2.AuditField.FieldAction.MODIFIED
261
- if field_type is writer_pb2.FieldType.FILE:
262
- auditfield.size = message.files[field_id].file.size
263
-
264
- audit_storage_fields.append(auditfield)
265
-
266
- for fieldid in message.delete_fields or []:
267
- field = await resource.get_field(
268
- fieldid.field, writer_pb2.FieldType.FILE, load=True
269
- )
270
- audit_field = audit_pb2.AuditField()
271
- audit_field.action = audit_pb2.AuditField.FieldAction.DELETED
272
- audit_field.field_id = fieldid.field
273
- audit_field.field_type = fieldid.field_type
274
- if fieldid.field_type is writer_pb2.FieldType.FILE:
275
- val = await field.get_value()
276
- audit_field.size = 0
277
- if val is not None:
278
- audit_field.filename = val.file.filename
279
- audit_storage_fields.append(audit_field)
280
-
281
- return audit_storage_fields
282
-
283
170
  async def handle_message(self, raw_data) -> None:
284
171
  data = self.pubsub.parse(raw_data)
285
172
  notification = writer_pb2.Notification()
@@ -289,27 +176,23 @@ class ResourceWritesAuditHandler:
289
176
  metrics.total_messages.inc({"action": "ignored", "type": "audit_fields"})
290
177
  return
291
178
 
292
- message = notification.message
293
- if message.source == message.MessageSource.PROCESSOR:
179
+ message_audit: writer_pb2.Audit = notification.message_audit
180
+ if message_audit.message_source == writer_pb2.BrokerMessage.MessageSource.PROCESSOR:
294
181
  metrics.total_messages.inc({"action": "ignored", "type": "audit_fields"})
295
182
  return
296
183
 
297
- logger.info(
298
- {"message": "Processing field audit for kbid", "kbid": notification.kbid}
299
- )
184
+ logger.info({"message": "Processing field audit for kbid", "kbid": notification.kbid})
300
185
 
301
186
  metrics.total_messages.inc({"action": "scheduled", "type": "audit_fields"})
302
187
  with metrics.handler_histo({"type": "audit_fields"}):
303
- audit_fields = await self.collect_audit_fields(message)
304
- field_metadata = [fi.field for fi in message.field_metadata]
305
- when = message.audit.when if message.audit.HasField("when") else None
306
- await self.audit.report(
307
- kbid=message.kbid,
188
+ when = message_audit.when if message_audit.HasField("when") else None
189
+ self.audit.report_and_send(
190
+ kbid=message_audit.kbid,
308
191
  when=when,
309
- user=message.audit.user,
310
- rid=message.uuid,
311
- origin=message.audit.origin,
312
- field_metadata=field_metadata,
192
+ user=message_audit.user,
193
+ rid=message_audit.uuid,
194
+ origin=message_audit.origin,
195
+ field_metadata=list(message_audit.field_metadata),
313
196
  audit_type=AUDIT_TYPES.get(notification.write_type),
314
- audit_fields=audit_fields,
197
+ audit_fields=list(message_audit.audit_fields),
315
198
  )