nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,260 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ import os
22
+ from typing import Optional
23
+
24
+ from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxSearcherStub
25
+
26
+ from nucliadb.common.cluster.base import AbstractIndexNode
27
+ from nucliadb.common.cluster.settings import settings
28
+ from nucliadb.ingest.settings import DriverConfig
29
+ from nucliadb.ingest.settings import settings as ingest_settings
30
+ from nucliadb_protos.nodewriter_pb2 import (
31
+ IndexMessage,
32
+ )
33
+ from nucliadb_utils import logger
34
+ from nucliadb_utils.grpc import get_traced_grpc_channel
35
+ from nucliadb_utils.nats import NatsConnectionManager
36
+ from nucliadb_utils.settings import FileBackendConfig, indexing_settings, storage_settings
37
+ from nucliadb_utils.storages.settings import settings as extended_storage_settings
38
+ from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
39
+
40
# nidx support is opt-in: any non-empty NIDX_ENABLED value (including "0" or
# "false") enables it, because only the string's truthiness is checked.
NIDX_ENABLED = bool(os.getenv("NIDX_ENABLED"))
41
+
42
+
43
class NidxUtility:
    """Interface shared by the nidx utilities defined in this module.

    ``initialize``, ``finalize`` and ``index`` raise ``NotImplementedError``
    here and are meant to be overridden. ``wait_for_sync`` is a no-op unless
    a subclass overrides it.
    """

    # Client stubs; they stay ``None`` until a subclass assigns them.
    searcher_client = None
    api_client = None

    async def initialize(self):
        """Prepare the utility for use. Must be overridden."""
        raise NotImplementedError()

    async def finalize(self):
        """Release whatever ``initialize`` acquired. Must be overridden."""
        raise NotImplementedError()

    async def index(self, msg: IndexMessage) -> int:
        """Submit ``msg`` for indexing and return an integer. Must be overridden."""
        raise NotImplementedError()

    def wait_for_sync(self):
        """Do nothing by default."""
        return None
58
+
59
+
60
def _storage_config(prefix: str, bucket: Optional[str]) -> dict[str, str]:
    """Build the ``{prefix}__*`` object-store settings for the configured file backend.

    Args:
        prefix: Prepended (followed by ``__``) to every key of the result.
        bucket: Bucket name to use; when falsy, the backend's configured
            indexing bucket is used instead.

    Returns:
        The settings for the LOCAL, GCS or S3 backend, or an empty dict for
        any other backend.

    Side effects:
        For the LOCAL backend the target directory is created if missing.
    """
    backend = storage_settings.file_backend
    result: dict[str, str] = {}

    def put(name: str, value: str) -> None:
        result[f"{prefix}__{name}"] = value

    if backend == FileBackendConfig.LOCAL:
        local_bucket = bucket or storage_settings.local_indexing_bucket
        directory = f"{storage_settings.local_files}/{local_bucket}"
        os.makedirs(directory, exist_ok=True)

        put("OBJECT_STORE", "file")
        put("FILE_PATH", directory)
        return result

    if backend == FileBackendConfig.GCS:
        gcs_bucket = bucket or extended_storage_settings.gcs_indexing_bucket
        put("OBJECT_STORE", "gcs")
        # Optional GCS settings are emitted only when they have a value.
        if gcs_bucket:
            put("BUCKET", gcs_bucket)
        if storage_settings.gcs_base64_creds:
            put("BASE64_CREDS", storage_settings.gcs_base64_creds)
        if storage_settings.gcs_endpoint_url:
            put("ENDPOINT", storage_settings.gcs_endpoint_url)
        return result

    if backend == FileBackendConfig.S3:
        s3_bucket = bucket or extended_storage_settings.s3_indexing_bucket
        put("OBJECT_STORE", "s3")
        if s3_bucket:
            put("BUCKET", s3_bucket)
        # Credentials and region are always emitted, as "" when unset.
        put("CLIENT_ID", storage_settings.s3_client_id or "")
        put("CLIENT_SECRET", storage_settings.s3_client_secret or "")
        put("REGION_NAME", storage_settings.s3_region_name or "")
        if storage_settings.s3_endpoint:
            put("ENDPOINT", storage_settings.s3_endpoint)

    return result
90
+
91
+
92
class NidxBindingUtility(NidxUtility):
    """Nidx utility backed by the ``nidx_binding`` module."""

    def __init__(self):
        """Assemble the binding configuration.

        Raises:
            ValueError: If the ingest driver is not PG.
        """
        # The driver's PG URL is handed to the binding as its metadata
        # database, so only the PG driver is accepted.
        if ingest_settings.driver != DriverConfig.PG:
            raise ValueError("nidx_binding requires DRIVER=pg")

        binding_config = {"METADATA__DATABASE_URL": ingest_settings.driver_pg_url}
        binding_config.update(_storage_config("INDEXER", None))
        binding_config.update(_storage_config("STORAGE", "nidx"))
        self.config = binding_config

    async def initialize(self):
        """Create the binding and the two gRPC stubs pointing at its local ports."""
        # Imported here so the module is only required when this utility is used.
        import nidx_binding  # type: ignore

        binding = nidx_binding.NidxBinding(self.config)
        self.binding = binding

        api_channel = get_traced_grpc_channel(f"localhost:{binding.api_port}", "nidx_api")
        self.api_client = NidxApiStub(api_channel)

        searcher_channel = get_traced_grpc_channel(
            f"localhost:{binding.searcher_port}", "nidx_searcher"
        )
        self.searcher_client = NidxSearcherStub(searcher_channel)

    async def finalize(self):
        """Drop this object's reference to the binding."""
        del self.binding

    async def index(self, msg: IndexMessage) -> int:
        """Pass the serialized ``msg`` to the binding and return its result."""
        binding = self.binding
        return binding.index(msg.SerializeToString())

    def wait_for_sync(self):
        """Delegate to the binding's ``wait_for_sync``."""
        self.binding.wait_for_sync()
124
+
125
+
126
class NidxServiceUtility(NidxUtility):
    """Nidx utility that connects to nidx running as a network service."""

    def __init__(self):
        """Validate the required settings and prepare the NATS connection manager.

        Raises:
            ValueError: If the nidx index subject is ``None`` or either nidx
                address is missing.
        """
        subject = indexing_settings.index_nidx_subject
        if subject is None:
            raise ValueError("INDEX_NIDX_SUBJECT needed for nidx utility")

        if not (settings.nidx_api_address and settings.nidx_searcher_address):
            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")

        self.nats_connection_manager = NatsConnectionManager(
            service_name="NidxIndexer",
            nats_servers=indexing_settings.index_jetstream_servers,
            nats_creds=indexing_settings.index_jetstream_auth,
        )
        self.subject = subject

    async def initialize(self):
        """Initialize the NATS connection manager, then create both gRPC stubs."""
        await self.nats_connection_manager.initialize()

        api_channel = get_traced_grpc_channel(settings.nidx_api_address, "nidx_api")
        self.api_client = NidxApiStub(api_channel)

        searcher_channel = get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
        self.searcher_client = NidxSearcherStub(searcher_channel)

    async def finalize(self):
        """Finalize the NATS connection manager."""
        await self.nats_connection_manager.finalize()

    async def index(self, writer: IndexMessage) -> int:
        """Publish the serialized message on the configured subject.

        Returns:
            The ``seq`` value of the publish result.
        """
        payload = writer.SerializeToString()
        ack = await self.nats_connection_manager.js.publish(self.subject, payload)
        logger.info(
            f" = Pushed message to nidx shard: {writer.shard}, txid: {writer.txid} seqid: {ack.seq}"  # noqa
        )
        return ack.seq
159
+
160
+
161
async def start_nidx_utility() -> Optional[NidxUtility]:
    """Return the registered nidx utility, creating and registering it if needed.

    Returns:
        ``None`` when ``NIDX_ENABLED`` is falsy. Otherwise the utility that is
        already registered, or a newly initialized one: the binding-based
        utility in standalone mode, the service-based one otherwise.
    """
    if not NIDX_ENABLED:
        return None

    existing = get_nidx()
    if existing:
        return existing

    created: NidxUtility = (
        NidxBindingUtility() if settings.standalone_mode else NidxServiceUtility()
    )
    # Register only after initialization has completed without raising.
    await created.initialize()
    set_utility(Utility.NIDX, created)
    return created
178
+
179
+
180
async def stop_nidx_utility():
    """Unregister the nidx utility, if one is registered, and then finalize it."""
    registered = get_nidx()
    if not registered:
        return
    # The utility is removed from the registry before it is finalized.
    clean_utility(Utility.NIDX)
    await registered.finalize()
185
+
186
+
187
def get_nidx() -> Optional[NidxUtility]:
    """Return whatever is registered under ``Utility.NIDX``."""
    registered = get_utility(Utility.NIDX)
    return registered
189
+
190
+
191
def get_nidx_api_client() -> Optional["NidxApiStub"]:
    """Return the registered nidx utility's ``api_client``, or ``None`` without a utility."""
    utility = get_nidx()
    return utility.api_client if utility else None
197
+
198
+
199
def get_nidx_searcher_client() -> Optional["NidxSearcherStub"]:
    """Return the registered nidx utility's ``searcher_client``, or ``None`` without a utility."""
    utility = get_nidx()
    return utility.searcher_client if utility else None
205
+
206
+
207
+ # TODO: Remove the index node abstraction
208
class NodeNidxAdapter:
    """Expose selected methods of the nidx API and searcher clients on one object.

    Each listed name is looked up once on its client at construction time and
    stored on the adapter under the same name.
    """

    # Names copied from ``api_client``.
    _API_METHODS = (
        "GetShard",
        "NewShard",
        "DeleteShard",
        "ListShards",
        "AddVectorSet",
        "RemoveVectorSet",
        "ListVectorSets",
        "GetMetadata",
    )
    # Names copied from ``searcher_client``.
    _SEARCHER_METHODS = (
        "Search",
        "Suggest",
        "Paragraphs",
        "Documents",
    )

    def __init__(self, api_client, searcher_client):
        sources = (
            (api_client, self._API_METHODS),
            (searcher_client, self._SEARCHER_METHODS),
        )
        for client, method_names in sources:
            for method_name in method_names:
                setattr(self, method_name, getattr(client, method_name))
225
+
226
+
227
class FakeNode(AbstractIndexNode):
    """Index node whose reader and writer are both a ``NodeNidxAdapter``.

    ``id``, ``address`` and ``primary_id`` are all the fixed string
    ``"nidx"``, and the node never reports itself as a read replica.
    """

    def __init__(self, api_client, searcher_client):
        # A single adapter instance serves as both reader and writer.
        self.client = NodeNidxAdapter(api_client, searcher_client)

    @property
    def id(self):
        return "nidx"

    @property
    def primary_id(self):
        return "nidx"

    @property
    def address(self):
        return "nidx"

    @property
    def reader(self):
        return self.client

    @property
    def writer(self):
        return self.client

    def is_read_replica(self):
        """Always ``False``."""
        return False
253
+
254
+
255
def get_nidx_fake_node() -> Optional[FakeNode]:
    """Return a new ``FakeNode`` built from the registered nidx utility's clients.

    Returns ``None`` when no nidx utility is registered.
    """
    utility = get_nidx()
    if not utility:
        return None
    return FakeNode(utility.api_client, utility.searcher_client)
@@ -17,6 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import json
20
21
  from datetime import datetime
21
22
  from typing import AsyncGenerator, Union
22
23
 
@@ -53,25 +54,38 @@ class ExportImportDataManager:
53
54
 
54
55
  async def get_metadata(self, type: str, kbid: str, id: str) -> Metadata:
55
56
  key = self._get_maindb_metadata_key(type, kbid, id)
56
- async with self.driver.transaction() as txn:
57
+ async with self.driver.transaction(read_only=True) as txn:
57
58
  data = await txn.get(key)
58
59
  if data is None or data == b"":
59
60
  raise MetadataNotFound()
60
61
  decoded = data.decode("utf-8")
61
- model_type = {
62
- "export": ExportMetadata,
63
- "import": ImportMetadata,
64
- }[type]
65
- return model_type.parse_raw(decoded) # type: ignore
62
+ if type == "export":
63
+ model_type = ExportMetadata
64
+ elif type == "import":
65
+ model_type = ImportMetadata # type: ignore
66
+ else:
67
+ raise ValueError(f"Invalid type: {type}")
68
+ json_decoded = json.loads(decoded)
69
+
70
+ # For some reason, the total and processed fields are not always present in the metadata.
71
+ # This is to unblock already created exports that hit this bug.
72
+ if json_decoded.get("total") is None:
73
+ json_decoded["total"] = 0
74
+ if json_decoded.get("processed") is None:
75
+ json_decoded["processed"] = 0
76
+
77
+ return model_type.model_validate(json_decoded)
66
78
 
67
79
  async def set_metadata(
68
80
  self,
69
81
  type: str,
70
82
  metadata: Metadata,
71
83
  ):
84
+ metadata.processed = metadata.processed or 0
85
+ metadata.total = metadata.total or 0
72
86
  metadata.modified = datetime.utcnow()
73
87
  key = self._get_maindb_metadata_key(type, metadata.kbid, metadata.id)
74
- data = metadata.json().encode("utf-8")
88
+ data = metadata.model_dump_json().encode("utf-8")
75
89
  async with self.driver.transaction() as txn:
76
90
  await txn.set(key, data)
77
91
  await txn.commit()
@@ -97,9 +111,7 @@ class ExportImportDataManager:
97
111
  await self.storage.uploaditerator(export_bytes, field, cf)
98
112
  return cf.size
99
113
 
100
- async def download_export(
101
- self, kbid: str, export_id: str
102
- ) -> AsyncGenerator[bytes, None]:
114
+ async def download_export(self, kbid: str, export_id: str) -> AsyncGenerator[bytes, None]:
103
115
  key = STORAGE_EXPORT_KEY.format(export_id=export_id)
104
116
  bucket = self.storage.get_bucket_name(kbid)
105
117
  async for chunk in self.storage.download(bucket, key):
@@ -125,13 +137,9 @@ class ExportImportDataManager:
125
137
  async for chunk in self.storage.download(bucket, key):
126
138
  yield chunk
127
139
 
128
- def _get_storage_field(
129
- self, kbid: str, key: str, cf: resources_pb2.CloudFile
130
- ) -> StorageField:
140
+ def _get_storage_field(self, kbid: str, key: str, cf: resources_pb2.CloudFile) -> StorageField:
131
141
  bucket = self.storage.get_bucket_name(kbid)
132
- return self.storage.field_klass(
133
- storage=self.storage, bucket=bucket, fullkey=key, field=cf
134
- )
142
+ return self.storage.field_klass(storage=self.storage, bucket=bucket, fullkey=key, field=cf)
135
143
 
136
144
  async def delete_import(self, kbid: str, import_id: str):
137
145
  key = STORAGE_IMPORT_KEY.format(import_id=import_id)
@@ -151,6 +159,4 @@ class ExportImportDataManager:
151
159
  await func(kbid, id)
152
160
  except Exception as ex:
153
161
  errors.capture_exception(ex)
154
- logger.exception(
155
- f"Could not delete {type} {id} from storage", extra={"kbid": kbid}
156
- )
162
+ logger.exception(f"Could not delete {type} {id} from storage", extra={"kbid": kbid})
@@ -70,9 +70,7 @@ async def export_kb(
70
70
  yield chunk
71
71
 
72
72
 
73
- async def export_kb_to_blob_storage(
74
- context: ApplicationContext, msg: NatsTaskMessage
75
- ) -> None:
73
+ async def export_kb_to_blob_storage(context: ApplicationContext, msg: NatsTaskMessage) -> None:
76
74
  """
77
75
  Exports the data of a knowledgebox to the blob storage service.
78
76
  """
@@ -90,7 +88,7 @@ async def export_kb_to_blob_storage(
90
88
  export_size = await upload_export_retried(iterator, kbid, export_id)
91
89
 
92
90
  # Store export size
93
- metadata.total = metadata.processed = export_size
91
+ metadata.total = metadata.processed = export_size or 0
94
92
  await dm.set_metadata("export", metadata)
95
93
 
96
94
 
@@ -107,9 +105,7 @@ async def export_resources(
107
105
  yield chunk
108
106
 
109
107
 
110
- async def export_resources_resumable(
111
- context, metadata: ExportMetadata
112
- ) -> AsyncGenerator[bytes, None]:
108
+ async def export_resources_resumable(context, metadata: ExportMetadata) -> AsyncGenerator[bytes, None]:
113
109
  dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
114
110
 
115
111
  kbid = metadata.kbid
@@ -200,11 +196,9 @@ async def export_learning_config(
200
196
  ) -> AsyncGenerator[bytes, None]:
201
197
  lconfig = await get_learning_config(kbid)
202
198
  if lconfig is None:
203
- logger.warning(
204
- f"No learning configuration found for kbid", extra={"kbid": kbid}
205
- )
199
+ logger.warning(f"No learning configuration found for kbid", extra={"kbid": kbid})
206
200
  return
207
- data = lconfig.json().encode("utf-8")
201
+ data = lconfig.model_dump_json().encode("utf-8")
208
202
  yield ExportedItemType.LEARNING_CONFIG.encode("utf-8")
209
203
  yield len(data).to_bytes(4, byteorder="big")
210
204
  yield data
@@ -89,27 +89,25 @@ async def import_kb(
89
89
  await dm.set_metadata("import", metadata)
90
90
 
91
91
  if metadata is not None:
92
- metadata.processed = stream_reader.read_bytes
92
+ metadata.processed = stream_reader.read_bytes or 0
93
93
  await dm.set_metadata("import", metadata)
94
94
 
95
95
 
96
- async def import_kb_from_blob_storage(
97
- context: ApplicationContext, msg: NatsTaskMessage
98
- ):
96
+ async def import_kb_from_blob_storage(context: ApplicationContext, msg: NatsTaskMessage):
99
97
  """
100
98
  Imports to a knowledgebox from an export stored in the blob storage service.
101
99
  """
102
100
  kbid, import_id = msg.kbid, msg.id
103
101
  dm = ExportImportDataManager(context.kv_driver, context.blob_storage)
104
102
  metadata = await dm.get_metadata(type="import", kbid=kbid, id=import_id)
105
- stream = dm.download_import(kbid, import_id)
106
103
 
107
104
  retry_handler = TaskRetryHandler("import", dm, metadata)
108
105
 
109
106
  @retry_handler.wrap
110
- async def import_kb_retried(context, kbid, stream, metadata):
107
+ async def import_kb_retried(context, kbid, metadata):
108
+ stream = dm.download_import(kbid, import_id)
111
109
  await import_kb(context, kbid, stream, metadata)
112
110
 
113
- await import_kb_retried(context, kbid, stream, metadata) # type: ignore
111
+ await import_kb_retried(context, kbid, metadata)
114
112
 
115
113
  await dm.try_delete_from_storage("import", kbid, import_id)
@@ -17,7 +17,7 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from datetime import datetime
20
+ import datetime
21
21
  from enum import Enum
22
22
  from typing import Any
23
23
 
@@ -57,8 +57,8 @@ class Metadata(BaseModel):
57
57
  task: TaskMetadata = TaskMetadata(status=Status.SCHEDULED)
58
58
  total: int = 0
59
59
  processed: int = 0
60
- created: datetime = datetime.utcnow()
61
- modified: datetime = datetime.utcnow()
60
+ created: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
61
+ modified: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
62
62
 
63
63
 
64
64
  class ExportMetadata(Metadata):
@@ -32,7 +32,7 @@ def get_exports_consumer() -> NatsTaskConsumer:
32
32
  name="exports_consumer",
33
33
  stream=const.Streams.KB_EXPORTS, # type: ignore
34
34
  callback=export_kb_to_blob_storage, # type: ignore
35
- msg_type=NatsTaskMessage, # type: ignore
35
+ msg_type=NatsTaskMessage,
36
36
  max_concurrent_messages=10,
37
37
  )
38
38
 
@@ -41,7 +41,7 @@ async def get_exports_producer(context: ApplicationContext) -> NatsTaskProducer:
41
41
  producer = create_producer(
42
42
  name="exports_producer",
43
43
  stream=const.Streams.KB_EXPORTS, # type: ignore
44
- msg_type=NatsTaskMessage, # type: ignore
44
+ msg_type=NatsTaskMessage,
45
45
  )
46
46
  await producer.initialize(context)
47
47
  return producer
@@ -52,7 +52,7 @@ def get_imports_consumer() -> NatsTaskConsumer:
52
52
  name="imports_consumer",
53
53
  stream=const.Streams.KB_IMPORTS, # type: ignore
54
54
  callback=import_kb_from_blob_storage, # type: ignore
55
- msg_type=NatsTaskMessage, # type: ignore
55
+ msg_type=NatsTaskMessage,
56
56
  max_concurrent_messages=10,
57
57
  )
58
58
 
@@ -61,7 +61,7 @@ async def get_imports_producer(context: ApplicationContext) -> NatsTaskProducer:
61
61
  producer = create_producer(
62
62
  name="imports_producer",
63
63
  stream=const.Streams.KB_IMPORTS, # type: ignore
64
- msg_type=NatsTaskMessage, # type: ignore
64
+ msg_type=NatsTaskMessage,
65
65
  )
66
66
  await producer.initialize(context)
67
67
  return producer
@@ -20,7 +20,6 @@
20
20
  import functools
21
21
  from typing import AsyncGenerator, AsyncIterator, Callable, Optional
22
22
 
23
- import nats.errors
24
23
  from google.protobuf.message import DecodeError as ProtobufDecodeError
25
24
 
26
25
  from nucliadb import learning_proxy
@@ -34,10 +33,12 @@ from nucliadb.export_import.exceptions import (
34
33
  WrongExportStreamFormat,
35
34
  )
36
35
  from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
36
+ from nucliadb.ingest.orm.broker_message import generate_broker_message
37
37
  from nucliadb_models.export_import import Status
38
38
  from nucliadb_protos import knowledgebox_pb2 as kb_pb2
39
39
  from nucliadb_protos import resources_pb2, writer_pb2
40
40
  from nucliadb_utils.const import Streams
41
+ from nucliadb_utils.transaction import MaxTransactionSizeExceededError
41
42
 
42
43
  BinaryStream = AsyncGenerator[bytes, None]
43
44
  BinaryStreamGenerator = Callable[[int], BinaryStream]
@@ -59,9 +60,6 @@ WRITER_BM_FIELDS = [
59
60
  "files",
60
61
  "texts",
61
62
  "conversations",
62
- "layouts",
63
- "keywordsets",
64
- "datetimes",
65
63
  ]
66
64
 
67
65
 
@@ -88,7 +86,7 @@ async def transaction_commit(
88
86
  wait=False,
89
87
  target_subject=Streams.INGEST_PROCESSED.subject,
90
88
  )
91
- except nats.errors.MaxPayloadError:
89
+ except MaxTransactionSizeExceededError:
92
90
  stored_key = await context.blob_storage.set_stream_message(
93
91
  kbid=bm.kbid, rid=bm.uuid, data=bm.SerializeToString()
94
92
  )
@@ -151,23 +149,17 @@ async def set_entities_groups(
151
149
  context: ApplicationContext, kbid: str, entities_groups: kb_pb2.EntitiesGroups
152
150
  ) -> None:
153
151
  async with datamanagers.with_transaction() as txn:
154
- await datamanagers.entities.set_entities_groups(
155
- txn, kbid=kbid, entities_groups=entities_groups
156
- )
152
+ await datamanagers.entities.set_entities_groups(txn, kbid=kbid, entities_groups=entities_groups)
157
153
  await txn.commit()
158
154
 
159
155
 
160
- async def set_labels(
161
- context: ApplicationContext, kbid: str, labels: kb_pb2.Labels
162
- ) -> None:
156
+ async def set_labels(context: ApplicationContext, kbid: str, labels: kb_pb2.Labels) -> None:
163
157
  async with datamanagers.with_transaction() as txn:
164
158
  await datamanagers.labels.set_labels(txn, kbid=kbid, labels=labels)
165
159
  await txn.commit()
166
160
 
167
161
 
168
- async def iter_kb_resource_uuids(
169
- context: ApplicationContext, kbid: str
170
- ) -> AsyncGenerator[str, None]:
162
+ async def iter_kb_resource_uuids(context: ApplicationContext, kbid: str) -> AsyncGenerator[str, None]:
171
163
  async for rid in datamanagers.resources.iterate_resource_ids(kbid=kbid):
172
164
  yield rid
173
165
 
@@ -175,8 +167,13 @@ async def iter_kb_resource_uuids(
175
167
  async def get_broker_message(
176
168
  context: ApplicationContext, kbid: str, rid: str
177
169
  ) -> Optional[writer_pb2.BrokerMessage]:
178
- async with datamanagers.with_transaction() as txn:
179
- return await datamanagers.resources.get_broker_message(txn, kbid=kbid, rid=rid)
170
+ async with datamanagers.with_ro_transaction() as txn:
171
+ resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
172
+ if resource is None:
173
+ return None
174
+ resource.disable_vectors = False
175
+ resource.txn = txn
176
+ return await generate_broker_message(resource)
180
177
 
181
178
 
182
179
  def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFile]:
@@ -184,6 +181,10 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
184
181
  binaries: list[resources_pb2.CloudFile] = []
185
182
  for file_field in bm.files.values():
186
183
  if file_field.HasField("file"):
184
+ if file_field.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
185
+ # Binaries of externally hosted files are not
186
+ # to be downloaded and stored in the export file
187
+ continue
187
188
  _clone_collect_cf(binaries, file_field.file)
188
189
 
189
190
  for conversation in bm.conversations.values():
@@ -191,11 +192,6 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
191
192
  for attachment in message.content.attachments:
192
193
  _clone_collect_cf(binaries, attachment)
193
194
 
194
- for layout in bm.layouts.values():
195
- for block in layout.body.blocks.values():
196
- if block.HasField("file"):
197
- _clone_collect_cf(binaries, block.file)
198
-
199
195
  for field_extracted_data in bm.file_extracted_data:
200
196
  if field_extracted_data.HasField("file_thumbnail"):
201
197
  _clone_collect_cf(binaries, field_extracted_data.file_thumbnail)
@@ -227,9 +223,7 @@ def get_cloud_files(bm: writer_pb2.BrokerMessage) -> list[resources_pb2.CloudFil
227
223
  return binaries
228
224
 
229
225
 
230
- def _clone_collect_cf(
231
- binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile
232
- ):
226
+ def _clone_collect_cf(binaries: list[resources_pb2.CloudFile], origin: resources_pb2.CloudFile):
233
227
  cf = resources_pb2.CloudFile()
234
228
  cf.CopyFrom(origin)
235
229
  # Mark the cloud file of the broker message being exported as export source
@@ -246,12 +240,12 @@ async def download_binary(
246
240
 
247
241
 
248
242
  async def get_entities(context: ApplicationContext, kbid: str) -> kb_pb2.EntitiesGroups:
249
- async with datamanagers.with_transaction() as txn:
243
+ async with datamanagers.with_ro_transaction() as txn:
250
244
  return await datamanagers.entities.get_entities_groups(txn, kbid=kbid)
251
245
 
252
246
 
253
247
  async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
254
- async with datamanagers.with_transaction() as txn:
248
+ async with datamanagers.with_ro_transaction() as txn:
255
249
  return await datamanagers.labels.get_labels(txn, kbid=kbid)
256
250
 
257
251
 
@@ -398,7 +392,7 @@ class ExportStreamReader:
398
392
  # Backward compatible code for old exports that don't have a learning config.
399
393
  return None, type_bytes + self.stream.buffer
400
394
  data = await self.read_item()
401
- lconfig = learning_proxy.LearningConfiguration.parse_raw(data)
395
+ lconfig = learning_proxy.LearningConfiguration.model_validate_json(data)
402
396
  return lconfig, self.stream.buffer
403
397
 
404
398
  async def iter_items(self) -> AsyncGenerator[ExportItem, None]:
@@ -411,7 +405,7 @@ class ExportStreamReader:
411
405
  ExportedItemType.ENTITIES: self.read_entities,
412
406
  ExportedItemType.LABELS: self.read_labels,
413
407
  }[item_type]
414
- data = await read_data_func() # type: ignore
408
+ data = await read_data_func()
415
409
  yield item_type, data
416
410
  except ExportStreamExhausted:
417
411
  break
@@ -506,21 +500,15 @@ def stream_compatible_with_kb(
506
500
  return wrapped()
507
501
 
508
502
 
509
- async def _check_semantic_model_compatibility(
510
- kbid: str, stream: AsyncGenerator[bytes, None]
511
- ) -> bytes:
503
+ async def _check_semantic_model_compatibility(kbid: str, stream: AsyncGenerator[bytes, None]) -> bytes:
512
504
  stream_reader = ExportStreamReader(stream)
513
505
  lconfig, leftover_bytes = await stream_reader.maybe_read_learning_config()
514
506
  if lconfig is None:
515
- logger.warning(
516
- "Learning config not found on the export stream. Export may be incompatible."
517
- )
507
+ logger.warning("Learning config not found on the export stream. Export may be incompatible.")
518
508
  return leftover_bytes
519
509
  kb_lconfig = await get_learning_config(kbid)
520
510
  if kb_lconfig is None:
521
- logger.warning(
522
- "No learning config found on the knowledge box. Export may be incompatible."
523
- )
511
+ logger.warning("No learning config found on the knowledge box. Export may be incompatible.")
524
512
  return leftover_bytes
525
513
  if kb_lconfig.semantic_model == lconfig.semantic_model:
526
514
  logger.info(f"Semantic model match: {kb_lconfig.semantic_model}")
nucliadb/health.py CHANGED
@@ -78,9 +78,7 @@ async def grpc_health_check(health_servicer) -> None:
78
78
  for check in _health_checks:
79
79
  if not check():
80
80
  logger.info(f"Health check failed on {check.__name__}")
81
- await health_servicer.set(
82
- "", health_pb2.HealthCheckResponse.NOT_SERVING
83
- )
81
+ await health_servicer.set("", health_pb2.HealthCheckResponse.NOT_SERVING)
84
82
  break
85
83
  else:
86
84
  await health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)