nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,691 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import base64
21
- import uuid
22
- from datetime import datetime
23
- from os.path import dirname, getsize
24
- from unittest.mock import patch
25
- from uuid import uuid4
26
-
27
- import nats
28
- import pytest
29
- from nats.aio.client import Client
30
- from nats.js import JetStreamContext
31
- from nucliadb_protos.audit_pb2 import AuditField, AuditRequest
32
- from nucliadb_protos.resources_pb2 import (
33
- TEXT,
34
- Answers,
35
- Classification,
36
- CloudFile,
37
- Entity,
38
- ExtractedTextWrapper,
39
- ExtractedVectorsWrapper,
40
- FieldComputedMetadataWrapper,
41
- FieldID,
42
- FieldQuestionAnswerWrapper,
43
- FieldType,
44
- FileExtractedData,
45
- LargeComputedMetadataWrapper,
46
- )
47
- from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
48
- from nucliadb_protos.resources_pb2 import Origin, Paragraph, QuestionAnswer
49
- from nucliadb_protos.utils_pb2 import Vector
50
- from nucliadb_protos.writer_pb2 import BrokerMessage
51
-
52
- from nucliadb.common import datamanagers
53
- from nucliadb.ingest import SERVICE_NAME
54
- from nucliadb.ingest.consumer.auditing import (
55
- IndexAuditHandler,
56
- ResourceWritesAuditHandler,
57
- )
58
- from nucliadb.ingest.orm.exceptions import DeadletteredError
59
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
60
- from nucliadb.ingest.orm.processor import Processor
61
- from nucliadb.ingest.orm.resource import Resource
62
- from nucliadb_utils.audit.stream import StreamAuditStorage
63
- from nucliadb_utils.storages.storage import Storage
64
- from nucliadb_utils.utilities import Utility, get_indexing, get_storage, set_utility
65
-
66
- EXAMPLE_VECTOR = base64.b64decode(
67
- "k05VTVBZAQB2AHsnZGVzY3InOiAnPGY0JywgJ2ZvcnRyYW5fb3JkZXInOiBGYWxzZSwgJ3NoYXBlJzogKDc2OCwpLCB9ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogKIq+lgYUvvf7bTx39oo+fFaXPbx2a71RfiK/FBroPVTawr4UEm2+AgtuvWzTTT5ipjg9JflLvMdfub6fBIE+d7gmvnJPl75alUQ+n68Hv2BYJL20+74+bHDsPV/wTT63h4E9hes9PqxHXT6h/J079HfVPF/7BD3BSMO+PuA9PjJhtD4W6pq+rmwjPp+Fzz6xUfa+FMZtOutIBT4mPik+EbsAPyRePb41IVW+i+0RPT7GtL51USY+GRjRvjWD1z4+wq+9j9kqvmq/074hHBM+kh+ZPoRfmb6R0yi/kuirvlcqLj+Ss64+0cMBP2UKsD2LtpI9927BvtCfHb5KY7U+8s64vkcGX778+NY+2pMxPNowJD7R39u+dbmfPqbrL73bIby+Nbu8voH3kr4gps6+f3L6PuJFAb3PFWA+99BPPjkLzD0vc8m79JmtvWYnbL6W+6A+WUWEveVVED0V0h8+3zPWvv19Dr2igdC9JcGRPV568z41ZVu8mRxRvdkBQr73JHO+PFxkvtHatLzVgN49NEgav0l7ab276hK+ABMDvrRrJj4akbO++zFnPRzXoDyecdi+pGq4viUgiL4XXwK+tvcOPivvgD7PV0w+D7CwPmfoiL0REec+tsx1Pe2xkD6S9Jm+ZW09P1Obiz2Ov/Q+OtsBP8Xicj7WJpi9szGJvqvWvz4hFqG++ZuGvIAmMb0r+T2+wj9RPgZN0z7KwGI+ezogPgI78D6aUrW8etzkPHpSqb7c4Sg+b6BZvXlSrr6un6a8uUCrvhbBgb7PtwA+CsSwvQzyz73G1eq+plYZP/6I7r6BRsu992/gPuIBJj9aT8u94saNvdIDG76Zar4+GeRxvncSZz3citO+ILq6vmS3D78JHk6/NdeIPWYQwb0WZJW9OnwJPhdIQL7Gta6+MZWevpRNvr0ZH/c9B//hPtNUlL1pWhu/VliNvshFjT6laVS9EpjovQBHdb4HWMe+e/rfvrcSDz620/I+krapPlnIDz5uR1Y+znjqPTFM+T1+kK8+VMcevDegSjvM7fw+e0yKPbDoVz56wk4+EeoGvnq3rT76dbW+ghE6vvos0b6CqQu+p6JDPvzn2bogOui+oZU5v6/Pvr4siDI9Kv6Dvt6TQj51LqW+qYLsPmyjZT45DkG9MQivPgIHBT/qeRW/ghXOPkcJtL6MwhA/9F5PvbR7Jr4ftKA+mdkePwm2A77WpNU+Ho/NvsWEfL75zPS9v8ycvtXFVD5ONFI7mVkOPlFd7bzacZK7aSyRPkRrhz6e8+k+glJ5Pq9mmD0X95y+APOjPveVBb9yOgM+DLlMPkqCRb7CKwW8N+TevpZtmD5lbpq+n+tdPr4+m7661Wg+gd66ve5dzr2ZH7k+x/aNPo+0Kz4PMMa+voMGv+ud8r4Nape892YZPWlDL76twQi/RC2QPk8juz1uTwC9yf3rvn8RmD5LO0e+7t5CvvYTbb5O8UA/yrZqO7aZib6FBEe+n/xAv08BGr15Vxs/FNIevkbN1r3f2Hw+oj18PJwOnb3SDpo+wf67vvy3sj6qvRM/BrljPtrlBr4w2Ck9Jh6fPv6Vn75qa7w+eWShvj6bYj56q46+x41KPvQtqb2qXVm+DTmTPpvXWz5hUnC9f7ptPAu1tsDAcUa+ckyGvTeaIz3FcaC9Zu/cvYvjzD7WUdQ+P2DFvrbdHz4CfVe+HxwAP3HZy775Q7w+eg+svcccuTwLBFW+QkVhPuSSjLymH6g+DFBKviDgWz0wxyK+1C+3PSKk975Hkxi8FKzVvRnykD5lFCa/bqnBPRACHL5uUS+/Zb3FvoK66j5CHUu+vq4TvkxWfT5wv7o+wW79PJHsrD42Aau+SuQFvdzUnz50dEe+qZNjvmZ1LLxvt529oeHDPsv3dT5O+z69vOoevm/1Cz5O7NU9i6uHPibkEr6g5d2+LobFPn+KAT/gLsY+2jm4vlpyhD48l4g+yqx3Pql7yr5sIYK+7awLPlnODb+3e7i+t9RVPQC99z6SQJk+lbXoPbyAI7mKcCu/4kX9vFuhtL61fhq+UjGgvYxSvDzCzfw+24xfvs+Sjr782jy+kTzaPmEqtD6sN2c9otXavSqTiT5hM/q+MjAFP4kflT5JOe280NUmvrQtkT4f55m9CyFwPr8GF7wNzBm+x05SvsFJtz0MG9w+HCf/vn4mkr7iMiw+DmhCPUDI/j3PrVe+glX3vlpDPz8ucKG+MexCPgoBDD+FMn68BMDnvCf+UT3bgq4+srqvvYF8H7+1VKq9qbQTvY1tBL8epwC85PUdviSEhD7hg7e9jMUzvVuFz71qCf29IudEPsAwH767q809fL0uvrk+Mr7OTVy9TNcWvhnV3T4hOwq/F/E3P4UOXz4Vade+fK8TP5v4sr4Amf8+HCqPvmYV7Lo3UMK+0urYPrSH3zw/8oq9tAHCvvs5GD91e6w9GsqJPNRo7j5ffH6+X++MvKFQxj17Es6+TA5OvW8tAz8C4nU8tiHDvm5FZL5Kv3O+fuZ0Php/9j0Gyua+mSKVvs+pDT8+TwC/qS8Gvl/z0r5iVLq+a8e/vIXlIT4r7Ty+dqrXPmn9Db4p4PS+Kv0nPfnVUz7avj0+KVOTPkG3Kz68dQa+LSKGvXnRvjxnzyM92moTvy9SnD4F9Dw+mWoyvXpXRD8nm7I9O245P6KlZT4zCxc+baKLPsyE0rw8YHO+coGePfcAYT300Jw+UoeUvlvHFD7CjpC+p9KpvlteKrvgzwG8Sbg2vn8NDz5MDtW99URGvoaaxb0svk+9+cajvUvAab1qXpS91FSbvszYlj6f9oI+Ge5yPDdVxr45qV2+WmuxPcx+qj5l88W9ApSIvsFrwT4GT8c+Vg/0PjkNT745ezC/9ogqPm7bE7/Wh1O9b7NrvlVU/j4u3ga9mv+xvaHTtD76O40+LIyTvssUDT73Q5y+QO5TvX7bgj3gY5S+YTSfvpYeIL6a+Y29CLmZvda6xz6cC9Q+9sQSPwnG+j3RS927zvaAvq7iLz0CqPw9Fir+vNr7VT5qEgM+yhqtupy5q76uVtE7eZ+Nvi/7h75rkLq9vOW0O7QhFj7JCbc+3tp7vlpEOT9+aPc+hwnnvkqLPr0Ry/4+8zOaPfE0O70OJ6I9eQlJvbAU/T0KcaK8gS2Kvulxdj0u2JY+u4mxPN4vXj7B6xQ+LjBLvuTgJ77vq7M7KbcIvnbIdD0UQd++ZyuHvlaAPr4SeMw++sRuvZ7sXz3yJ5O9cSmPvZ8mRL7X2JM9trN4PpzLt70C3Og94uwLv4pACb8LWoY9Uz+ZPvE1Ij4R8HG9JVyJvvFOZz6XkIU+had5PvoQKT7h3CK+IzATv1U3qrxUum68B1bDviBzhz7u5XI9KXwkPoszXr6en5I9VNxMPAKusT5XGTg8Ne9GvC6yBz/EidM+V8T8u3LO1D7qSJa+AlsUPeb9pb0vNFK+lFCevTGrR70aeSu+zihyvOLan77CaxE/5ZnaPUv8Nr/hBhs+oCZBPttGqr5ZrwO+O0DGPU7JOD7FxdK+pw6CPWumgz6VB7++Gjb1vq6Ns7uZ1FI9VmTLPsl2iz7h5YI8CJYXvh6MSz6ucvc9qx1bPovgpT7ZWyO+Z+d1vrXkrz3VC8s+dmievuxuHb7MOXE+ewUCvJcPuT6n2Rc8mQyYvl45Gr1ER3c9LCZYvmqQhb1lVJu+V1acPZp63z5Cfmu+4NFZPvmBJb6cmAI+J0U7PsLkSb16KrO9wj4JPo4Fq7563+09jAw8vkYbbD7/Z5q7TH1kvnJrLb1mqkS+R+a9vX0ODD4p9ak+un8VO6mSp71C66w+FlLVPr/0Wb0eLR2+AneHvVTFHD/P0X0+TsQ4vlWQQzzP8no6VtEOPHLiG78Foyg+Un5OP/fFeL3uVxc+C1VzP9IInL2Zbbo8bw2Lvt5f0b4LY9w9LyaMvIcBc70K3bs+9lz5vTSTC7770MG+B4dHvvRFSz3lO6w9ENACv5NLBz20vSk+MuMQPLQYZr/2+6o+gzANvXGTjL259Qy9ZUMKPnyCC7498ww8oGGSvouNujyvJVW+TjmIvvI8KT667mq9MC6fvVUcvz0=" # noqa
68
- )
69
-
70
-
71
- @pytest.fixture(autouse=True)
72
- async def audit_consumers(
73
- maindb_driver, storage, pubsub, stream_audit: StreamAuditStorage
74
- ):
75
- index_auditor = IndexAuditHandler(
76
- driver=maindb_driver,
77
- audit=stream_audit,
78
- pubsub=pubsub,
79
- )
80
- resource_writes_auditor = ResourceWritesAuditHandler(
81
- driver=maindb_driver,
82
- storage=storage,
83
- audit=stream_audit,
84
- pubsub=pubsub,
85
- )
86
-
87
- await index_auditor.initialize()
88
- await resource_writes_auditor.initialize()
89
- yield
90
- await index_auditor.finalize()
91
- await resource_writes_auditor.finalize()
92
-
93
-
94
- @pytest.fixture()
95
- def kbid(
96
- local_files,
97
- storage: Storage,
98
- txn,
99
- cache,
100
- fake_node,
101
- processor,
102
- knowledgebox_ingest,
103
- ):
104
- yield knowledgebox_ingest
105
-
106
-
107
- @pytest.mark.asyncio
108
- async def test_ingest_messages_autocommit(kbid: str, processor):
109
- rid = str(uuid.uuid4())
110
- message1: BrokerMessage = BrokerMessage(
111
- kbid=kbid,
112
- uuid=rid,
113
- slug="slug1",
114
- type=BrokerMessage.AUTOCOMMIT,
115
- )
116
- filename = f"{dirname(__file__)}/assets/file.png"
117
- cf1 = CloudFile(
118
- uri="file.png",
119
- source=CloudFile.Source.LOCAL,
120
- bucket_name="/integration/ingest/assets",
121
- size=getsize(filename),
122
- content_type="image/png",
123
- filename="file.png",
124
- )
125
- message1.basic.icon = "text/plain"
126
- message1.basic.title = "Title Resource"
127
- message1.basic.summary = "Summary of Document"
128
- message1.basic.thumbnail = "doc"
129
- message1.basic.layout = "default"
130
- message1.basic.metadata.language = "es"
131
- message1.basic.created.FromDatetime(datetime.now())
132
- message1.basic.modified.FromDatetime(datetime.now())
133
- message1.origin.source = Origin.Source.WEB
134
- message1.files["file"].file.CopyFrom(cf1)
135
-
136
- fed = FileExtractedData()
137
- fed.file_pages_previews.pages.append(cf1)
138
- fed.language = "ca"
139
- fed.md5 = "asdsadsad"
140
- fed.metadata["key1"] = "ca"
141
- fed.nested["key2"] = "ca"
142
- fed.file_generated["subfile1"].CopyFrom(cf1)
143
- fed.file_preview.CopyFrom(cf1)
144
- fed.file_thumbnail.CopyFrom(cf1)
145
- message1.file_extracted_data.append(fed)
146
-
147
- etw = ExtractedTextWrapper()
148
- etw.body.text = "My own text"
149
- etw.field.field = "file"
150
- etw.field.field_type = FieldType.FILE
151
- message1.extracted_text.append(etw)
152
- etw = ExtractedTextWrapper()
153
- etw.body.text = "My summary"
154
- etw.field.field = "summary"
155
- etw.field.field_type = FieldType.GENERIC
156
- message1.extracted_text.append(etw)
157
-
158
- fcm = FieldComputedMetadataWrapper()
159
- fcm.field.field = "file"
160
- fcm.field.field_type = FieldType.FILE
161
- p1 = Paragraph(
162
- start=1,
163
- end=20,
164
- )
165
- fcm.metadata.metadata.paragraphs.append(p1)
166
- fcm.metadata.metadata.last_index.FromDatetime(datetime.now())
167
- fcm.metadata.metadata.last_understanding.FromDatetime(datetime.now())
168
- fcm.metadata.metadata.last_extract.FromDatetime(datetime.now())
169
- fcm.metadata.metadata.ner["Ramon"] = "PERSON"
170
-
171
- c1 = Classification()
172
- c1.label = "label1"
173
- c1.labelset = "labelset1"
174
- fcm.metadata.metadata.classifications.append(c1)
175
- message1.field_metadata.append(fcm)
176
-
177
- lcmw = LargeComputedMetadataWrapper()
178
- lcmw.field.field = "file"
179
- lcmw.field.field_type = FieldType.FILE
180
- lcmw.real.metadata.tokens["asd"] = 4
181
- lcmw.real.metadata.entities.append(Entity(token="token", root="tok", type="PERSON"))
182
- message1.field_large_metadata.append(lcmw)
183
-
184
- ev = ExtractedVectorsWrapper()
185
- ev.field.field = "file"
186
- ev.field.field_type = FieldType.FILE
187
- v1 = Vector(
188
- start=1, end=10, start_paragraph=1, end_paragraph=20, vector=EXAMPLE_VECTOR
189
- )
190
- ev.vectors.vectors.vectors.append(v1)
191
- message1.field_vectors.append(ev)
192
-
193
- message1.source = BrokerMessage.MessageSource.WRITER
194
- await processor.process(message=message1, seqid=1)
195
-
196
- index = get_indexing()
197
- storage = await get_storage(service_name=SERVICE_NAME)
198
-
199
- pb = await storage.get_indexing(index._calls[0][1])
200
- assert pb.texts["a/summary"].text == "My summary" # type: ignore
201
-
202
- pb = await storage.get_indexing(index._calls[1][1])
203
- assert pb.texts["a/summary"].text == "My summary" # type: ignore
204
-
205
-
206
- @pytest.mark.asyncio
207
- async def test_ingest_error_message(
208
- kbid: str, storage: Storage, processor, maindb_driver
209
- ):
210
- filename = f"{dirname(__file__)}/assets/resource.pb"
211
- with open(filename, "r") as f:
212
- data = base64.b64decode(f.read())
213
- message0: BrokerMessage = BrokerMessage()
214
- message0.ParseFromString(data)
215
- message0.kbid = kbid
216
- message0.source = BrokerMessage.MessageSource.WRITER
217
-
218
- await processor.process(message=message0, seqid=1)
219
-
220
- filename = f"{dirname(__file__)}/assets/error.pb"
221
- with open(filename, "r") as f:
222
- data = base64.b64decode(f.read())
223
- message1: BrokerMessage = BrokerMessage()
224
- message1.ParseFromString(data)
225
- message1.kbid = kbid
226
- message1.ClearField("field_vectors")
227
- message1.source = BrokerMessage.MessageSource.WRITER
228
-
229
- await processor.process(message=message1, seqid=2)
230
-
231
- async with maindb_driver.transaction() as txn:
232
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
233
- r = await kb_obj.get(message1.uuid)
234
- assert r is not None
235
- field_obj = await r.get_field("wikipedia_ml", TEXT)
236
- ext1 = await field_obj.get_extracted_text()
237
- lfm1 = await field_obj.get_large_field_metadata()
238
- fm1 = await field_obj.get_field_metadata()
239
- basic = await r.get_basic()
240
- assert basic is not None
241
- assert basic.slug == message1.slug
242
- assert basic.summary == message0.basic.summary
243
-
244
- assert ext1.text == message1.extracted_text[0].body.text
245
-
246
- assert lfm1 is not None
247
- assert fm1 is not None
248
- assert field_obj.value.body == message0.texts["wikipedia_ml"].body
249
-
250
-
251
- @pytest.mark.asyncio
252
- async def test_ingest_messages_origin(
253
- local_files,
254
- storage: Storage,
255
- fake_node,
256
- processor,
257
- knowledgebox_ingest,
258
- ):
259
- rid = "43ece3e4-b706-4c74-b41b-3637f6d28197"
260
- message1: BrokerMessage = BrokerMessage(
261
- kbid=knowledgebox_ingest,
262
- uuid=rid,
263
- slug="slug1",
264
- type=BrokerMessage.AUTOCOMMIT,
265
- )
266
- message1.source = BrokerMessage.MessageSource.WRITER
267
- await processor.process(message=message1, seqid=1)
268
-
269
- async with processor.driver.transaction() as txn:
270
- storage = await get_storage(service_name=SERVICE_NAME)
271
- kb = KnowledgeBox(txn, storage, knowledgebox_ingest)
272
- res = Resource(txn, storage, kb, rid)
273
- origin = await res.get_origin()
274
-
275
- # should not be set
276
- assert origin is None
277
-
278
- # now set the origin
279
- message1.origin.CopyFrom(
280
- Origin(
281
- source=Origin.Source.API,
282
- filename="file.png",
283
- url="http://www.google.com",
284
- )
285
- )
286
- await processor.process(message=message1, seqid=2)
287
-
288
- async with processor.driver.transaction() as txn:
289
- kb = KnowledgeBox(txn, storage, knowledgebox_ingest)
290
- res = Resource(txn, storage, kb, rid)
291
- origin = await res.get_origin()
292
-
293
- assert origin is not None
294
- assert origin.url == "http://www.google.com"
295
- assert origin.source == Origin.Source.API
296
- assert origin.filename == "file.png"
297
-
298
-
299
- def add_filefields(message, items=None):
300
- items = items or []
301
- for fieldid, filename in items:
302
- file_path = f"{dirname(__file__)}/assets/{filename}"
303
- cf1 = CloudFile(
304
- uri=filename,
305
- source=CloudFile.Source.LOCAL,
306
- bucket_name="/integration/ingest/assets",
307
- size=getsize(file_path),
308
- content_type="application/octet-stream",
309
- filename=filename,
310
- )
311
- message.files[fieldid].file.CopyFrom(cf1)
312
-
313
-
314
- def add_textfields(message, items=None):
315
- items = items or []
316
- for fieldid in items:
317
- message.texts[fieldid].body = "some random text"
318
-
319
-
320
- def make_message(
321
- kbid: str, rid: str, slug: str = "resource", message_type=BrokerMessage.AUTOCOMMIT
322
- ):
323
- message: BrokerMessage = BrokerMessage(
324
- kbid=kbid,
325
- uuid=rid,
326
- slug=slug,
327
- type=message_type,
328
- )
329
- message.basic.icon = "text/plain"
330
- message.basic.title = "Title Resource"
331
- message.basic.summary = "Summary of document"
332
- message.basic.thumbnail = "doc"
333
- message.basic.layout = "default"
334
- message.basic.metadata.language = "es"
335
- message.basic.created.FromDatetime(datetime.now())
336
- message.basic.modified.FromDatetime(datetime.now())
337
- message.origin.source = Origin.Source.WEB
338
-
339
- return message
340
-
341
-
342
- async def get_audit_messages(sub):
343
- msg = await sub.fetch(1)
344
- auditreq = AuditRequest()
345
- auditreq.ParseFromString(msg[0].data)
346
- return auditreq
347
-
348
-
349
- @pytest.mark.asyncio
350
- async def test_ingest_audit_stream_files_only(
351
- local_files,
352
- storage: Storage,
353
- txn,
354
- cache,
355
- fake_node,
356
- knowledgebox_ingest,
357
- stream_processor,
358
- stream_audit: StreamAuditStorage,
359
- maindb_driver,
360
- ):
361
- from nucliadb_utils.settings import audit_settings
362
-
363
- # Prepare a test audit stream to receive our messages
364
- partition = stream_audit.get_partition(knowledgebox_ingest)
365
- client: Client = await nats.connect(stream_audit.nats_servers)
366
- jetstream: JetStreamContext = client.jetstream()
367
- if audit_settings.audit_jetstream_target is None:
368
- assert False, "Missing jetstream target in audit settings"
369
- subject = audit_settings.audit_jetstream_target.format(
370
- partition=partition, type="*"
371
- )
372
- try:
373
- await jetstream.delete_stream(name=audit_settings.audit_stream)
374
- except nats.js.errors.NotFoundError:
375
- pass
376
- await jetstream.add_stream(name=audit_settings.audit_stream, subjects=[subject])
377
- psub = await jetstream.pull_subscribe(subject, "psub")
378
-
379
- rid = str(uuid.uuid4())
380
-
381
- # We use the same file multiple times, so the size will be the same
382
- test_png_size = getsize(f"{dirname(__file__)}/assets/file.png")
383
- test_text_size = getsize(f"{dirname(__file__)}/assets/text.pb")
384
- test_vectors_size = getsize(f"{dirname(__file__)}/assets/vectors.pb")
385
-
386
- #
387
- # Test 1: add a resource with some files
388
- #
389
- message = make_message(knowledgebox_ingest, rid)
390
- add_filefields(
391
- message,
392
- [("file_1", "file.png"), ("file_2", "text.pb"), ("file_3", "vectors.pb")],
393
- )
394
- await stream_processor.process(message=message, seqid=1)
395
-
396
- auditreq = await get_audit_messages(psub)
397
-
398
- # Minimal assert to make sure we get the information from the node on the audit
399
- # gets from the sidecar to the audit report when adding or modifying a resource
400
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
401
-
402
- assert auditreq.kbid == knowledgebox_ingest
403
- assert auditreq.rid == rid
404
- assert auditreq.type == AuditRequest.AuditType.NEW
405
-
406
- try:
407
- int(auditreq.trace_id)
408
- except ValueError:
409
- assert False, "Invalid trace ID"
410
-
411
- audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
412
- assert audit_by_fieldid["file_1"].action == AuditField.FieldAction.MODIFIED
413
- assert audit_by_fieldid["file_1"].size == test_png_size
414
- assert audit_by_fieldid["file_2"].action == AuditField.FieldAction.MODIFIED
415
- assert audit_by_fieldid["file_2"].size == test_text_size
416
- assert audit_by_fieldid["file_3"].action == AuditField.FieldAction.MODIFIED
417
- assert audit_by_fieldid["file_3"].size == test_vectors_size
418
-
419
- #
420
- # Test 2: delete one of the previous field on the same resource
421
- #
422
-
423
- message.files.clear()
424
- fieldid = FieldID(field="file_1", field_type=FieldType.FILE)
425
- message.delete_fields.append(fieldid)
426
-
427
- await stream_processor.process(message=message, seqid=2)
428
- auditreq = await get_audit_messages(psub)
429
-
430
- # Minimal assert to make sure we get the information from the node on the audit
431
- # gets from the sidecar to the audit report when adding or modifying a resource
432
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
433
-
434
- assert auditreq.kbid == knowledgebox_ingest
435
- assert auditreq.rid == rid
436
- assert auditreq.type == AuditRequest.AuditType.MODIFIED
437
-
438
- #
439
- # Test 3: modify a file while adding and deleting other files
440
- #
441
-
442
- message = make_message(knowledgebox_ingest, rid)
443
- add_filefields(message, [("file_2", "file.png"), ("file_4", "text.pb")])
444
- fieldid = FieldID(field="file_3", field_type=FieldType.FILE)
445
- message.delete_fields.append(fieldid)
446
-
447
- await stream_processor.process(message=message, seqid=3)
448
- auditreq = await get_audit_messages(psub)
449
-
450
- # Minimal assert to make sure we get the information from the node on the audit
451
- # gets from the sidecar to the audit report when adding or modifying a resource
452
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
453
-
454
- assert auditreq.kbid == knowledgebox_ingest
455
- assert auditreq.rid == rid
456
- assert auditreq.type == AuditRequest.AuditType.MODIFIED
457
-
458
- audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
459
- assert audit_by_fieldid["file_2"].action == AuditField.FieldAction.MODIFIED
460
- assert audit_by_fieldid["file_2"].size == test_png_size
461
- assert audit_by_fieldid["file_4"].action == AuditField.FieldAction.MODIFIED
462
- assert audit_by_fieldid["file_4"].size == test_text_size
463
- assert audit_by_fieldid["file_3"].action == AuditField.FieldAction.DELETED
464
- assert audit_by_fieldid["file_3"].size == 0
465
-
466
- #
467
- # Test 4: delete resource
468
- #
469
-
470
- message = make_message(
471
- knowledgebox_ingest, rid, message_type=BrokerMessage.MessageType.DELETE
472
- )
473
- await stream_processor.process(message=message, seqid=4)
474
- auditreq = await get_audit_messages(psub)
475
-
476
- assert auditreq.type == AuditRequest.AuditType.DELETED
477
-
478
- # Test 5: Delete knowledgebox
479
-
480
- txn = await maindb_driver.begin()
481
- kb = await datamanagers.kb.get_config(txn, kbid=knowledgebox_ingest)
482
-
483
- set_utility(Utility.AUDIT, stream_audit)
484
- await KnowledgeBox.delete_kb(txn, kb.slug, knowledgebox_ingest) # type: ignore
485
-
486
- auditreq = await get_audit_messages(psub)
487
- assert auditreq.kbid == knowledgebox_ingest
488
- assert auditreq.type == AuditRequest.AuditType.KB_DELETED
489
-
490
- try:
491
- int(auditreq.trace_id)
492
- except ValueError:
493
- assert False, "Invalid trace ID"
494
-
495
- # Currently where not updating audit counters on delete operations
496
- assert not auditreq.HasField("kb_counter")
497
-
498
- await txn.abort()
499
-
500
- await client.drain()
501
- await client.close()
502
-
503
-
504
- @pytest.mark.asyncio
505
- async def test_qa(
506
- local_files,
507
- storage: Storage,
508
- cache,
509
- fake_node,
510
- stream_processor,
511
- stream_audit: StreamAuditStorage,
512
- test_resource: Resource,
513
- ):
514
- kbid = test_resource.kb.kbid
515
- rid = test_resource.uuid
516
- driver = stream_processor.driver
517
- message = make_message(kbid, rid)
518
- message.account_seq = 2
519
- message.files["qa"].file.uri = "http://something"
520
- message.files["qa"].file.size = 123
521
- message.files["qa"].file.source = CloudFile.Source.LOCAL
522
-
523
- qaw = FieldQuestionAnswerWrapper()
524
- qaw.field.field_type = FieldType.FILE
525
- qaw.field.field = "qa"
526
-
527
- for i in range(10):
528
- qa = QuestionAnswer()
529
-
530
- qa.question.text = f"My question {i}"
531
- qa.question.language = "catalan"
532
- qa.question.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
533
-
534
- answer = Answers()
535
- answer.text = f"My answer {i}"
536
- answer.language = "catalan"
537
- answer.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
538
- qa.answers.append(answer)
539
- qaw.question_answers.question_answer.append(qa)
540
-
541
- message.question_answers.append(qaw)
542
-
543
- await stream_processor.process(message=message, seqid=1)
544
-
545
- async with driver.transaction() as txn:
546
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
547
- r = await kb_obj.get(message.uuid)
548
- assert r is not None
549
- res = await r.get_field(key="qa", type=FieldType.FILE)
550
- res_qa = await res.get_question_answers()
551
-
552
- assert qaw.question_answers == res_qa
553
-
554
- # delete op
555
- message = make_message(kbid, rid, message_type=BrokerMessage.MessageType.DELETE)
556
- await stream_processor.process(message=message, seqid=2)
557
-
558
-
559
- @pytest.mark.asyncio
560
- async def test_ingest_audit_stream_mixed(
561
- local_files,
562
- storage: Storage,
563
- cache,
564
- fake_node,
565
- stream_processor,
566
- stream_audit: StreamAuditStorage,
567
- test_resource: Resource,
568
- ):
569
- from nucliadb_utils.settings import audit_settings
570
-
571
- kbid = test_resource.kb.kbid
572
- rid = test_resource.uuid
573
- # Prepare a test audit stream to receive our messages
574
- partition = stream_audit.get_partition(kbid)
575
- client: Client = await nats.connect(stream_audit.nats_servers)
576
- jetstream: JetStreamContext = client.jetstream()
577
- if audit_settings.audit_jetstream_target is None:
578
- assert False, "Missing jetstream target in audit settings"
579
- subject = audit_settings.audit_jetstream_target.format(
580
- partition=partition, type="*"
581
- )
582
- try:
583
- await jetstream.delete_stream(name=audit_settings.audit_stream)
584
- except nats.js.errors.NotFoundError:
585
- pass
586
- await jetstream.add_stream(name=audit_settings.audit_stream, subjects=[subject])
587
- psub = await jetstream.pull_subscribe(subject, "psub")
588
-
589
- #
590
- # Test 1: starting with a complete resource, do one of heac add, mod, del field
591
- #
592
- message = make_message(kbid, rid)
593
- add_filefields(message, [("file_1", "file.png")])
594
- add_textfields(message, ["text1"])
595
- fieldid = FieldID(field="conv1", field_type=FieldType.CONVERSATION)
596
- message.delete_fields.append(fieldid)
597
- await stream_processor.process(message=message, seqid=1)
598
-
599
- auditreq = await get_audit_messages(psub)
600
-
601
- # Minimal assert to make sure we get the information from the node on the audit
602
- # gets from the sidecar to the audit report when adding or modifying a resource
603
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
604
-
605
- assert auditreq.kbid == kbid
606
- assert auditreq.rid == rid
607
- assert auditreq.type == AuditRequest.AuditType.MODIFIED
608
-
609
- assert len(auditreq.fields_audit) == 4
610
- audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
611
- assert audit_by_fieldid["file_1"].action == AuditField.FieldAction.MODIFIED
612
- assert audit_by_fieldid["text1"].action == AuditField.FieldAction.MODIFIED
613
- assert audit_by_fieldid["conv1"].action == AuditField.FieldAction.DELETED
614
-
615
- #
616
- # Test 2: delete resource
617
- #
618
-
619
- message = make_message(kbid, rid, message_type=BrokerMessage.MessageType.DELETE)
620
- await stream_processor.process(message=message, seqid=2)
621
- auditreq = await get_audit_messages(psub)
622
-
623
- assert auditreq.type == AuditRequest.AuditType.DELETED
624
-
625
- await client.drain()
626
- await client.close()
627
-
628
-
629
- @pytest.mark.asyncio
630
- async def test_ingest_account_seq_stored(
631
- local_files,
632
- storage: Storage,
633
- fake_node,
634
- stream_processor,
635
- test_resource: Resource,
636
- ):
637
- driver = stream_processor.driver
638
- kbid = test_resource.kb.kbid
639
- rid = test_resource.uuid
640
-
641
- message = make_message(kbid, rid)
642
- message.account_seq = 2
643
- add_filefields(message, [("file_1", "file.png")])
644
- await stream_processor.process(message=message, seqid=1)
645
-
646
- async with driver.transaction() as txn:
647
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
648
- r = await kb_obj.get(message.uuid)
649
- assert r is not None
650
- basic = await r.get_basic()
651
-
652
- assert basic is not None
653
- assert basic.last_account_seq == 2
654
- assert basic.queue == 0
655
-
656
-
657
- @pytest.mark.asyncio
658
- async def test_ingest_processor_handles_missing_kb(
659
- local_files,
660
- storage: Storage,
661
- fake_node,
662
- stream_processor,
663
- test_resource: Resource,
664
- ):
665
- kbid = str(uuid4())
666
- rid = str(uuid4())
667
- message = make_message(kbid, rid)
668
- message.account_seq = 1
669
- await stream_processor.process(message=message, seqid=1)
670
-
671
-
672
- @pytest.mark.asyncio
673
- async def test_ingest_autocommit_deadletter_marks_resource(
674
- kbid: str, processor: Processor, storage, maindb_driver
675
- ):
676
- rid = str(uuid.uuid4())
677
- message = make_message(kbid, rid)
678
-
679
- with patch.object(processor, "notify_commit") as mock_notify, pytest.raises(
680
- DeadletteredError
681
- ):
682
- # cause an error to force deadletter handling
683
- mock_notify.side_effect = Exception("test")
684
- await processor.process(message=message, seqid=1)
685
-
686
- async with maindb_driver.transaction() as txn:
687
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
688
- resource = await kb_obj.get(message.uuid)
689
-
690
- mock_notify.assert_called_once()
691
- assert resource.basic.metadata.status == PBMetadata.Status.ERROR # type: ignore