nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,684 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import base64
21
- import uuid
22
- from datetime import datetime
23
- from os.path import dirname, getsize
24
- from unittest.mock import patch
25
- from uuid import uuid4
26
-
27
- import nats
28
- import pytest
29
- from nats.aio.client import Client
30
- from nats.js import JetStreamContext
31
- from nucliadb_protos.audit_pb2 import AuditField, AuditRequest
32
- from nucliadb_protos.resources_pb2 import (
33
- TEXT,
34
- Answers,
35
- Classification,
36
- CloudFile,
37
- Entity,
38
- ExtractedTextWrapper,
39
- ExtractedVectorsWrapper,
40
- FieldComputedMetadataWrapper,
41
- FieldID,
42
- FieldQuestionAnswerWrapper,
43
- FieldType,
44
- FileExtractedData,
45
- LargeComputedMetadataWrapper,
46
- )
47
- from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
48
- from nucliadb_protos.resources_pb2 import Origin, Paragraph, QuestionAnswer
49
- from nucliadb_protos.utils_pb2 import Vector
50
- from nucliadb_protos.writer_pb2 import BrokerMessage
51
-
52
- from nucliadb.common.maindb.driver import Driver
53
- from nucliadb.ingest import SERVICE_NAME
54
- from nucliadb.ingest.consumer.auditing import (
55
- IndexAuditHandler,
56
- ResourceWritesAuditHandler,
57
- )
58
- from nucliadb.ingest.orm.exceptions import DeadletteredError
59
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
60
- from nucliadb.ingest.orm.processor import Processor
61
- from nucliadb.ingest.orm.resource import Resource
62
- from nucliadb_utils.audit.stream import StreamAuditStorage
63
- from nucliadb_utils.storages.storage import Storage
64
- from nucliadb_utils.utilities import Utility, get_indexing, get_storage, set_utility
65
-
66
- EXAMPLE_VECTOR = base64.b64decode(
67
- "k05VTVBZAQB2AHsnZGVzY3InOiAnPGY0JywgJ2ZvcnRyYW5fb3JkZXInOiBGYWxzZSwgJ3NoYXBlJzogKDc2OCwpLCB9ICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIAogKIq+lgYUvvf7bTx39oo+fFaXPbx2a71RfiK/FBroPVTawr4UEm2+AgtuvWzTTT5ipjg9JflLvMdfub6fBIE+d7gmvnJPl75alUQ+n68Hv2BYJL20+74+bHDsPV/wTT63h4E9hes9PqxHXT6h/J079HfVPF/7BD3BSMO+PuA9PjJhtD4W6pq+rmwjPp+Fzz6xUfa+FMZtOutIBT4mPik+EbsAPyRePb41IVW+i+0RPT7GtL51USY+GRjRvjWD1z4+wq+9j9kqvmq/074hHBM+kh+ZPoRfmb6R0yi/kuirvlcqLj+Ss64+0cMBP2UKsD2LtpI9927BvtCfHb5KY7U+8s64vkcGX778+NY+2pMxPNowJD7R39u+dbmfPqbrL73bIby+Nbu8voH3kr4gps6+f3L6PuJFAb3PFWA+99BPPjkLzD0vc8m79JmtvWYnbL6W+6A+WUWEveVVED0V0h8+3zPWvv19Dr2igdC9JcGRPV568z41ZVu8mRxRvdkBQr73JHO+PFxkvtHatLzVgN49NEgav0l7ab276hK+ABMDvrRrJj4akbO++zFnPRzXoDyecdi+pGq4viUgiL4XXwK+tvcOPivvgD7PV0w+D7CwPmfoiL0REec+tsx1Pe2xkD6S9Jm+ZW09P1Obiz2Ov/Q+OtsBP8Xicj7WJpi9szGJvqvWvz4hFqG++ZuGvIAmMb0r+T2+wj9RPgZN0z7KwGI+ezogPgI78D6aUrW8etzkPHpSqb7c4Sg+b6BZvXlSrr6un6a8uUCrvhbBgb7PtwA+CsSwvQzyz73G1eq+plYZP/6I7r6BRsu992/gPuIBJj9aT8u94saNvdIDG76Zar4+GeRxvncSZz3citO+ILq6vmS3D78JHk6/NdeIPWYQwb0WZJW9OnwJPhdIQL7Gta6+MZWevpRNvr0ZH/c9B//hPtNUlL1pWhu/VliNvshFjT6laVS9EpjovQBHdb4HWMe+e/rfvrcSDz620/I+krapPlnIDz5uR1Y+znjqPTFM+T1+kK8+VMcevDegSjvM7fw+e0yKPbDoVz56wk4+EeoGvnq3rT76dbW+ghE6vvos0b6CqQu+p6JDPvzn2bogOui+oZU5v6/Pvr4siDI9Kv6Dvt6TQj51LqW+qYLsPmyjZT45DkG9MQivPgIHBT/qeRW/ghXOPkcJtL6MwhA/9F5PvbR7Jr4ftKA+mdkePwm2A77WpNU+Ho/NvsWEfL75zPS9v8ycvtXFVD5ONFI7mVkOPlFd7bzacZK7aSyRPkRrhz6e8+k+glJ5Pq9mmD0X95y+APOjPveVBb9yOgM+DLlMPkqCRb7CKwW8N+TevpZtmD5lbpq+n+tdPr4+m7661Wg+gd66ve5dzr2ZH7k+x/aNPo+0Kz4PMMa+voMGv+ud8r4Nape892YZPWlDL76twQi/RC2QPk8juz1uTwC9yf3rvn8RmD5LO0e+7t5CvvYTbb5O8UA/yrZqO7aZib6FBEe+n/xAv08BGr15Vxs/FNIevkbN1r3f2Hw+oj18PJwOnb3SDpo+wf67vvy3sj6qvRM/BrljPtrlBr4w2Ck9Jh6fPv6Vn75qa7w+eWShvj6bYj56q46+x41KPvQtqb2qXVm+DTmTPpvXWz5hUnC9f7ptPAu1tsDAcUa+ckyGvTeaIz3FcaC9Zu/cvYvjzD7WUdQ+P2DFvrbdHz4CfVe+HxwAP3HZy775Q7w+eg+svcccuTwLBFW+QkVhPuSSjLymH6g+DFBKviDgWz0wxyK+1C+3PSKk975Hkxi8FKzVvRnykD5lFCa/bqnBPRACHL5uUS+/Zb3FvoK66j5CHUu+vq4TvkxWfT5wv7o+wW79PJHsrD42Aau+SuQFvdzUnz50dEe+qZNjvmZ1LLxvt529oeHDPsv3dT5O+z69vOoevm/1Cz5O7NU9i6uHPibkEr6g5d2+LobFPn+KAT/gLsY+2jm4vlpyhD48l4g+yqx3Pql7yr5sIYK+7awLPlnODb+3e7i+t9RVPQC99z6SQJk+lbXoPbyAI7mKcCu/4kX9vFuhtL61fhq+UjGgvYxSvDzCzfw+24xfvs+Sjr782jy+kTzaPmEqtD6sN2c9otXavSqTiT5hM/q+MjAFP4kflT5JOe280NUmvrQtkT4f55m9CyFwPr8GF7wNzBm+x05SvsFJtz0MG9w+HCf/vn4mkr7iMiw+DmhCPUDI/j3PrVe+glX3vlpDPz8ucKG+MexCPgoBDD+FMn68BMDnvCf+UT3bgq4+srqvvYF8H7+1VKq9qbQTvY1tBL8epwC85PUdviSEhD7hg7e9jMUzvVuFz71qCf29IudEPsAwH767q809fL0uvrk+Mr7OTVy9TNcWvhnV3T4hOwq/F/E3P4UOXz4Vade+fK8TP5v4sr4Amf8+HCqPvmYV7Lo3UMK+0urYPrSH3zw/8oq9tAHCvvs5GD91e6w9GsqJPNRo7j5ffH6+X++MvKFQxj17Es6+TA5OvW8tAz8C4nU8tiHDvm5FZL5Kv3O+fuZ0Php/9j0Gyua+mSKVvs+pDT8+TwC/qS8Gvl/z0r5iVLq+a8e/vIXlIT4r7Ty+dqrXPmn9Db4p4PS+Kv0nPfnVUz7avj0+KVOTPkG3Kz68dQa+LSKGvXnRvjxnzyM92moTvy9SnD4F9Dw+mWoyvXpXRD8nm7I9O245P6KlZT4zCxc+baKLPsyE0rw8YHO+coGePfcAYT300Jw+UoeUvlvHFD7CjpC+p9KpvlteKrvgzwG8Sbg2vn8NDz5MDtW99URGvoaaxb0svk+9+cajvUvAab1qXpS91FSbvszYlj6f9oI+Ge5yPDdVxr45qV2+WmuxPcx+qj5l88W9ApSIvsFrwT4GT8c+Vg/0PjkNT745ezC/9ogqPm7bE7/Wh1O9b7NrvlVU/j4u3ga9mv+xvaHTtD76O40+LIyTvssUDT73Q5y+QO5TvX7bgj3gY5S+YTSfvpYeIL6a+Y29CLmZvda6xz6cC9Q+9sQSPwnG+j3RS927zvaAvq7iLz0CqPw9Fir+vNr7VT5qEgM+yhqtupy5q76uVtE7eZ+Nvi/7h75rkLq9vOW0O7QhFj7JCbc+3tp7vlpEOT9+aPc+hwnnvkqLPr0Ry/4+8zOaPfE0O70OJ6I9eQlJvbAU/T0KcaK8gS2Kvulxdj0u2JY+u4mxPN4vXj7B6xQ+LjBLvuTgJ77vq7M7KbcIvnbIdD0UQd++ZyuHvlaAPr4SeMw++sRuvZ7sXz3yJ5O9cSmPvZ8mRL7X2JM9trN4PpzLt70C3Og94uwLv4pACb8LWoY9Uz+ZPvE1Ij4R8HG9JVyJvvFOZz6XkIU+had5PvoQKT7h3CK+IzATv1U3qrxUum68B1bDviBzhz7u5XI9KXwkPoszXr6en5I9VNxMPAKusT5XGTg8Ne9GvC6yBz/EidM+V8T8u3LO1D7qSJa+AlsUPeb9pb0vNFK+lFCevTGrR70aeSu+zihyvOLan77CaxE/5ZnaPUv8Nr/hBhs+oCZBPttGqr5ZrwO+O0DGPU7JOD7FxdK+pw6CPWumgz6VB7++Gjb1vq6Ns7uZ1FI9VmTLPsl2iz7h5YI8CJYXvh6MSz6ucvc9qx1bPovgpT7ZWyO+Z+d1vrXkrz3VC8s+dmievuxuHb7MOXE+ewUCvJcPuT6n2Rc8mQyYvl45Gr1ER3c9LCZYvmqQhb1lVJu+V1acPZp63z5Cfmu+4NFZPvmBJb6cmAI+J0U7PsLkSb16KrO9wj4JPo4Fq7563+09jAw8vkYbbD7/Z5q7TH1kvnJrLb1mqkS+R+a9vX0ODD4p9ak+un8VO6mSp71C66w+FlLVPr/0Wb0eLR2+AneHvVTFHD/P0X0+TsQ4vlWQQzzP8no6VtEOPHLiG78Foyg+Un5OP/fFeL3uVxc+C1VzP9IInL2Zbbo8bw2Lvt5f0b4LY9w9LyaMvIcBc70K3bs+9lz5vTSTC7770MG+B4dHvvRFSz3lO6w9ENACv5NLBz20vSk+MuMQPLQYZr/2+6o+gzANvXGTjL259Qy9ZUMKPnyCC7498ww8oGGSvouNujyvJVW+TjmIvvI8KT667mq9MC6fvVUcvz0=" # noqa
68
- )
69
-
70
-
71
- @pytest.fixture(autouse=True)
72
- async def audit_consumers(storage, pubsub, stream_audit: StreamAuditStorage):
73
- index_auditor = IndexAuditHandler(
74
- audit=stream_audit,
75
- pubsub=pubsub,
76
- )
77
- resource_writes_auditor = ResourceWritesAuditHandler(
78
- storage=storage,
79
- audit=stream_audit,
80
- pubsub=pubsub,
81
- )
82
-
83
- await index_auditor.initialize()
84
- await resource_writes_auditor.initialize()
85
- yield
86
- await index_auditor.finalize()
87
- await resource_writes_auditor.finalize()
88
-
89
-
90
- @pytest.fixture()
91
- def kbid(
92
- local_files,
93
- storage: Storage,
94
- txn,
95
- cache,
96
- fake_node,
97
- processor,
98
- knowledgebox_ingest,
99
- ):
100
- yield knowledgebox_ingest
101
-
102
-
103
- @pytest.mark.asyncio
104
- async def test_ingest_messages_autocommit(kbid: str, processor):
105
- rid = str(uuid.uuid4())
106
- message1: BrokerMessage = BrokerMessage(
107
- kbid=kbid,
108
- uuid=rid,
109
- slug="slug1",
110
- type=BrokerMessage.AUTOCOMMIT,
111
- )
112
- filename = f"{dirname(__file__)}/assets/file.png"
113
- cf1 = CloudFile(
114
- uri="file.png",
115
- source=CloudFile.Source.LOCAL,
116
- bucket_name="/integration/ingest/assets",
117
- size=getsize(filename),
118
- content_type="image/png",
119
- filename="file.png",
120
- )
121
- message1.basic.icon = "text/plain"
122
- message1.basic.title = "Title Resource"
123
- message1.basic.summary = "Summary of Document"
124
- message1.basic.thumbnail = "doc"
125
- message1.basic.layout = "default"
126
- message1.basic.metadata.language = "es"
127
- message1.basic.created.FromDatetime(datetime.now())
128
- message1.basic.modified.FromDatetime(datetime.now())
129
- message1.origin.source = Origin.Source.WEB
130
- message1.files["file"].file.CopyFrom(cf1)
131
-
132
- fed = FileExtractedData()
133
- fed.file_pages_previews.pages.append(cf1)
134
- fed.language = "ca"
135
- fed.md5 = "asdsadsad"
136
- fed.metadata["key1"] = "ca"
137
- fed.nested["key2"] = "ca"
138
- fed.file_generated["subfile1"].CopyFrom(cf1)
139
- fed.file_preview.CopyFrom(cf1)
140
- fed.file_thumbnail.CopyFrom(cf1)
141
- message1.file_extracted_data.append(fed)
142
-
143
- etw = ExtractedTextWrapper()
144
- etw.body.text = "My own text"
145
- etw.field.field = "file"
146
- etw.field.field_type = FieldType.FILE
147
- message1.extracted_text.append(etw)
148
- etw = ExtractedTextWrapper()
149
- etw.body.text = "My summary"
150
- etw.field.field = "summary"
151
- etw.field.field_type = FieldType.GENERIC
152
- message1.extracted_text.append(etw)
153
-
154
- fcm = FieldComputedMetadataWrapper()
155
- fcm.field.field = "file"
156
- fcm.field.field_type = FieldType.FILE
157
- p1 = Paragraph(
158
- start=1,
159
- end=20,
160
- )
161
- fcm.metadata.metadata.paragraphs.append(p1)
162
- fcm.metadata.metadata.last_index.FromDatetime(datetime.now())
163
- fcm.metadata.metadata.last_understanding.FromDatetime(datetime.now())
164
- fcm.metadata.metadata.last_extract.FromDatetime(datetime.now())
165
- fcm.metadata.metadata.ner["Ramon"] = "PERSON"
166
-
167
- c1 = Classification()
168
- c1.label = "label1"
169
- c1.labelset = "labelset1"
170
- fcm.metadata.metadata.classifications.append(c1)
171
- message1.field_metadata.append(fcm)
172
-
173
- lcmw = LargeComputedMetadataWrapper()
174
- lcmw.field.field = "file"
175
- lcmw.field.field_type = FieldType.FILE
176
- lcmw.real.metadata.tokens["asd"] = 4
177
- lcmw.real.metadata.entities.append(Entity(token="token", root="tok", type="PERSON"))
178
- message1.field_large_metadata.append(lcmw)
179
-
180
- ev = ExtractedVectorsWrapper()
181
- ev.field.field = "file"
182
- ev.field.field_type = FieldType.FILE
183
- v1 = Vector(
184
- start=1, end=10, start_paragraph=1, end_paragraph=20, vector=EXAMPLE_VECTOR
185
- )
186
- ev.vectors.vectors.vectors.append(v1)
187
- message1.field_vectors.append(ev)
188
-
189
- message1.source = BrokerMessage.MessageSource.WRITER
190
- await processor.process(message=message1, seqid=1)
191
-
192
- index = get_indexing()
193
- storage = await get_storage(service_name=SERVICE_NAME)
194
-
195
- pb = await storage.get_indexing(index._calls[0][1])
196
- assert pb.texts["a/summary"].text == "My summary" # type: ignore
197
-
198
- pb = await storage.get_indexing(index._calls[1][1])
199
- assert pb.texts["a/summary"].text == "My summary" # type: ignore
200
-
201
-
202
- @pytest.mark.asyncio
203
- async def test_ingest_error_message(
204
- kbid: str, storage: Storage, processor, maindb_driver: Driver
205
- ):
206
- filename = f"{dirname(__file__)}/assets/resource.pb"
207
- with open(filename, "r") as f:
208
- data = base64.b64decode(f.read())
209
- message0: BrokerMessage = BrokerMessage()
210
- message0.ParseFromString(data)
211
- message0.kbid = kbid
212
- message0.source = BrokerMessage.MessageSource.WRITER
213
-
214
- await processor.process(message=message0, seqid=1)
215
-
216
- filename = f"{dirname(__file__)}/assets/error.pb"
217
- with open(filename, "r") as f:
218
- data = base64.b64decode(f.read())
219
- message1: BrokerMessage = BrokerMessage()
220
- message1.ParseFromString(data)
221
- message1.kbid = kbid
222
- message1.ClearField("field_vectors")
223
- message1.source = BrokerMessage.MessageSource.WRITER
224
-
225
- await processor.process(message=message1, seqid=2)
226
-
227
- async with maindb_driver.transaction() as txn:
228
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
229
- r = await kb_obj.get(message1.uuid)
230
- assert r is not None
231
- field_obj = await r.get_field("wikipedia_ml", TEXT)
232
- ext1 = await field_obj.get_extracted_text()
233
- lfm1 = await field_obj.get_large_field_metadata()
234
- fm1 = await field_obj.get_field_metadata()
235
- basic = await r.get_basic()
236
- assert basic is not None
237
- assert basic.slug == message1.slug
238
- assert basic.summary == message0.basic.summary
239
-
240
- assert ext1.text == message1.extracted_text[0].body.text
241
-
242
- assert lfm1 is not None
243
- assert fm1 is not None
244
- assert field_obj.value.body == message0.texts["wikipedia_ml"].body
245
-
246
-
247
- @pytest.mark.asyncio
248
- async def test_ingest_messages_origin(
249
- local_files,
250
- storage: Storage,
251
- fake_node,
252
- processor,
253
- knowledgebox_ingest,
254
- ):
255
- rid = "43ece3e4-b706-4c74-b41b-3637f6d28197"
256
- message1: BrokerMessage = BrokerMessage(
257
- kbid=knowledgebox_ingest,
258
- uuid=rid,
259
- slug="slug1",
260
- type=BrokerMessage.AUTOCOMMIT,
261
- )
262
- message1.source = BrokerMessage.MessageSource.WRITER
263
- await processor.process(message=message1, seqid=1)
264
-
265
- async with processor.driver.transaction() as txn:
266
- storage = await get_storage(service_name=SERVICE_NAME)
267
- kb = KnowledgeBox(txn, storage, knowledgebox_ingest)
268
- res = Resource(txn, storage, kb, rid)
269
- origin = await res.get_origin()
270
-
271
- # should not be set
272
- assert origin is None
273
-
274
- # now set the origin
275
- message1.origin.CopyFrom(
276
- Origin(
277
- source=Origin.Source.API,
278
- filename="file.png",
279
- url="http://www.google.com",
280
- )
281
- )
282
- await processor.process(message=message1, seqid=2)
283
-
284
- async with processor.driver.transaction() as txn:
285
- kb = KnowledgeBox(txn, storage, knowledgebox_ingest)
286
- res = Resource(txn, storage, kb, rid)
287
- origin = await res.get_origin()
288
-
289
- assert origin is not None
290
- assert origin.url == "http://www.google.com"
291
- assert origin.source == Origin.Source.API
292
- assert origin.filename == "file.png"
293
-
294
-
295
- def add_filefields(message, items=None):
296
- items = items or []
297
- for fieldid, filename in items:
298
- file_path = f"{dirname(__file__)}/assets/{filename}"
299
- cf1 = CloudFile(
300
- uri=filename,
301
- source=CloudFile.Source.LOCAL,
302
- bucket_name="/integration/ingest/assets",
303
- size=getsize(file_path),
304
- content_type="application/octet-stream",
305
- filename=filename,
306
- )
307
- message.files[fieldid].file.CopyFrom(cf1)
308
-
309
-
310
- def add_textfields(message, items=None):
311
- items = items or []
312
- for fieldid in items:
313
- message.texts[fieldid].body = "some random text"
314
-
315
-
316
- def make_message(
317
- kbid: str, rid: str, slug: str = "resource", message_type=BrokerMessage.AUTOCOMMIT
318
- ):
319
- message: BrokerMessage = BrokerMessage(
320
- kbid=kbid,
321
- uuid=rid,
322
- slug=slug,
323
- type=message_type,
324
- )
325
- message.basic.icon = "text/plain"
326
- message.basic.title = "Title Resource"
327
- message.basic.summary = "Summary of document"
328
- message.basic.thumbnail = "doc"
329
- message.basic.layout = "default"
330
- message.basic.metadata.language = "es"
331
- message.basic.created.FromDatetime(datetime.now())
332
- message.basic.modified.FromDatetime(datetime.now())
333
- message.origin.source = Origin.Source.WEB
334
-
335
- return message
336
-
337
-
338
- async def get_audit_messages(sub):
339
- msg = await sub.fetch(1)
340
- auditreq = AuditRequest()
341
- auditreq.ParseFromString(msg[0].data)
342
- return auditreq
343
-
344
-
345
- @pytest.mark.asyncio
346
- async def test_ingest_audit_stream_files_only(
347
- local_files,
348
- storage: Storage,
349
- txn,
350
- cache,
351
- fake_node,
352
- knowledgebox_ingest,
353
- stream_processor,
354
- stream_audit: StreamAuditStorage,
355
- maindb_driver: Driver,
356
- ):
357
- from nucliadb_utils.settings import audit_settings
358
-
359
- # Prepare a test audit stream to receive our messages
360
- partition = stream_audit.get_partition(knowledgebox_ingest)
361
- client: Client = await nats.connect(stream_audit.nats_servers)
362
- jetstream: JetStreamContext = client.jetstream()
363
- if audit_settings.audit_jetstream_target is None:
364
- assert False, "Missing jetstream target in audit settings"
365
- subject = audit_settings.audit_jetstream_target.format(
366
- partition=partition, type="*"
367
- )
368
- try:
369
- await jetstream.delete_stream(name=audit_settings.audit_stream)
370
- except nats.js.errors.NotFoundError:
371
- pass
372
- await jetstream.add_stream(name=audit_settings.audit_stream, subjects=[subject])
373
- psub = await jetstream.pull_subscribe(subject, "psub")
374
-
375
- rid = str(uuid.uuid4())
376
-
377
- # We use the same file multiple times, so the size will be the same
378
- test_png_size = getsize(f"{dirname(__file__)}/assets/file.png")
379
- test_text_size = getsize(f"{dirname(__file__)}/assets/text.pb")
380
- test_vectors_size = getsize(f"{dirname(__file__)}/assets/vectors.pb")
381
-
382
- #
383
- # Test 1: add a resource with some files
384
- #
385
- message = make_message(knowledgebox_ingest, rid)
386
- add_filefields(
387
- message,
388
- [("file_1", "file.png"), ("file_2", "text.pb"), ("file_3", "vectors.pb")],
389
- )
390
- await stream_processor.process(message=message, seqid=1)
391
-
392
- auditreq = await get_audit_messages(psub)
393
-
394
- # Minimal assert to make sure we get the information from the node on the audit
395
- # gets from the sidecar to the audit report when adding or modifying a resource
396
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
397
-
398
- assert auditreq.kbid == knowledgebox_ingest
399
- assert auditreq.rid == rid
400
- assert auditreq.type == AuditRequest.AuditType.NEW
401
-
402
- try:
403
- int(auditreq.trace_id)
404
- except ValueError:
405
- assert False, "Invalid trace ID"
406
-
407
- audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
408
- assert audit_by_fieldid["file_1"].action == AuditField.FieldAction.MODIFIED
409
- assert audit_by_fieldid["file_1"].size == test_png_size
410
- assert audit_by_fieldid["file_2"].action == AuditField.FieldAction.MODIFIED
411
- assert audit_by_fieldid["file_2"].size == test_text_size
412
- assert audit_by_fieldid["file_3"].action == AuditField.FieldAction.MODIFIED
413
- assert audit_by_fieldid["file_3"].size == test_vectors_size
414
-
415
- #
416
- # Test 2: delete one of the previous field on the same resource
417
- #
418
-
419
- message.files.clear()
420
- fieldid = FieldID(field="file_1", field_type=FieldType.FILE)
421
- message.delete_fields.append(fieldid)
422
-
423
- await stream_processor.process(message=message, seqid=2)
424
- auditreq = await get_audit_messages(psub)
425
-
426
- # Minimal assert to make sure we get the information from the node on the audit
427
- # gets from the sidecar to the audit report when adding or modifying a resource
428
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
429
-
430
- assert auditreq.kbid == knowledgebox_ingest
431
- assert auditreq.rid == rid
432
- assert auditreq.type == AuditRequest.AuditType.MODIFIED
433
-
434
- #
435
- # Test 3: modify a file while adding and deleting other files
436
- #
437
-
438
- message = make_message(knowledgebox_ingest, rid)
439
- add_filefields(message, [("file_2", "file.png"), ("file_4", "text.pb")])
440
- fieldid = FieldID(field="file_3", field_type=FieldType.FILE)
441
- message.delete_fields.append(fieldid)
442
-
443
- await stream_processor.process(message=message, seqid=3)
444
- auditreq = await get_audit_messages(psub)
445
-
446
- # Minimal assert to make sure we get the information from the node on the audit
447
- # gets from the sidecar to the audit report when adding or modifying a resource
448
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
449
-
450
- assert auditreq.kbid == knowledgebox_ingest
451
- assert auditreq.rid == rid
452
- assert auditreq.type == AuditRequest.AuditType.MODIFIED
453
-
454
- audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
455
- assert audit_by_fieldid["file_2"].action == AuditField.FieldAction.MODIFIED
456
- assert audit_by_fieldid["file_2"].size == test_png_size
457
- assert audit_by_fieldid["file_4"].action == AuditField.FieldAction.MODIFIED
458
- assert audit_by_fieldid["file_4"].size == test_text_size
459
- assert audit_by_fieldid["file_3"].action == AuditField.FieldAction.DELETED
460
- assert audit_by_fieldid["file_3"].size == 0
461
-
462
- #
463
- # Test 4: delete resource
464
- #
465
-
466
- message = make_message(
467
- knowledgebox_ingest, rid, message_type=BrokerMessage.MessageType.DELETE
468
- )
469
- await stream_processor.process(message=message, seqid=4)
470
- auditreq = await get_audit_messages(psub)
471
-
472
- assert auditreq.type == AuditRequest.AuditType.DELETED
473
-
474
- # Test 5: Delete knowledgebox
475
-
476
- async with maindb_driver.transaction() as txn:
477
- set_utility(Utility.AUDIT, stream_audit)
478
- await KnowledgeBox.delete_kb(txn, knowledgebox_ingest) # type: ignore
479
-
480
- auditreq = await get_audit_messages(psub)
481
- assert auditreq.kbid == knowledgebox_ingest
482
- assert auditreq.type == AuditRequest.AuditType.KB_DELETED
483
-
484
- try:
485
- int(auditreq.trace_id)
486
- except ValueError:
487
- assert False, "Invalid trace ID"
488
-
489
- # Currently where not updating audit counters on delete operations
490
- assert not auditreq.HasField("kb_counter")
491
-
492
- await client.drain()
493
- await client.close()
494
-
495
-
496
- @pytest.mark.asyncio
497
- async def test_qa(
498
- local_files,
499
- storage: Storage,
500
- cache,
501
- fake_node,
502
- stream_processor,
503
- stream_audit: StreamAuditStorage,
504
- test_resource: Resource,
505
- ):
506
- kbid = test_resource.kb.kbid
507
- rid = test_resource.uuid
508
- driver = stream_processor.driver
509
- message = make_message(kbid, rid)
510
- message.account_seq = 2
511
- message.files["qa"].file.uri = "http://something"
512
- message.files["qa"].file.size = 123
513
- message.files["qa"].file.source = CloudFile.Source.LOCAL
514
-
515
- qaw = FieldQuestionAnswerWrapper()
516
- qaw.field.field_type = FieldType.FILE
517
- qaw.field.field = "qa"
518
-
519
- for i in range(10):
520
- qa = QuestionAnswer()
521
-
522
- qa.question.text = f"My question {i}"
523
- qa.question.language = "catalan"
524
- qa.question.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
525
-
526
- answer = Answers()
527
- answer.text = f"My answer {i}"
528
- answer.language = "catalan"
529
- answer.ids_paragraphs.extend([f"id1/{i}", f"id2/{i}"])
530
- qa.answers.append(answer)
531
- qaw.question_answers.question_answer.append(qa)
532
-
533
- message.question_answers.append(qaw)
534
-
535
- await stream_processor.process(message=message, seqid=1)
536
-
537
- async with driver.transaction() as txn:
538
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
539
- r = await kb_obj.get(message.uuid)
540
- assert r is not None
541
- res = await r.get_field(key="qa", type=FieldType.FILE)
542
- res_qa = await res.get_question_answers()
543
-
544
- assert qaw.question_answers == res_qa
545
-
546
- # delete op
547
- message = make_message(kbid, rid, message_type=BrokerMessage.MessageType.DELETE)
548
- await stream_processor.process(message=message, seqid=2)
549
-
550
-
551
- @pytest.mark.asyncio
552
- async def test_ingest_audit_stream_mixed(
553
- local_files,
554
- storage: Storage,
555
- cache,
556
- fake_node,
557
- stream_processor,
558
- stream_audit: StreamAuditStorage,
559
- test_resource: Resource,
560
- ):
561
- from nucliadb_utils.settings import audit_settings
562
-
563
- kbid = test_resource.kb.kbid
564
- rid = test_resource.uuid
565
- # Prepare a test audit stream to receive our messages
566
- partition = stream_audit.get_partition(kbid)
567
- client: Client = await nats.connect(stream_audit.nats_servers)
568
- jetstream: JetStreamContext = client.jetstream()
569
- if audit_settings.audit_jetstream_target is None:
570
- assert False, "Missing jetstream target in audit settings"
571
- subject = audit_settings.audit_jetstream_target.format(
572
- partition=partition, type="*"
573
- )
574
- try:
575
- await jetstream.delete_stream(name=audit_settings.audit_stream)
576
- except nats.js.errors.NotFoundError:
577
- pass
578
- await jetstream.add_stream(name=audit_settings.audit_stream, subjects=[subject])
579
- psub = await jetstream.pull_subscribe(subject, "psub")
580
-
581
- #
582
- # Test 1: starting with a complete resource, do one of heac add, mod, del field
583
- #
584
- message = make_message(kbid, rid)
585
- add_filefields(message, [("file_1", "file.png")])
586
- add_textfields(message, ["text1"])
587
- fieldid = FieldID(field="conv1", field_type=FieldType.CONVERSATION)
588
- message.delete_fields.append(fieldid)
589
- await stream_processor.process(message=message, seqid=1)
590
-
591
- auditreq = await get_audit_messages(psub)
592
-
593
- # Minimal assert to make sure we get the information from the node on the audit
594
- # gets from the sidecar to the audit report when adding or modifying a resource
595
- # The values are hardcoded on nucliadb/nucliadb/ingest/orm/grpc_node_dummy.py
596
-
597
- assert auditreq.kbid == kbid
598
- assert auditreq.rid == rid
599
- assert auditreq.type == AuditRequest.AuditType.MODIFIED
600
-
601
- assert len(auditreq.fields_audit) == 4
602
- audit_by_fieldid = {audit.field_id: audit for audit in auditreq.fields_audit}
603
- assert audit_by_fieldid["file_1"].action == AuditField.FieldAction.MODIFIED
604
- assert audit_by_fieldid["text1"].action == AuditField.FieldAction.MODIFIED
605
- assert audit_by_fieldid["conv1"].action == AuditField.FieldAction.DELETED
606
-
607
- #
608
- # Test 2: delete resource
609
- #
610
-
611
- message = make_message(kbid, rid, message_type=BrokerMessage.MessageType.DELETE)
612
- await stream_processor.process(message=message, seqid=2)
613
- auditreq = await get_audit_messages(psub)
614
-
615
- assert auditreq.type == AuditRequest.AuditType.DELETED
616
-
617
- await client.drain()
618
- await client.close()
619
-
620
-
621
- @pytest.mark.asyncio
622
- async def test_ingest_account_seq_stored(
623
- local_files,
624
- storage: Storage,
625
- fake_node,
626
- stream_processor,
627
- test_resource: Resource,
628
- ):
629
- driver = stream_processor.driver
630
- kbid = test_resource.kb.kbid
631
- rid = test_resource.uuid
632
-
633
- message = make_message(kbid, rid)
634
- message.account_seq = 2
635
- add_filefields(message, [("file_1", "file.png")])
636
- await stream_processor.process(message=message, seqid=1)
637
-
638
- async with driver.transaction() as txn:
639
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
640
- r = await kb_obj.get(message.uuid)
641
- assert r is not None
642
- basic = await r.get_basic()
643
-
644
- assert basic is not None
645
- assert basic.last_account_seq == 2
646
- assert basic.queue == 0
647
-
648
-
649
- @pytest.mark.asyncio
650
- async def test_ingest_processor_handles_missing_kb(
651
- local_files,
652
- storage: Storage,
653
- fake_node,
654
- stream_processor,
655
- test_resource: Resource,
656
- ):
657
- kbid = str(uuid4())
658
- rid = str(uuid4())
659
- message = make_message(kbid, rid)
660
- message.account_seq = 1
661
- await stream_processor.process(message=message, seqid=1)
662
-
663
-
664
- @pytest.mark.asyncio
665
- async def test_ingest_autocommit_deadletter_marks_resource(
666
- kbid: str, processor: Processor, storage, maindb_driver: Driver
667
- ):
668
- rid = str(uuid.uuid4())
669
- message = make_message(kbid, rid)
670
-
671
- with (
672
- patch.object(processor, "notify_commit") as mock_notify,
673
- pytest.raises(DeadletteredError),
674
- ):
675
- # cause an error to force deadletter handling
676
- mock_notify.side_effect = Exception("test")
677
- await processor.process(message=message, seqid=1)
678
-
679
- async with maindb_driver.transaction() as txn:
680
- kb_obj = KnowledgeBox(txn, storage, kbid=kbid)
681
- resource = await kb_obj.get(message.uuid)
682
-
683
- mock_notify.assert_called_once()
684
- assert resource.basic.metadata.status == PBMetadata.Status.ERROR # type: ignore