nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,140 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
-
21
- import asyncio
22
- from unittest.mock import AsyncMock, MagicMock, patch
23
-
24
- import pytest
25
- from nucliadb_protos.writer_pb2 import Notification, ShardObject, Shards
26
-
27
- from nucliadb.common.cluster.settings import settings
28
- from nucliadb.ingest.consumer import shard_creator
29
- from nucliadb_protos import nodereader_pb2
30
-
31
- pytestmark = pytest.mark.asyncio
32
-
33
-
34
- @pytest.fixture()
35
- def pubsub():
36
- mock = AsyncMock()
37
- mock.parse = lambda x: x
38
- yield mock
39
-
40
-
41
- @pytest.fixture()
42
- def reader():
43
- yield AsyncMock()
44
-
45
-
46
- @pytest.fixture()
47
- def kbdm():
48
- mock = MagicMock()
49
- mock.get_model_metadata = AsyncMock(return_value="model")
50
- with patch("nucliadb.common.cluster.manager.datamanagers.kb", return_value=mock):
51
- yield mock
52
-
53
-
54
- @pytest.fixture()
55
- def shard_manager(reader):
56
- sm = MagicMock()
57
- node = MagicMock(reader=reader)
58
- shards = Shards(shards=[ShardObject(read_only=False)], actual=0)
59
- sm.get_current_active_shard = AsyncMock(return_value=shards.shards[0])
60
- sm.maybe_create_new_shard = AsyncMock()
61
- with (
62
- patch(
63
- "nucliadb.ingest.consumer.shard_creator.get_shard_manager", return_value=sm
64
- ),
65
- patch(
66
- "nucliadb.ingest.consumer.shard_creator.choose_node",
67
- return_value=(node, "shard_id"),
68
- ),
69
- patch(
70
- "nucliadb.ingest.consumer.shard_creator.locking.distributed_lock",
71
- return_value=AsyncMock(),
72
- ),
73
- ):
74
- yield sm
75
-
76
-
77
- @pytest.fixture()
78
- async def shard_creator_handler(pubsub, shard_manager):
79
- sc = shard_creator.ShardCreatorHandler(
80
- driver=AsyncMock(transaction=MagicMock(return_value=AsyncMock())),
81
- storage=AsyncMock(),
82
- pubsub=pubsub,
83
- check_delay=0.05,
84
- )
85
- await sc.initialize()
86
- yield sc
87
- await sc.finalize()
88
-
89
-
90
- async def test_handle_message_create_new_shard(
91
- shard_creator_handler: shard_creator.ShardCreatorHandler,
92
- reader,
93
- kbdm,
94
- shard_manager,
95
- ):
96
- reader.GetShard.return_value = nodereader_pb2.Shard(
97
- paragraphs=settings.max_shard_paragraphs + 1
98
- )
99
-
100
- notif = Notification(
101
- kbid="kbid",
102
- action=Notification.Action.INDEXED,
103
- )
104
- await shard_creator_handler.handle_message(notif.SerializeToString())
105
- await asyncio.sleep(0.06)
106
- shard_manager.maybe_create_new_shard.assert_called_with(
107
- "kbid", settings.max_shard_paragraphs + 1
108
- )
109
-
110
-
111
- async def test_handle_message_do_not_create(
112
- shard_creator_handler: shard_creator.ShardCreatorHandler, reader, shard_manager
113
- ):
114
- reader.GetShard.return_value = nodereader_pb2.Shard(
115
- paragraphs=settings.max_shard_paragraphs - 1
116
- )
117
-
118
- notif = Notification(
119
- kbid="kbid",
120
- action=Notification.Action.INDEXED,
121
- )
122
- await shard_creator_handler.handle_message(notif.SerializeToString())
123
-
124
- await shard_creator_handler.finalize()
125
-
126
- shard_manager.create_shard_by_kbid.assert_not_called()
127
-
128
-
129
- async def test_handle_message_ignore_not_indexed(
130
- shard_creator_handler: shard_creator.ShardCreatorHandler, shard_manager
131
- ):
132
- notif = Notification(
133
- kbid="kbid",
134
- action=Notification.Action.COMMIT,
135
- )
136
- await shard_creator_handler.handle_message(notif.SerializeToString())
137
-
138
- await shard_creator_handler.finalize()
139
-
140
- shard_manager.create_shard_by_kbid.assert_not_called()
@@ -1,67 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
-
21
- import asyncio
22
-
23
- import pytest
24
-
25
- from nucliadb.ingest.consumer import utils
26
-
27
- pytestmark = pytest.mark.asyncio
28
-
29
-
30
- async def test_delay_task_handler():
31
- dth = utils.DelayedTaskHandler(0.05)
32
- await dth.initialize()
33
-
34
- counter = 0
35
-
36
- async def handler():
37
- await asyncio.sleep(0.1)
38
- nonlocal counter
39
- counter += 1
40
-
41
- dth.schedule("key1", handler)
42
- dth.schedule("key1", handler)
43
- dth.schedule("key1", handler)
44
- dth.schedule("key2", handler)
45
- dth.schedule("key3", handler)
46
- dth.schedule("key4", handler)
47
-
48
- # all should be scheduled and duplicates ignored
49
- assert len(dth.to_process) == 4
50
-
51
- await asyncio.sleep(0.06)
52
- # they should all be running now
53
- assert len(dth.outstanding_tasks) == 4
54
-
55
- # schedule a couple more
56
- dth.schedule("key1", handler) # duplicate key, should get rescheduled at end
57
- dth.schedule("key5", handler)
58
- dth.schedule("key6", handler)
59
-
60
- await asyncio.sleep(0.1)
61
- # original set should be finished now
62
- assert counter == 4
63
-
64
- # finish everything now
65
- await dth.finalize()
66
-
67
- assert counter == 7
@@ -1,19 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
@@ -1,247 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- from uuid import uuid4
21
-
22
- import pytest
23
- from nucliadb_protos.noderesources_pb2 import Resource as PBResource
24
- from nucliadb_protos.resources_pb2 import (
25
- Basic,
26
- ExtractedText,
27
- FieldComputedMetadataWrapper,
28
- FieldID,
29
- FieldType,
30
- Metadata,
31
- Paragraph,
32
- Sentence,
33
- )
34
-
35
- from nucliadb.ingest.orm.brain import ParagraphPages, ResourceBrain
36
- from nucliadb_protos import resources_pb2
37
-
38
-
39
- def test_apply_field_metadata_marks_duplicated_paragraphs():
40
- # Simulate a field with two paragraphs that contain the same text
41
- br = ResourceBrain(rid=str(uuid4()))
42
- field_key = "text1"
43
- fcmw = FieldComputedMetadataWrapper()
44
- fcmw.field.CopyFrom(FieldID(field_type=FieldType.TEXT, field=field_key))
45
- paragraph = "Some paragraph here. "
46
- text_1 = f"{paragraph}{paragraph}"
47
- first_occurrence = [0, len(paragraph)]
48
- second_occurrence = [len(paragraph), len(paragraph) * 2]
49
-
50
- et = ExtractedText(text=text_1)
51
- p1 = Paragraph(start=first_occurrence[0], end=first_occurrence[1])
52
- p1.sentences.append(
53
- Sentence(start=first_occurrence[0], end=first_occurrence[1], key="test")
54
- )
55
- p2 = Paragraph(start=second_occurrence[0], end=second_occurrence[1])
56
- p2.sentences.append(
57
- Sentence(start=second_occurrence[0], end=second_occurrence[1], key="test")
58
- )
59
- fcmw.metadata.metadata.paragraphs.append(p1)
60
- fcmw.metadata.metadata.paragraphs.append(p2)
61
-
62
- br.apply_field_metadata(
63
- field_key,
64
- fcmw.metadata,
65
- replace_field=[],
66
- replace_splits={},
67
- page_positions={},
68
- extracted_text=et,
69
- )
70
-
71
- assert len(br.brain.paragraphs[field_key].paragraphs) == 2
72
- for key, paragraph in br.brain.paragraphs[field_key].paragraphs.items():
73
- if f"{first_occurrence[0]}-{first_occurrence[1]}" in key:
74
- # Only the first time that a paragraph is found should be set to false
75
- assert paragraph.repeated_in_field is False
76
- else:
77
- assert paragraph.repeated_in_field is True
78
-
79
-
80
- def test_apply_field_metadata_marks_duplicated_paragraphs_on_split_metadata():
81
- # # Test now the split text path
82
- br = ResourceBrain(rid=str(uuid4()))
83
- field_key = "text1"
84
- split_key = "subfield"
85
- fcmw = FieldComputedMetadataWrapper()
86
- fcmw.field.CopyFrom(FieldID(field_type=FieldType.TEXT, field=field_key))
87
- paragraph = "Some paragraph here. "
88
- text_1 = f"{paragraph}{paragraph}"
89
- first_occurrence = [0, len(paragraph)]
90
- second_occurrence = [len(paragraph), len(paragraph) * 2]
91
-
92
- et = ExtractedText()
93
- et.split_text[split_key] = text_1
94
- p1 = Paragraph(start=first_occurrence[0], end=first_occurrence[1])
95
- p1.sentences.append(
96
- Sentence(start=first_occurrence[0], end=first_occurrence[1], key="test")
97
- )
98
- p2 = Paragraph(start=second_occurrence[0], end=second_occurrence[1])
99
- p2.sentences.append(
100
- Sentence(start=second_occurrence[0], end=second_occurrence[1], key="test")
101
- )
102
- fcmw.metadata.split_metadata[split_key].paragraphs.append(p1)
103
- fcmw.metadata.split_metadata[split_key].paragraphs.append(p2)
104
-
105
- br.apply_field_metadata(
106
- field_key,
107
- fcmw.metadata,
108
- replace_field=[],
109
- replace_splits={},
110
- page_positions={},
111
- extracted_text=et,
112
- )
113
-
114
- assert len(br.brain.paragraphs[field_key].paragraphs) == 2
115
- for key, paragraph in br.brain.paragraphs[field_key].paragraphs.items():
116
- if f"{first_occurrence[0]}-{first_occurrence[1]}" in key:
117
- # Only the first time that a paragraph is found should be set to false
118
- assert paragraph.repeated_in_field is False
119
- else:
120
- assert paragraph.repeated_in_field is True
121
-
122
-
123
- def test_get_page_number():
124
- page_numbers = ParagraphPages(
125
- {
126
- 0: (0, 99),
127
- 1: (100, 199),
128
- 2: (200, 299),
129
- }
130
- )
131
- assert page_numbers.get(10) == 0
132
- assert page_numbers.get(100) == 1
133
- assert page_numbers.get(500) == 2
134
-
135
-
136
- @pytest.mark.parametrize(
137
- "new_status,previous_status,expected_brain_status",
138
- [
139
- # No previous_status
140
- (Metadata.Status.PENDING, None, PBResource.PENDING),
141
- (Metadata.Status.PROCESSED, None, PBResource.PROCESSED),
142
- (Metadata.Status.ERROR, None, PBResource.PROCESSED),
143
- (Metadata.Status.BLOCKED, None, PBResource.PROCESSED),
144
- (Metadata.Status.EXPIRED, None, PBResource.PROCESSED),
145
- # previous_status = PENDING
146
- (Metadata.Status.PENDING, Metadata.Status.PENDING, PBResource.PENDING),
147
- (Metadata.Status.PROCESSED, Metadata.Status.PENDING, PBResource.PROCESSED),
148
- (Metadata.Status.ERROR, Metadata.Status.PENDING, PBResource.PROCESSED),
149
- (Metadata.Status.BLOCKED, Metadata.Status.PENDING, PBResource.PROCESSED),
150
- (Metadata.Status.EXPIRED, Metadata.Status.PENDING, PBResource.PROCESSED),
151
- # previous_status = PROCESSED
152
- (Metadata.Status.PROCESSED, Metadata.Status.PROCESSED, PBResource.PROCESSED),
153
- (Metadata.Status.ERROR, Metadata.Status.PROCESSED, PBResource.PROCESSED),
154
- (Metadata.Status.BLOCKED, Metadata.Status.PROCESSED, PBResource.PROCESSED),
155
- (Metadata.Status.PENDING, Metadata.Status.PROCESSED, PBResource.PROCESSED),
156
- (Metadata.Status.EXPIRED, Metadata.Status.PROCESSED, PBResource.PROCESSED),
157
- # previous_status = ERROR
158
- (Metadata.Status.PENDING, Metadata.Status.ERROR, PBResource.PROCESSED),
159
- (Metadata.Status.PROCESSED, Metadata.Status.ERROR, PBResource.PROCESSED),
160
- (Metadata.Status.ERROR, Metadata.Status.ERROR, PBResource.PROCESSED),
161
- (Metadata.Status.BLOCKED, Metadata.Status.ERROR, PBResource.PROCESSED),
162
- (Metadata.Status.EXPIRED, Metadata.Status.ERROR, PBResource.PROCESSED),
163
- # previous_status = BLOCKED
164
- (Metadata.Status.PENDING, Metadata.Status.BLOCKED, PBResource.PROCESSED),
165
- (Metadata.Status.PROCESSED, Metadata.Status.BLOCKED, PBResource.PROCESSED),
166
- (Metadata.Status.ERROR, Metadata.Status.BLOCKED, PBResource.PROCESSED),
167
- (Metadata.Status.BLOCKED, Metadata.Status.BLOCKED, PBResource.PROCESSED),
168
- (Metadata.Status.EXPIRED, Metadata.Status.BLOCKED, PBResource.PROCESSED),
169
- # previous_status = EXPIRED
170
- (Metadata.Status.PENDING, Metadata.Status.EXPIRED, PBResource.PROCESSED),
171
- (Metadata.Status.PROCESSED, Metadata.Status.EXPIRED, PBResource.PROCESSED),
172
- (Metadata.Status.ERROR, Metadata.Status.EXPIRED, PBResource.PROCESSED),
173
- (Metadata.Status.BLOCKED, Metadata.Status.EXPIRED, PBResource.PROCESSED),
174
- (Metadata.Status.EXPIRED, Metadata.Status.EXPIRED, PBResource.PROCESSED),
175
- ],
176
- )
177
- def test_set_processing_status(new_status, previous_status, expected_brain_status):
178
- br = ResourceBrain(rid="foo")
179
- basic = Basic()
180
- basic.metadata.status = new_status
181
- br.set_processing_status(basic, previous_status)
182
- assert br.brain.status == expected_brain_status
183
-
184
-
185
- def test_apply_field_metadata_populates_page_number():
186
- br = ResourceBrain(rid="foo")
187
- field_key = "text1"
188
-
189
- fcmw = FieldComputedMetadataWrapper()
190
- fcmw.field.CopyFrom(FieldID(field_type=FieldType.TEXT, field=field_key))
191
-
192
- p1 = Paragraph(
193
- start=40, end=54, start_seconds=[0], end_seconds=[10], text="Some text here"
194
- )
195
- p1.sentences.append(Sentence(start=40, end=54, key="test"))
196
- fcmw.metadata.metadata.paragraphs.append(p1)
197
-
198
- # Add it to the split too
199
- fcmw.metadata.split_metadata["subfield"].paragraphs.append(p1)
200
-
201
- page_positions = {
202
- 0: (0, 20),
203
- 1: (21, 39),
204
- 2: (40, 100),
205
- }
206
- br.apply_field_metadata(
207
- field_key,
208
- fcmw.metadata,
209
- replace_field=[],
210
- replace_splits={},
211
- page_positions=page_positions,
212
- extracted_text=None,
213
- )
214
-
215
- assert len(br.brain.paragraphs[field_key].paragraphs) == 2
216
- for paragraph in br.brain.paragraphs[field_key].paragraphs.values():
217
- assert paragraph.metadata.position.page_number == 2
218
- assert paragraph.metadata.position.start == 40
219
- assert paragraph.metadata.position.end == 54
220
- assert paragraph.metadata.position.start_seconds == [0]
221
- assert paragraph.metadata.position.end_seconds == [10]
222
-
223
-
224
- def test_set_resource_metadata_promotes_origin_dates():
225
- resource_brain = ResourceBrain("rid")
226
- basic = Basic()
227
- basic.created.seconds = 1
228
- basic.modified.seconds = 2
229
- origin = resources_pb2.Origin()
230
- origin.created.seconds = 3
231
- origin.modified.seconds = 4
232
-
233
- resource_brain.set_resource_metadata(basic, origin)
234
-
235
- assert resource_brain.brain.metadata.created.seconds == 3
236
- assert resource_brain.brain.metadata.modified.seconds == 4
237
-
238
-
239
- def test_set_resource_metadata_handles_timestamp_not_present():
240
- resource_brain = ResourceBrain("rid")
241
- basic = Basic()
242
- resource_brain.set_resource_metadata(basic, None)
243
- created = resource_brain.brain.metadata.created.seconds
244
- modified = resource_brain.brain.metadata.modified.seconds
245
- assert created > 0
246
- assert modified > 0
247
- assert modified >= created
@@ -1,74 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
-
21
- import uuid
22
-
23
- from nucliadb.ingest.orm.brain import FIELD_PARAGRAPH_ID, FIELD_VECTOR_ID, ResourceBrain
24
- from nucliadb_protos import utils_pb2
25
-
26
-
27
- def test_apply_field_vectors_for_matryoshka_embeddings():
28
- STORED_VECTOR_DIMENSION = 100
29
- MATRYOSHKA_DIMENSION = 10
30
-
31
- rid = uuid.uuid4().hex
32
- field_id = uuid.uuid4().hex
33
- vectors = utils_pb2.VectorObject(
34
- vectors=utils_pb2.Vectors(
35
- vectors=[
36
- utils_pb2.Vector(
37
- start=0,
38
- end=10,
39
- start_paragraph=0,
40
- end_paragraph=10,
41
- vector=[1.0] * STORED_VECTOR_DIMENSION,
42
- )
43
- ]
44
- )
45
- )
46
- paragraph_key = FIELD_PARAGRAPH_ID.format(
47
- rid=rid,
48
- field_id=field_id,
49
- paragraph_start=0,
50
- paragraph_end=10,
51
- )
52
- vector_key = FIELD_VECTOR_ID.format(
53
- rid=rid,
54
- field_id=field_id,
55
- index=0,
56
- vector_start=0,
57
- vector_end=10,
58
- )
59
-
60
- brain = ResourceBrain(rid=rid)
61
- brain.apply_field_vectors(field_id, vectors, matryoshka_vector_dimension=None)
62
- vector = (
63
- brain.brain.paragraphs[field_id].paragraphs[paragraph_key].sentences[vector_key]
64
- )
65
- assert len(vector.vector) == STORED_VECTOR_DIMENSION
66
-
67
- brain = ResourceBrain(rid=rid)
68
- brain.apply_field_vectors(
69
- field_id, vectors, matryoshka_vector_dimension=MATRYOSHKA_DIMENSION
70
- )
71
- vector = (
72
- brain.brain.paragraphs[field_id].paragraphs[paragraph_key].sentences[vector_key]
73
- )
74
- assert len(vector.vector) == MATRYOSHKA_DIMENSION
@@ -1,131 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
-
20
- from unittest.mock import AsyncMock, MagicMock, Mock, patch
21
-
22
- import pytest
23
-
24
- from nucliadb.common.cluster.settings import settings as cluster_settings
25
- from nucliadb.ingest.orm.exceptions import ResourceNotIndexable
26
- from nucliadb.ingest.orm.processor import Processor, validate_indexable_resource
27
- from nucliadb_protos import noderesources_pb2
28
-
29
-
30
- @pytest.fixture()
31
- def txn():
32
- yield AsyncMock()
33
-
34
-
35
- @pytest.fixture()
36
- def driver(txn):
37
- mock = MagicMock()
38
- mock.transaction.return_value.__aenter__.return_value = txn
39
- yield mock
40
-
41
-
42
- @pytest.fixture()
43
- def sm():
44
- mock = AsyncMock()
45
- mock.add_resource = AsyncMock()
46
- with patch("nucliadb.ingest.orm.processor.get_shard_manager", return_value=mock):
47
- yield mock
48
-
49
-
50
- @pytest.fixture()
51
- def processor(driver, sm):
52
- yield Processor(driver, None)
53
-
54
-
55
- @pytest.fixture()
56
- def resource():
57
- mock = MagicMock()
58
- mock.set_basic = AsyncMock()
59
- yield mock
60
-
61
-
62
- @pytest.fixture()
63
- def kb():
64
- mock = MagicMock(kbid="kbid")
65
- mock.get_resource_shard_id = AsyncMock()
66
- mock.get_resource_shard = AsyncMock()
67
- yield mock
68
-
69
-
70
- async def test_commit_slug(processor: Processor, txn, resource):
71
- another_txn = Mock()
72
- resource.txn = another_txn
73
- resource.set_slug = AsyncMock()
74
-
75
- await processor.commit_slug(resource)
76
-
77
- resource.set_slug.assert_awaited_once()
78
- txn.commit.assert_awaited_once()
79
- assert resource.txn is another_txn
80
-
81
-
82
- async def test_mark_resource_error(processor: Processor, txn, resource, kb, sm):
83
- await processor._mark_resource_error(kb, resource, partition="partition", seqid=1)
84
- txn.commit.assert_called_once()
85
- resource.set_basic.assert_awaited_once()
86
- sm.add_resource.assert_awaited_once_with(
87
- kb.get_resource_shard.return_value,
88
- resource.indexer.brain,
89
- 1,
90
- partition="partition",
91
- kb="kbid",
92
- )
93
-
94
-
95
- async def test_mark_resource_error_handle_error(
96
- processor: Processor, kb, resource, txn
97
- ):
98
- resource.set_basic.side_effect = Exception("test")
99
- await processor._mark_resource_error(kb, resource, partition="partition", seqid=1)
100
- txn.commit.assert_not_called()
101
-
102
-
103
- async def test_mark_resource_error_skip_no_shard(
104
- processor: Processor, resource, driver, kb, txn
105
- ):
106
- kb.get_resource_shard.return_value = None
107
- await processor._mark_resource_error(kb, resource, partition="partition", seqid=1)
108
- txn.commit.assert_not_called()
109
-
110
-
111
- async def test_mark_resource_error_skip_no_resource(
112
- processor: Processor, kb, driver, txn
113
- ):
114
- await processor._mark_resource_error(kb, None, partition="partition", seqid=1)
115
- txn.commit.assert_not_called()
116
-
117
-
118
- def test_validate_indexable_resource():
119
- resource = noderesources_pb2.Resource()
120
- resource.paragraphs["test"].paragraphs["test"].sentences["test"].vector.append(1.0)
121
- validate_indexable_resource(resource)
122
-
123
-
124
- def test_validate_indexable_resource_throws_error_for_max():
125
- resource = noderesources_pb2.Resource()
126
- for i in range(cluster_settings.max_resource_paragraphs + 1):
127
- resource.paragraphs["test"].paragraphs[f"test{i}"].sentences[
128
- "test"
129
- ].vector.append(1.0)
130
- with pytest.raises(ResourceNotIndexable):
131
- validate_indexable_resource(resource)