nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,78 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import asyncio
21
- import time
22
-
23
- from httpx import AsyncClient
24
- from nucliadb_protos.knowledgebox_pb2 import KnowledgeBoxID
25
- from nucliadb_protos.writer_pb2 import GetEntitiesGroupRequest, GetEntitiesGroupResponse
26
- from nucliadb_protos.writer_pb2_grpc import WriterStub
27
-
28
- from nucliadb.writer.api.v1.router import KB_PREFIX
29
- from nucliadb_models.entities import (
30
- CreateEntitiesGroupPayload,
31
- UpdateEntitiesGroupPayload,
32
- )
33
-
34
-
35
- async def create_entities_group(
36
- writer: AsyncClient, kbid: str, payload: CreateEntitiesGroupPayload
37
- ):
38
- resp = await writer.post(
39
- f"/{KB_PREFIX}/{kbid}/entitiesgroups",
40
- content=payload.json(),
41
- )
42
- return resp
43
-
44
-
45
- async def update_entities_group(
46
- writer: AsyncClient,
47
- kbid: str,
48
- group: str,
49
- payload: UpdateEntitiesGroupPayload,
50
- ):
51
- resp = await writer.patch(
52
- f"/{KB_PREFIX}/{kbid}/entitiesgroup/{group}",
53
- content=payload.json(),
54
- )
55
- return resp
56
-
57
-
58
- async def delete_entities_group(writer: AsyncClient, kbid: str, group: str):
59
- resp = await writer.delete(f"/{KB_PREFIX}/{kbid}/entitiesgroup/{group}")
60
- return resp
61
-
62
-
63
- async def wait_until_entity(
64
- ingest: WriterStub, kbid: str, group: str, entity: str, timeout: float = 1.0
65
- ):
66
- start = time.time()
67
- found = False
68
- while not found:
69
- response: GetEntitiesGroupResponse = await ingest.GetEntitiesGroup( # type: ignore
70
- GetEntitiesGroupRequest(kb=KnowledgeBoxID(uuid=kbid), group=group)
71
- )
72
- found = entity in response.group.entities
73
- assert (
74
- time.time() - start < timeout
75
- ), "Timeout while waiting for entity {group}/{entity}"
76
-
77
- if not found:
78
- await asyncio.sleep(0.1)
@@ -1,60 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
-
21
- from fastapi import Request
22
- from fastapi_versioning import version # type: ignore
23
-
24
- from nucliadb.train.api.utils import get_kb_partitions
25
- from nucliadb.train.api.v1.router import KB_PREFIX, api
26
- from nucliadb_models.resource import NucliaDBRoles
27
- from nucliadb_models.trainset import TrainSetPartitions
28
- from nucliadb_utils.authentication import requires_one
29
-
30
-
31
- @api.get(
32
- f"/{KB_PREFIX}/{{kbid}}/check/labeler/{{labelset}}",
33
- tags=["Train"],
34
- status_code=200,
35
- name="Return check status of labels",
36
- response_model=TrainSetPartitions,
37
- )
38
- @version(1)
39
- @requires_one([NucliaDBRoles.READER])
40
- async def check_labeler(
41
- request: Request, kbid: str, labelset: str
42
- ) -> TrainSetPartitions:
43
- all_keys = await get_kb_partitions(kbid)
44
- return TrainSetPartitions(partitions=all_keys)
45
-
46
-
47
- @api.get(
48
- f"/{KB_PREFIX}/{{kbid}}/check/ner/{{entitygroup}}",
49
- tags=["Train"],
50
- status_code=200,
51
- name="Return check status of entities",
52
- response_model=TrainSetPartitions,
53
- )
54
- @version(1)
55
- @requires_one([NucliaDBRoles.READER])
56
- async def check_ner(
57
- request: Request, kbid: str, entitygroup: str
58
- ) -> TrainSetPartitions:
59
- all_keys = await get_kb_partitions(kbid)
60
- return TrainSetPartitions(partitions=all_keys)
@@ -1,19 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
@@ -1,29 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- pytest_plugins = [
21
- "pytest_docker_fixtures",
22
- "nucliadb.ingest.tests.fixtures",
23
- "nucliadb.tests.fixtures",
24
- "nucliadb.train.tests.fixtures",
25
- "nucliadb_utils.tests.nats",
26
- "nucliadb_utils.tests.conftest",
27
- "nucliadb_utils.tests.gcs",
28
- "nucliadb_utils.tests.s3",
29
- ]
@@ -1,342 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import asyncio
21
- import uuid
22
- from datetime import datetime
23
-
24
- import aiohttp
25
- import pytest
26
- from grpc import aio
27
- from nucliadb_protos.knowledgebox_pb2 import EntitiesGroup, Label, LabelSet
28
- from nucliadb_protos.resources_pb2 import (
29
- ExtractedTextWrapper,
30
- FieldComputedMetadataWrapper,
31
- FieldID,
32
- FieldType,
33
- Paragraph,
34
- Position,
35
- Sentence,
36
- )
37
- from nucliadb_protos.writer_pb2 import (
38
- BrokerMessage,
39
- SetEntitiesRequest,
40
- SetLabelsRequest,
41
- )
42
- from nucliadb_protos.writer_pb2_grpc import WriterStub
43
-
44
- from nucliadb.ingest.orm.entities import EntitiesManager
45
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
46
- from nucliadb.ingest.orm.processor import Processor
47
- from nucliadb.ingest.orm.resource import KB_RESOURCE_SLUG_BASE
48
- from nucliadb.standalone.settings import Settings
49
- from nucliadb.train.utils import start_shard_manager, stop_shard_manager
50
- from nucliadb_utils.tests import free_port
51
- from nucliadb_utils.utilities import clear_global_cache, get_storage
52
-
53
-
54
- @pytest.fixture(scope="function")
55
- async def train_rest_api(nucliadb: Settings): # type: ignore
56
- async with aiohttp.ClientSession(
57
- headers={"X-NUCLIADB-ROLES": "READER"},
58
- base_url=f"http://localhost:{nucliadb.http_port}",
59
- ) as client:
60
- yield client
61
-
62
-
63
- @pytest.fixture(scope="function")
64
- async def writer_rest_api(nucliadb: Settings): # type: ignore
65
- async with aiohttp.ClientSession(
66
- headers={"X-NUCLIADB-ROLES": "WRITER"},
67
- base_url=f"http://localhost:{nucliadb.http_port}",
68
- ) as client:
69
- yield client
70
-
71
-
72
- @pytest.fixture(scope="function")
73
- async def knowledgebox_with_labels(nucliadb_grpc: WriterStub, knowledgebox: str):
74
- slr = SetLabelsRequest()
75
- slr.kb.uuid = knowledgebox
76
- slr.id = "labelset_paragraphs"
77
- slr.labelset.kind.append(LabelSet.LabelSetKind.PARAGRAPHS)
78
- l1 = Label(title="label_machine")
79
- l2 = Label(title="label_user")
80
- slr.labelset.labels.append(l1)
81
- slr.labelset.labels.append(l2)
82
- await nucliadb_grpc.SetLabels(slr) # type: ignore
83
-
84
- slr = SetLabelsRequest()
85
- slr.kb.uuid = knowledgebox
86
- slr.id = "labelset_resources"
87
- slr.labelset.kind.append(LabelSet.LabelSetKind.RESOURCES)
88
- l1 = Label(title="label_machine")
89
- l2 = Label(title="label_user")
90
- slr.labelset.labels.append(l1)
91
- slr.labelset.labels.append(l2)
92
- await nucliadb_grpc.SetLabels(slr) # type: ignore
93
-
94
- yield knowledgebox
95
-
96
-
97
- @pytest.fixture(scope="function")
98
- async def knowledgebox_with_entities(nucliadb_grpc: WriterStub, knowledgebox: str):
99
- ser = SetEntitiesRequest()
100
- ser.kb.uuid = knowledgebox
101
- ser.group = "PERSON"
102
- ser.entities.title = "PERSON"
103
- ser.entities.entities["Ramon"].value = "Ramon"
104
- ser.entities.entities["Eudald Camprubi"].value = "Eudald Camprubi"
105
- ser.entities.entities["Carmen Iniesta"].value = "Carmen Iniesta"
106
- ser.entities.entities["el Super Fran"].value = "el Super Fran"
107
- await nucliadb_grpc.SetEntities(ser) # type: ignore
108
-
109
- ser = SetEntitiesRequest()
110
- ser.kb.uuid = knowledgebox
111
- ser.group = "ORG"
112
- ser.entities.title = "ORG"
113
- ser.entities.entities["Nuclia"].value = "Nuclia"
114
- ser.entities.entities["Debian"].value = "Debian"
115
- ser.entities.entities["Generalitat de Catalunya"].value = "Generalitat de Catalunya"
116
- await nucliadb_grpc.SetEntities(ser) # type: ignore
117
-
118
- yield knowledgebox
119
-
120
-
121
- def broker_simple_resource(knowledgebox: str, number: int) -> BrokerMessage:
122
- rid = str(uuid.uuid4())
123
- message1: BrokerMessage = BrokerMessage(
124
- kbid=knowledgebox,
125
- uuid=rid,
126
- slug=str(number),
127
- type=BrokerMessage.AUTOCOMMIT,
128
- )
129
-
130
- message1.basic.slug = str(number)
131
- message1.basic.icon = "text/plain"
132
- message1.basic.title = f"MY TITLE {number}"
133
- message1.basic.summary = "Summary of document"
134
- message1.basic.thumbnail = "doc"
135
- message1.basic.layout = "default"
136
- message1.basic.metadata.useful = True
137
- message1.basic.metadata.language = "es"
138
- message1.basic.created.FromDatetime(datetime.utcnow())
139
- message1.basic.modified.FromDatetime(datetime.utcnow())
140
- message1.texts[
141
- "field1"
142
- ].body = "My lovely field with some information from Barcelona. This will be the good field. \n\n And then we will go Manresa." # noqa
143
- message1.source = BrokerMessage.MessageSource.WRITER
144
- return message1
145
-
146
-
147
- def broker_processed_resource(knowledgebox, number, rid) -> BrokerMessage:
148
- message2: BrokerMessage = BrokerMessage(
149
- kbid=knowledgebox,
150
- uuid=rid,
151
- slug=str(number),
152
- type=BrokerMessage.AUTOCOMMIT,
153
- )
154
- message2.basic.metadata.useful = True
155
- message2.basic.metadata.language = "es"
156
- message2.source = BrokerMessage.MessageSource.PROCESSOR
157
-
158
- field1_if = FieldID()
159
- field1_if.field = "field1"
160
- field1_if.field_type = FieldType.TEXT
161
-
162
- title_if = FieldID()
163
- title_if.field = "title"
164
- title_if.field_type = FieldType.GENERIC
165
-
166
- etw = ExtractedTextWrapper()
167
- etw.field.CopyFrom(field1_if)
168
- etw.body.text = "My lovely field with some information from Barcelona. This will be the good field. \n\n And then we will go Manresa. I miss Manresa!" # noqa
169
- message2.extracted_text.append(etw)
170
-
171
- fcmw = FieldComputedMetadataWrapper()
172
- fcmw.field.CopyFrom(field1_if)
173
- p1 = Paragraph()
174
- p1.start = 0
175
- p1.end = 82
176
- s1 = Sentence()
177
- s1.start = 0
178
- s1.end = 52
179
- p1.sentences.append(s1)
180
- s1 = Sentence()
181
- s1.start = 53
182
- s1.end = 82
183
- p1.sentences.append(s1)
184
-
185
- p2 = Paragraph()
186
- p2.start = 84
187
- p2.end = 130
188
-
189
- s1 = Sentence()
190
- s1.start = 84
191
- s1.end = 130
192
- p2.sentences.append(s1)
193
-
194
- fcmw.metadata.metadata.paragraphs.append(p1)
195
- fcmw.metadata.metadata.paragraphs.append(p2)
196
-
197
- # Add a ner with positions
198
- fcmw.metadata.metadata.ner.update(
199
- {
200
- "Barcelona": "CITY",
201
- "Manresa": "CITY",
202
- }
203
- )
204
- fcmw.metadata.metadata.positions["CITY/Barcelona"].entity = "Barcelona"
205
- fcmw.metadata.metadata.positions["CITY/Barcelona"].position.append(
206
- Position(start=43, end=52)
207
- )
208
- message2.field_metadata.append(fcmw)
209
-
210
- etw = ExtractedTextWrapper()
211
- etw.field.CopyFrom(title_if)
212
- etw.body.text = f"MY TITLE {number}"
213
- message2.extracted_text.append(etw)
214
-
215
- fcmw = FieldComputedMetadataWrapper()
216
- fcmw.field.CopyFrom(title_if)
217
- p1 = Paragraph()
218
- p1.start = 0
219
- p1.end = len(etw.body.text)
220
- s1 = Sentence()
221
- s1.start = 0
222
- s1.end = len(etw.body.text)
223
- p1.sentences.append(s1)
224
- fcmw.metadata.metadata.paragraphs.append(p1)
225
- message2.field_metadata.append(fcmw)
226
- message2.basic.metadata.language = "es"
227
-
228
- return message2
229
-
230
-
231
- # This fixtures should be deleted once grpc train interface is removed
232
-
233
-
234
- @pytest.fixture(scope="function")
235
- async def test_pagination_resources(
236
- processor: Processor, knowledgebox_ingest, test_settings_train
237
- ):
238
- """
239
- Create a set of resources with only basic information to test pagination
240
- """
241
- amount = 10
242
-
243
- # Create resources
244
- for i in range(1, amount + 1):
245
- message = broker_simple_resource(knowledgebox_ingest, i)
246
- await processor.process(message=message, seqid=-1, transaction_check=False)
247
-
248
- message = broker_processed_resource(knowledgebox_ingest, i, message.uuid)
249
- await processor.process(message=message, seqid=-1, transaction_check=False)
250
- # Give processed data some time to reach the node
251
-
252
- from time import time
253
-
254
- from nucliadb.common.maindb.utils import get_driver
255
-
256
- driver = get_driver()
257
-
258
- t0 = time()
259
-
260
- while time() - t0 < 30: # wait max 30 seconds for it
261
- txn = await driver.begin()
262
- count = 0
263
- async for key in txn.keys(
264
- match=KB_RESOURCE_SLUG_BASE.format(kbid=knowledgebox_ingest), count=-1
265
- ):
266
- count += 1
267
- await txn.abort()
268
- if count == amount:
269
- break
270
- print(f"got {count}, retrying")
271
- await asyncio.sleep(2)
272
-
273
- # Add entities
274
- storage = await get_storage()
275
- txn = await driver.begin()
276
- kb = KnowledgeBox(txn, storage, kbid=knowledgebox_ingest)
277
- entities_manager = EntitiesManager(kb, txn)
278
- entities = EntitiesGroup()
279
- entities.entities["entity1"].value = "PERSON"
280
- await entities_manager.set_entities_group_force("group1", entities)
281
-
282
- # Add ontology
283
- labelset = LabelSet()
284
- labelset.title = "ls1"
285
- label = Label()
286
- label_title = "label1"
287
- label.title = label_title
288
- labelset.labels.append(label)
289
- await kb.set_labelset(label_title, labelset)
290
- await txn.commit()
291
-
292
- yield knowledgebox_ingest
293
-
294
-
295
- @pytest.fixture(scope="function")
296
- def test_settings_train(cache, gcs, fake_node, maindb_driver): # type: ignore
297
- from nucliadb.train.settings import settings
298
- from nucliadb_utils.settings import (
299
- FileBackendConfig,
300
- running_settings,
301
- storage_settings,
302
- )
303
-
304
- running_settings.debug = False
305
- print(f"Redis ready at {maindb_driver.url}")
306
-
307
- old_file_backend = storage_settings.file_backend
308
- old_gcs_endpoint_url = storage_settings.gcs_endpoint_url
309
- old_gcs_bucket = storage_settings.gcs_bucket
310
- old_grpc_port = settings.grpc_port
311
-
312
- storage_settings.gcs_endpoint_url = gcs
313
- storage_settings.file_backend = FileBackendConfig.GCS
314
- storage_settings.gcs_bucket = "test_{kbid}"
315
- settings.grpc_port = free_port()
316
- yield
317
- storage_settings.file_backend = old_file_backend
318
- storage_settings.gcs_endpoint_url = old_gcs_endpoint_url
319
- storage_settings.gcs_bucket = old_gcs_bucket
320
- settings.grpc_port = old_grpc_port
321
-
322
-
323
- @pytest.fixture(scope="function")
324
- async def train_api(test_settings_train: None, local_files): # type: ignore
325
- from nucliadb.train.utils import start_train_grpc, stop_train_grpc
326
-
327
- await start_shard_manager()
328
- await start_train_grpc("testing_train")
329
- yield
330
- await stop_train_grpc()
331
- await stop_shard_manager()
332
-
333
-
334
- @pytest.fixture(scope="function")
335
- async def train_client(train_api): # type: ignore
336
- from nucliadb_protos.train_pb2_grpc import TrainStub
337
-
338
- from nucliadb.train.settings import settings
339
-
340
- channel = aio.insecure_channel(f"localhost:{settings.grpc_port}")
341
- yield TrainStub(channel)
342
- clear_global_cache()
@@ -1,122 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
-
20
- import asyncio
21
-
22
- import aiohttp
23
- import pytest
24
- from nucliadb_protos.dataset_pb2 import FieldClassificationBatch, TaskType, TrainSet
25
- from nucliadb_protos.knowledgebox_pb2 import Label, LabelSet
26
- from nucliadb_protos.writer_pb2 import SetLabelsRequest
27
- from nucliadb_protos.writer_pb2_grpc import WriterStub
28
-
29
- from nucliadb.tests.utils import inject_message
30
- from nucliadb.tests.utils.broker_messages import BrokerMessageBuilder
31
- from nucliadb.train import API_PREFIX
32
- from nucliadb.train.api.v1.router import KB_PREFIX
33
- from nucliadb.train.tests.utils import get_batches_from_train_response_stream
34
-
35
-
36
- @pytest.mark.asyncio
37
- @pytest.mark.parametrize("knowledgebox", ["STABLE", "EXPERIMENTAL"], indirect=True)
38
- async def test_generator_field_classification(
39
- train_rest_api: aiohttp.ClientSession,
40
- knowledgebox_with_labels: str,
41
- ):
42
- kbid = knowledgebox_with_labels
43
-
44
- async with train_rest_api.get(
45
- f"/{API_PREFIX}/v1/{KB_PREFIX}/{kbid}/trainset"
46
- ) as partitions:
47
- assert partitions.status == 200
48
- data = await partitions.json()
49
- assert len(data["partitions"]) == 1
50
- partition_id = data["partitions"][0]
51
-
52
- trainset = TrainSet()
53
- trainset.type = TaskType.FIELD_CLASSIFICATION
54
- trainset.batch_size = 2
55
-
56
- tests = [
57
- (["labelset_resources"], 2, 4),
58
- # 2 fields
59
- (["labelset_resources/label_user"], 1, 2),
60
- # unused label
61
- (["labelset_resources/label_alien"], 0, 0),
62
- # non existent
63
- (["nonexistent_labelset"], 0, 0),
64
- ]
65
-
66
- for labels, expected_batches, expected_total in tests:
67
- trainset.filter.ClearField("labels")
68
- trainset.filter.labels.extend(labels) # type: ignore
69
-
70
- async with train_rest_api.post(
71
- f"/{API_PREFIX}/v1/{KB_PREFIX}/{kbid}/trainset/{partition_id}",
72
- data=trainset.SerializeToString(),
73
- ) as response:
74
- assert response.status == 200
75
- batches = []
76
- total = 0
77
- async for batch in get_batches_from_train_response_stream(
78
- response, FieldClassificationBatch
79
- ):
80
- batches.append(batch)
81
- total += len(batch.data)
82
- assert len(batches) == expected_batches
83
- assert total == expected_total
84
-
85
-
86
- @pytest.fixture(scope="function")
87
- @pytest.mark.asyncio
88
- async def knowledgebox_with_labels(nucliadb_grpc: WriterStub, knowledgebox: str):
89
- slr = SetLabelsRequest()
90
- slr.kb.uuid = knowledgebox
91
- slr.id = "labelset_paragraphs"
92
- slr.labelset.kind.append(LabelSet.LabelSetKind.PARAGRAPHS)
93
- slr.labelset.labels.append(Label(title="label_machine"))
94
- slr.labelset.labels.append(Label(title="label_user"))
95
- slr.labelset.labels.append(Label(title="label_alien"))
96
- await nucliadb_grpc.SetLabels(slr) # type: ignore
97
-
98
- slr = SetLabelsRequest()
99
- slr.kb.uuid = knowledgebox
100
- slr.id = "labelset_resources"
101
- slr.labelset.kind.append(LabelSet.LabelSetKind.RESOURCES)
102
- slr.labelset.labels.append(Label(title="label_machine"))
103
- slr.labelset.labels.append(Label(title="label_user"))
104
- slr.labelset.labels.append(Label(title="label_alien"))
105
- await nucliadb_grpc.SetLabels(slr) # type: ignore
106
-
107
- bmb = BrokerMessageBuilder(kbid=knowledgebox)
108
- bmb.with_title("First resource")
109
- bmb.with_summary("First summary")
110
- bmb.with_resource_labels("labelset_resources", ["label_user"])
111
- bm = bmb.build()
112
- await inject_message(nucliadb_grpc, bm)
113
-
114
- bmb = BrokerMessageBuilder(kbid=knowledgebox)
115
- bmb.with_title("Second resource")
116
- bmb.with_summary("Second summary")
117
- bmb.with_resource_labels("labelset_resources", ["label_machine"])
118
- bm = bmb.build()
119
- await inject_message(nucliadb_grpc, bm)
120
-
121
- await asyncio.sleep(0.1)
122
- yield knowledgebox