nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -1,740 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- import asyncio
21
- import base64
22
- import io
23
- import os
24
- from typing import Callable
25
-
26
- import pytest
27
- from httpx import AsyncClient
28
- from nucliadb_protos.resources_pb2 import FieldID, FieldType
29
- from nucliadb_protos.writer_pb2 import BrokerMessage
30
-
31
- from nucliadb.common import datamanagers
32
- from nucliadb.writer.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREFIX
33
- from nucliadb.writer.api.v1.upload import maybe_b64decode
34
- from nucliadb.writer.tus import TUSUPLOAD, UPLOAD, get_storage_manager
35
- from nucliadb_models.resource import NucliaDBRoles
36
- from nucliadb_utils import const
37
- from nucliadb_utils.utilities import get_storage, get_transaction_utility
38
-
39
- ASSETS_PATH = os.path.dirname(__file__) + "/assets"
40
-
41
-
42
- @pytest.mark.asyncio
43
- async def test_knowledgebox_file_tus_options(
44
- writer_api: Callable[[list[NucliaDBRoles]], AsyncClient], knowledgebox_writer: str
45
- ):
46
- client: AsyncClient
47
- async with writer_api([NucliaDBRoles.WRITER]) as client:
48
- resp = await client.options(
49
- f"/{KB_PREFIX}/{knowledgebox_writer}/resource/xxx/file/xxx/{TUSUPLOAD}/xxx"
50
- )
51
- assert resp.status_code == 204
52
- assert resp.headers["tus-resumable"] == "1.0.0"
53
- assert resp.headers["tus-version"] == "1.0.0"
54
- assert resp.headers["tus-extension"] == "creation-defer-length"
55
-
56
- resp = await client.options(
57
- f"/{KB_PREFIX}/{knowledgebox_writer}/resource/xxx/file/xxx/{TUSUPLOAD}"
58
- )
59
- assert resp.status_code == 204
60
- assert resp.headers["tus-resumable"] == "1.0.0"
61
- assert resp.headers["tus-version"] == "1.0.0"
62
- assert resp.headers["tus-extension"] == "creation-defer-length"
63
-
64
- resp = await client.options(f"/{KB_PREFIX}/{knowledgebox_writer}/{TUSUPLOAD}")
65
- assert resp.status_code == 204
66
- assert resp.headers["tus-resumable"] == "1.0.0"
67
- assert resp.headers["tus-version"] == "1.0.0"
68
- assert resp.headers["tus-extension"] == "creation-defer-length"
69
-
70
- resp = await client.options(
71
- f"/{KB_PREFIX}/{knowledgebox_writer}/{TUSUPLOAD}/xxx"
72
- )
73
- assert resp.status_code == 204
74
- assert resp.headers["tus-resumable"] == "1.0.0"
75
- assert resp.headers["tus-version"] == "1.0.0"
76
- assert resp.headers["tus-extension"] == "creation-defer-length"
77
-
78
-
79
- @pytest.mark.asyncio
80
- async def test_knowledgebox_file_tus_upload_root(writer_api, knowledgebox_writer):
81
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
82
- language = base64.b64encode(b"ca").decode()
83
- filename = base64.b64encode(b"image.jpg").decode()
84
- md5 = base64.b64encode(b"7af0916dba8b70e29d99e72941923529").decode()
85
- resp = await client.post(
86
- f"/{KB_PREFIX}/{knowledgebox_writer}/{TUSUPLOAD}",
87
- headers={
88
- "tus-resumable": "1.0.0",
89
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
90
- "content-type": "image/jpg",
91
- "upload-defer-length": "1",
92
- },
93
- )
94
- assert resp.status_code == 201
95
- url = resp.headers["location"]
96
-
97
- offset = 0
98
-
99
- # We upload a file that spans across more than one chunk
100
- min_chunk_size = get_storage_manager().min_upload_size
101
- raw_bytes = b"x" * min_chunk_size + b"y" * 500
102
- io_bytes = io.BytesIO(raw_bytes)
103
- data = io_bytes.read(min_chunk_size)
104
- while data != b"":
105
- resp = await client.head(url)
106
- assert resp.headers["Upload-Length"] == f"0"
107
- assert resp.headers["Upload-Offset"] == f"{offset}"
108
-
109
- headers = {
110
- "upload-offset": f"{offset}",
111
- "content-length": f"{len(data)}",
112
- }
113
- is_last_chunk = len(data) < min_chunk_size
114
- if is_last_chunk:
115
- headers["upload-length"] = f"{offset + len(data)}"
116
-
117
- resp = await client.patch(
118
- url,
119
- content=data,
120
- headers=headers,
121
- )
122
- offset += len(data)
123
- data = io_bytes.read(min_chunk_size)
124
-
125
- assert resp.headers["Tus-Upload-Finished"] == "1"
126
-
127
- transaction = get_transaction_utility()
128
-
129
- sub = await transaction.js.pull_subscribe(
130
- const.Streams.INGEST.subject.format(partition="1"), "auto"
131
- )
132
- msgs = await sub.fetch(1)
133
-
134
- writer = BrokerMessage()
135
- writer.ParseFromString(msgs[0].data)
136
- await msgs[0].ack()
137
-
138
- path = resp.headers["ndb-field"]
139
- field = path.split("/")[-1]
140
- rid = path.split("/")[-3]
141
- assert writer.uuid == rid
142
- assert writer.basic.icon == "image/jpg"
143
- assert writer.basic.title == "image.jpg"
144
- assert writer.files[field].language == "ca"
145
- assert writer.files[field].file.size == len(raw_bytes)
146
- assert writer.files[field].file.filename == "image.jpg"
147
- assert writer.files[field].file.md5 == "7af0916dba8b70e29d99e72941923529"
148
-
149
- storage = await get_storage()
150
- data = await storage.downloadbytes(
151
- bucket=writer.files[field].file.bucket_name,
152
- key=writer.files[field].file.uri,
153
- )
154
- assert len(data.read()) == len(raw_bytes)
155
- await asyncio.sleep(1)
156
-
157
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
158
- resp = await client.post(
159
- f"/{KB_PREFIX}/{knowledgebox_writer}/{TUSUPLOAD}",
160
- headers={
161
- "tus-resumable": "1.0.0",
162
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
163
- "content-type": "image/jpg",
164
- "upload-defer-length": "1",
165
- },
166
- )
167
- assert resp.status_code == 409
168
-
169
-
170
- @pytest.mark.asyncio
171
- async def test_knowledgebox_file_upload_root(
172
- writer_api: Callable[[list[NucliaDBRoles]], AsyncClient],
173
- knowledgebox_writer: str,
174
- ):
175
- async with writer_api([NucliaDBRoles.WRITER]) as client:
176
- with open(f"{ASSETS_PATH}/image001.jpg", "rb") as f:
177
- resp = await client.post(
178
- f"/{KB_PREFIX}/{knowledgebox_writer}/{UPLOAD}",
179
- content=f.read(),
180
- headers={
181
- "content-type": "image/jpg",
182
- "X-MD5": "7af0916dba8b70e29d99e72941923529",
183
- },
184
- )
185
- assert resp.status_code == 201
186
-
187
- transaction = get_transaction_utility()
188
-
189
- assert transaction.js is not None
190
- sub = await transaction.js.pull_subscribe(
191
- const.Streams.INGEST.subject.format(partition="1"), "auto"
192
- )
193
- msgs = await sub.fetch(1)
194
- writer = BrokerMessage()
195
- writer.ParseFromString(msgs[0].data)
196
- await msgs[0].ack()
197
-
198
- body = resp.json()
199
- field = body["field_id"]
200
- rid = body["uuid"]
201
- assert writer.uuid == rid
202
- assert writer.basic.icon == "image/jpg"
203
- assert writer.files[field].file.size == 30472
204
-
205
- storage = await get_storage()
206
- data = await storage.downloadbytes(
207
- bucket=writer.files[field].file.bucket_name,
208
- key=writer.files[field].file.uri,
209
- )
210
- assert len(data.read()) == 30472
211
- await asyncio.sleep(1)
212
-
213
- async with writer_api([NucliaDBRoles.WRITER]) as client:
214
- with open(f"{ASSETS_PATH}/image001.jpg", "rb") as f:
215
- resp = await client.post(
216
- f"/{KB_PREFIX}/{knowledgebox_writer}/{UPLOAD}",
217
- content=f.read(),
218
- headers={
219
- "content-type": "image/jpg",
220
- "X-MD5": "7af0916dba8b70e29d99e72941923529",
221
- },
222
- )
223
- assert resp.status_code == 409
224
-
225
-
226
- @pytest.mark.asyncio
227
- async def test_knowledgebox_file_upload_root_headers(
228
- writer_api: Callable[[list[NucliaDBRoles]], AsyncClient],
229
- knowledgebox_writer: str,
230
- ):
231
- async with writer_api([NucliaDBRoles.WRITER]) as client:
232
- filename = base64.b64encode(b"image.jpg").decode()
233
- with open(f"{ASSETS_PATH}/image001.jpg", "rb") as f:
234
- resp = await client.post(
235
- f"/{KB_PREFIX}/{knowledgebox_writer}/{UPLOAD}",
236
- content=f.read(),
237
- headers={
238
- "X-FILENAME": filename,
239
- "X-LANGUAGE": "ca",
240
- "X-MD5": "7af0916dba8b70e29d99e72941923529",
241
- "content-type": "image/jpg",
242
- },
243
- )
244
- assert resp.status_code == 201
245
-
246
- transaction = get_transaction_utility()
247
-
248
- assert transaction.js is not None
249
- sub = await transaction.js.pull_subscribe(
250
- const.Streams.INGEST.subject.format(partition="1"), "auto"
251
- )
252
- msgs = await sub.fetch(1)
253
- writer = BrokerMessage()
254
- writer.ParseFromString(msgs[0].data)
255
- await msgs[0].ack()
256
-
257
- body = resp.json()
258
- field = body["field_id"]
259
- rid = body["uuid"]
260
- assert writer.uuid == rid
261
- assert writer.basic.icon == "image/jpg"
262
- assert writer.basic.title == "image.jpg"
263
- assert writer.files[field].language == "ca"
264
- assert writer.files[field].file.size == 30472
265
-
266
- storage = await get_storage()
267
- data = await storage.downloadbytes(
268
- bucket=writer.files[field].file.bucket_name,
269
- key=writer.files[field].file.uri,
270
- )
271
- assert len(data.read()) == 30472
272
-
273
-
274
- @pytest.mark.asyncio
275
- async def test_knowledgebox_file_tus_upload_field(
276
- writer_api, knowledgebox_writer, resource
277
- ):
278
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
279
- language = base64.b64encode(b"ca").decode()
280
- filename = base64.b64encode(b"image.jpg").decode()
281
- md5 = base64.b64encode(b"7af0916dba8b70e29d99e72941923529").decode()
282
-
283
- resp = await client.post(
284
- f"/{KB_PREFIX}/{knowledgebox_writer}/resource/invalidresource/file/field1/{TUSUPLOAD}",
285
- headers={
286
- "tus-resumable": "1.0.0",
287
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
288
- "content-type": "image/jpg",
289
- "upload-defer-length": "1",
290
- },
291
- )
292
- assert resp.status_code == 404
293
- await asyncio.sleep(1)
294
-
295
- resp = await client.post(
296
- f"/{KB_PREFIX}/{knowledgebox_writer}/resource/{resource}/file/field1/{TUSUPLOAD}",
297
- headers={
298
- "tus-resumable": "1.0.0",
299
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
300
- "content-type": "image/jpg",
301
- "upload-defer-length": "1",
302
- },
303
- )
304
- assert resp.status_code == 201
305
- url = resp.headers["location"]
306
-
307
- offset = 0
308
- # We upload a file that spans across more than one chunk
309
- min_chunk_size = get_storage_manager().min_upload_size
310
- raw_bytes = b"x" * min_chunk_size + b"y" * 500
311
- io_bytes = io.BytesIO(raw_bytes)
312
- data = io_bytes.read(min_chunk_size)
313
- while data != b"":
314
- resp = await client.head(url)
315
-
316
- assert resp.headers["Upload-Length"] == f"0"
317
- assert resp.headers["Upload-Offset"] == f"{offset}"
318
-
319
- headers = {
320
- "upload-offset": f"{offset}",
321
- "content-length": f"{len(data)}",
322
- }
323
- is_last_chunk = len(data) < min_chunk_size
324
- if is_last_chunk:
325
- headers["upload-length"] = f"{offset + len(data)}"
326
-
327
- resp = await client.patch(
328
- url,
329
- content=data,
330
- headers=headers,
331
- )
332
- assert resp.status_code == 200
333
- offset += len(data)
334
- data = io_bytes.read(min_chunk_size)
335
-
336
- assert resp.headers["Tus-Upload-Finished"] == "1"
337
-
338
- transaction = get_transaction_utility()
339
-
340
- sub = await transaction.js.pull_subscribe(
341
- const.Streams.INGEST.subject.format(partition="1"), "auto"
342
- )
343
- msgs = await sub.fetch(2)
344
-
345
- writer = BrokerMessage()
346
- writer.ParseFromString(msgs[1].data)
347
- await msgs[1].ack()
348
-
349
- path = resp.headers["ndb-field"]
350
- field = path.split("/")[-1]
351
- rid = path.split("/")[-3]
352
- assert writer.uuid == rid
353
- assert writer.basic.icon == "image/jpg"
354
- assert writer.basic.title == ""
355
- assert writer.files[field].language == "ca"
356
- assert writer.files[field].file.size == len(raw_bytes)
357
- assert writer.files[field].file.filename == "image.jpg"
358
- assert writer.files[field].file.md5 == "7af0916dba8b70e29d99e72941923529"
359
-
360
- storage = await get_storage()
361
- data = await storage.downloadbytes(
362
- bucket=writer.files[field].file.bucket_name,
363
- key=writer.files[field].file.uri,
364
- )
365
- assert len(data.read()) == len(raw_bytes)
366
-
367
-
368
- @pytest.mark.asyncio
369
- async def test_knowledgebox_file_upload_field_headers(
370
- writer_api, knowledgebox_writer, resource
371
- ):
372
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
373
- filename = "image.jpg"
374
- encoded_filename = base64.b64encode(filename.encode()).decode()
375
- with open(f"{ASSETS_PATH}/image001.jpg", "rb") as f:
376
- resp = await client.post(
377
- f"/{KB_PREFIX}/{knowledgebox_writer}/resource/{resource}/file/field1/{UPLOAD}",
378
- content=f.read(),
379
- headers={
380
- "X-FILENAME": encoded_filename,
381
- "X-LANGUAGE": "ca",
382
- "X-MD5": "7af0916dba8b70e29d99e72941923529",
383
- "content-type": "image/jpg",
384
- },
385
- )
386
- assert resp.status_code == 201
387
-
388
- transaction = get_transaction_utility()
389
-
390
- sub = await transaction.js.pull_subscribe(
391
- const.Streams.INGEST.subject.format(partition="1"), "auto"
392
- )
393
- msgs = await sub.fetch(2)
394
- writer = BrokerMessage()
395
- writer.ParseFromString(msgs[1].data)
396
- await msgs[1].ack()
397
-
398
- body = resp.json()
399
- field = body["field_id"]
400
- rid = body["uuid"]
401
- assert writer.uuid == rid
402
- assert writer.basic.icon == "image/jpg"
403
- assert writer.basic.title == ""
404
- assert writer.files[field].language == "ca"
405
- assert writer.files[field].file.size == 30472
406
- assert writer.files[field].file.filename == filename
407
-
408
- storage = await get_storage()
409
- data = await storage.downloadbytes(
410
- bucket=writer.files[field].file.bucket_name,
411
- key=writer.files[field].file.uri,
412
- )
413
- assert len(data.read()) == 30472
414
-
415
-
416
- @pytest.mark.asyncio
417
- async def test_knowledgebox_file_upload_field_sync(
418
- writer_api, knowledgebox_writer, resource
419
- ):
420
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
421
- filename = "image.jpg"
422
- with open(f"{ASSETS_PATH}/image001.jpg", "rb") as f:
423
- resp = await client.post(
424
- f"/{KB_PREFIX}/{knowledgebox_writer}/resource/{resource}/file/field1/{UPLOAD}",
425
- content=f.read(),
426
- headers={
427
- "X-FILENAME": filename,
428
- "X-LANGUAGE": "ca",
429
- "X-MD5": "7af0916dba8b70e29d99e72941923529",
430
- "content-type": "image/jpg",
431
- },
432
- )
433
- assert resp.status_code == 201
434
-
435
- async with datamanagers.with_transaction(read_only=True) as txn:
436
- assert (
437
- await datamanagers.resources.has_field(
438
- txn,
439
- kbid=knowledgebox_writer,
440
- rid=resource,
441
- field_id=FieldID(field="field1", field_type=FieldType.FILE),
442
- )
443
- ) is True
444
-
445
-
446
- @pytest.mark.asyncio
447
- async def test_file_tus_upload_field_by_slug(writer_api, knowledgebox_writer, resource):
448
- kb = knowledgebox_writer
449
- rslug = "resource1"
450
-
451
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
452
- language = base64.b64encode(b"ca").decode()
453
- filename = base64.b64encode(b"image.jpg").decode()
454
- md5 = base64.b64encode(b"7af0916dba8b70e29d99e72941923529").decode()
455
- headers = {
456
- "tus-resumable": "1.0.0",
457
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
458
- "content-type": "image/jpg",
459
- "upload-defer-length": "1",
460
- }
461
-
462
- resp = await client.post(
463
- f"/{KB_PREFIX}/{kb}/slug/idonotexist/file/field1/{TUSUPLOAD}",
464
- headers=headers,
465
- )
466
- assert resp.status_code == 404
467
-
468
- resp = await client.post(
469
- f"/{KB_PREFIX}/{kb}/slug/{rslug}/file/field1/{TUSUPLOAD}",
470
- headers=headers,
471
- )
472
- assert resp.status_code == 201
473
- url = resp.headers["location"]
474
-
475
- # Check that we are using the slug for the whole file upload
476
- assert f"{RSLUG_PREFIX}/{rslug}" in url
477
-
478
- offset = 0
479
- min_chunk_size = get_storage_manager().min_upload_size
480
- raw_bytes = b"x" * min_chunk_size + b"y" * 500
481
- io_bytes = io.BytesIO(raw_bytes)
482
- data = io_bytes.read(min_chunk_size)
483
- while data != b"":
484
- resp = await client.head(url)
485
-
486
- assert resp.headers["Upload-Length"] == f"0"
487
- assert resp.headers["Upload-Offset"] == f"{offset}"
488
-
489
- headers = {
490
- "upload-offset": f"{offset}",
491
- "content-length": f"{len(data)}",
492
- }
493
- is_last_chunk = len(data) < min_chunk_size
494
- if is_last_chunk:
495
- headers["upload-length"] = f"{offset + len(data)}"
496
-
497
- resp = await client.patch(
498
- url,
499
- content=data,
500
- headers=headers,
501
- )
502
- assert resp.status_code == 200
503
- offset += len(data)
504
- data = io_bytes.read(min_chunk_size)
505
-
506
- assert resp.headers["Tus-Upload-Finished"] == "1"
507
-
508
- transaction = get_transaction_utility()
509
-
510
- sub = await transaction.js.pull_subscribe(
511
- const.Streams.INGEST.subject.format(partition="1"), "auto"
512
- )
513
- msgs = await sub.fetch(2)
514
-
515
- writer = BrokerMessage()
516
- writer.ParseFromString(msgs[1].data)
517
- await msgs[1].ack()
518
-
519
- path = resp.headers["ndb-field"]
520
- field = path.split("/")[-1]
521
- rid = path.split("/")[-3]
522
- assert writer.uuid == rid
523
- assert writer.basic.icon == "image/jpg"
524
- assert writer.basic.title == ""
525
- assert writer.files[field].language == "ca"
526
- assert writer.files[field].file.size == len(raw_bytes)
527
- assert writer.files[field].file.filename == "image.jpg"
528
- assert writer.files[field].file.md5 == "7af0916dba8b70e29d99e72941923529"
529
-
530
- storage = await get_storage()
531
- data = await storage.downloadbytes(
532
- bucket=writer.files[field].file.bucket_name,
533
- key=writer.files[field].file.uri,
534
- )
535
- assert len(data.read()) == len(raw_bytes)
536
-
537
-
538
- @pytest.mark.asyncio
539
- async def test_file_tus_upload_urls_field_by_resource_id(
540
- writer_api, knowledgebox_writer, resource
541
- ):
542
- kb = knowledgebox_writer
543
-
544
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
545
- language = base64.b64encode(b"ca").decode()
546
- filename = base64.b64encode(b"image.jpg").decode()
547
- md5 = base64.b64encode(b"7af0916dba8b70e29d99e72941923529").decode()
548
- headers = {
549
- "tus-resumable": "1.0.0",
550
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
551
- "content-type": "image/jpg",
552
- "upload-defer-length": "1",
553
- }
554
-
555
- resp = await client.post(
556
- f"/{KB_PREFIX}/{kb}/resource/idonotexist/file/field1/{TUSUPLOAD}",
557
- headers=headers,
558
- )
559
- assert resp.status_code == 404
560
-
561
- resp = await client.post(
562
- f"/{KB_PREFIX}/{kb}/resource/{resource}/file/field1/{TUSUPLOAD}",
563
- headers=headers,
564
- )
565
- assert resp.status_code == 201
566
- url = resp.headers["location"]
567
-
568
- # Check that we are using the resource for the whole file upload
569
- assert f"{RESOURCE_PREFIX}/{resource}" in url
570
-
571
- # Make sure the returned URL works
572
- resp = await client.head(url)
573
- assert resp.status_code == 200
574
-
575
- assert resp.headers["Upload-Length"] == "0"
576
- assert resp.headers["Upload-Offset"] == "0"
577
-
578
-
579
- @pytest.mark.asyncio
580
- async def test_multiple_tus_file_upload_tries(
581
- writer_api, knowledgebox_writer, resource
582
- ):
583
- kb = knowledgebox_writer
584
- rslug = "resource1"
585
-
586
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
587
- headers = {
588
- "tus-resumable": "1.0.0",
589
- "content-type": "image/jpg",
590
- "upload-defer-length": "1",
591
- }
592
-
593
- resp = await client.post(
594
- f"/{KB_PREFIX}/{kb}/slug/{rslug}/file/field1/{TUSUPLOAD}",
595
- headers=headers,
596
- )
597
- assert resp.status_code == 201
598
- url = resp.headers["location"]
599
-
600
- # Check that we are using the slug for the whole file upload
601
- assert f"{RSLUG_PREFIX}/{rslug}" in url
602
- resp = await client.patch(
603
- url,
604
- content=b"x" * 10000,
605
- headers={
606
- "upload-offset": "0",
607
- "content-length": "10000",
608
- "upload-length": "10000",
609
- },
610
- )
611
- assert resp.status_code == 200
612
-
613
- assert resp.headers["Tus-Upload-Finished"] == "1"
614
-
615
- # next one should work as well
616
- resp = await client.post(
617
- f"/{KB_PREFIX}/{kb}/slug/{rslug}/file/field1/{TUSUPLOAD}",
618
- headers=headers,
619
- )
620
- assert resp.status_code == 201
621
- url = resp.headers["location"]
622
-
623
- # Check that we are using the slug for the whole file upload
624
- assert f"{RSLUG_PREFIX}/{rslug}" in url
625
- resp = await client.patch(
626
- url,
627
- content=b"x" * 10000,
628
- headers={
629
- "upload-offset": "0",
630
- "content-length": "10000",
631
- "upload-length": "10000",
632
- },
633
- )
634
- assert resp.status_code == 200
635
-
636
- assert resp.headers["Tus-Upload-Finished"] == "1"
637
-
638
-
639
- @pytest.mark.asyncio
640
- async def test_file_upload_by_slug(writer_api, knowledgebox_writer):
641
- kb = knowledgebox_writer
642
- rslug = "myslug"
643
-
644
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
645
- resp = await client.post(
646
- f"/{KB_PREFIX}/{kb}/resources",
647
- json={
648
- "slug": rslug,
649
- },
650
- )
651
- assert str(resp.status_code).startswith("2")
652
-
653
- filename = "image.jpg"
654
- with open(f"{ASSETS_PATH}/image001.jpg", "rb") as f:
655
- resp = await client.post(
656
- f"/{KB_PREFIX}/{kb}/{RSLUG_PREFIX}/{rslug}/file/file1/{UPLOAD}",
657
- content=f.read(),
658
- headers={
659
- "X-FILENAME": filename,
660
- "content-type": "image/jpg",
661
- "X-MD5": "7af0916dba8b70e29d99e72941923529",
662
- },
663
- )
664
- assert resp.status_code == 201
665
-
666
- transaction = get_transaction_utility()
667
-
668
- sub = await transaction.js.pull_subscribe(
669
- const.Streams.INGEST.subject.format(partition="1"), "auto"
670
- )
671
- msgs = await sub.fetch(2)
672
-
673
- writer = BrokerMessage()
674
- writer.ParseFromString(msgs[-1].data)
675
- await msgs[-1].ack()
676
-
677
- body = resp.json()
678
- field = body["field_id"]
679
- rid = body["uuid"]
680
-
681
- assert writer.uuid == rid
682
- assert writer.basic.icon == "image/jpg"
683
- assert writer.files[field].file.size == 30472
684
- assert writer.files[field].file.filename == filename
685
-
686
- storage = await get_storage()
687
- data = await storage.downloadbytes(
688
- bucket=writer.files[field].file.bucket_name,
689
- key=writer.files[field].file.uri,
690
- )
691
- assert len(data.read()) == 30472
692
-
693
-
694
- def test_maybe_b64decode():
695
- something = "something"
696
- something_encoded = base64.b64encode(something.encode())
697
- assert maybe_b64decode(something_encoded) == something
698
- assert maybe_b64decode(something) == something
699
-
700
-
701
- @pytest.mark.asyncio
702
- async def test_tus_validates_intermediate_chunks_length(
703
- writer_api, knowledgebox_writer
704
- ):
705
- async with writer_api(roles=[NucliaDBRoles.WRITER]) as client:
706
- language = base64.b64encode(b"ca").decode()
707
- filename = base64.b64encode(b"image.jpg").decode()
708
- md5 = base64.b64encode(b"7af0916dba8b70e29d99e72941923529").decode()
709
- resp = await client.post(
710
- f"/{KB_PREFIX}/{knowledgebox_writer}/{TUSUPLOAD}",
711
- headers={
712
- "tus-resumable": "1.0.0",
713
- "upload-metadata": f"filename {filename},language {language},md5 {md5}",
714
- "content-type": "image/jpg",
715
- "upload-defer-length": "1",
716
- },
717
- )
718
- assert resp.status_code == 201
719
- url = resp.headers["location"]
720
- # We upload a chunk that is smaller than the minimum chunk size
721
- min_chunk_size = get_storage_manager().min_upload_size
722
- raw_bytes = b"x" * min_chunk_size + b"y" * 500
723
- io_bytes = io.BytesIO(raw_bytes)
724
- chunk = io_bytes.read(min_chunk_size - 10)
725
-
726
- resp = await client.head(url)
727
-
728
- headers = {
729
- "upload-offset": f"0",
730
- "content-length": f"{len(chunk)}",
731
- }
732
- resp = await client.patch(
733
- url,
734
- content=chunk,
735
- headers=headers,
736
- )
737
- assert resp.status_code == 412
738
- assert resp.json()["detail"].startswith(
739
- "Intermediate chunks cannot be smaller than"
740
- )