nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -1,433 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- from __future__ import annotations
21
-
22
- import asyncio
23
- import contextlib
24
- import logging
25
- import random
26
- from typing import Any, List, Optional, Union
27
-
28
- import backoff
29
-
30
- from nucliadb.common.maindb.driver import (
31
- DEFAULT_BATCH_SCAN_LIMIT,
32
- DEFAULT_SCAN_LIMIT,
33
- Driver,
34
- Transaction,
35
- )
36
- from nucliadb.common.maindb.exceptions import ConflictError
37
- from nucliadb_telemetry import metrics
38
-
39
- try:
40
- from tikv_client import asynchronous # type: ignore
41
-
42
- TiKV = True
43
- except ImportError: # pragma: no cover
44
- TiKV = False
45
-
46
-
47
- class LeaderNotFoundError(Exception):
48
- """
49
- Raised when the tikv client raises an exception indicating that the leader of a region is not found.
50
- This is a transient error and the operation should be retried.
51
- """
52
-
53
- pass
54
-
55
-
56
- class PdClusterTimeout(Exception):
57
- """
58
- Raised with PD cluster fails to respond
59
- """
60
-
61
- pass
62
-
63
-
64
- tikv_observer = metrics.Observer(
65
- "tikv_client",
66
- labels={"type": ""},
67
- error_mappings={
68
- "conflict_error": ConflictError,
69
- "timeout_error": TimeoutError,
70
- "leader_not_found_error": LeaderNotFoundError,
71
- },
72
- )
73
- logger = logging.getLogger(__name__)
74
-
75
-
76
- class TiKVDataLayer:
77
- def __init__(
78
- self, connection: Union[asynchronous.RawClient, asynchronous.Transaction]
79
- ):
80
- self.connection = connection
81
-
82
- async def abort(self):
83
- with tikv_observer({"type": "rollback"}):
84
- try:
85
- await self.connection.rollback()
86
- except Exception:
87
- logger.exception("Error rolling back transaction")
88
-
89
- async def commit(self):
90
- with tikv_observer({"type": "commit"}), self.tikv_error_handler():
91
- await self.connection.commit()
92
-
93
- async def batch_get(self, keys: list[str]) -> list[Optional[bytes]]:
94
- bytes_keys: list[bytes] = [x.encode() for x in keys]
95
- with tikv_observer({"type": "batch_get"}), self.tikv_error_handler():
96
- output = {}
97
- for key, value in await self.connection.batch_get(bytes_keys):
98
- output[key.decode()] = value
99
- return [output.get(key) for key in keys]
100
-
101
- @backoff.on_exception(
102
- backoff.expo,
103
- (TimeoutError, LeaderNotFoundError),
104
- jitter=backoff.random_jitter,
105
- max_tries=2,
106
- )
107
- async def get(self, key: str) -> Optional[bytes]:
108
- with tikv_observer({"type": "get"}), self.tikv_error_handler():
109
- return await self.connection.get(key.encode())
110
-
111
- @contextlib.contextmanager
112
- def tikv_error_handler(self):
113
- """
114
- The tikv_client library does not provide specific exceptions and simply
115
- raises generic Exception class with different error strings. That forces
116
- us to parse the error string to determine the type of error...
117
- """
118
- try:
119
- yield
120
- except Exception as exc:
121
- exc_text = str(exc)
122
- if "WriteConflict" in exc_text:
123
- raise ConflictError(exc_text) from exc
124
- elif "4-DEADLINE_EXCEEDED" in exc_text:
125
- raise TimeoutError(exc_text) from exc
126
- elif "Leader of region" in exc_text and "not found" in exc_text:
127
- raise LeaderNotFoundError(exc_text) from exc
128
- else:
129
- raise
130
-
131
- async def set(self, key: str, value: bytes) -> None:
132
- with tikv_observer({"type": "put"}), self.tikv_error_handler():
133
- await self.connection.put(key.encode(), value)
134
-
135
- async def delete(self, key: str) -> None:
136
- with tikv_observer({"type": "delete"}), self.tikv_error_handler():
137
- await self.connection.delete(key.encode())
138
-
139
- async def keys(
140
- self,
141
- match: str,
142
- count: int = DEFAULT_SCAN_LIMIT,
143
- include_start: bool = True,
144
- ):
145
- """
146
- Get keys from tikv, up to a configurable limit.
147
-
148
- Use -1 as the count of objects keep iterating in batches
149
- until all matching keys are retrieved.
150
- With any other count, only up to count keys will be returned.
151
- """
152
- get_all_keys = count == -1
153
- limit = DEFAULT_BATCH_SCAN_LIMIT if get_all_keys else count
154
- start_key = match.encode()
155
- _include_start = include_start
156
-
157
- while True:
158
- with tikv_observer({"type": "scan_keys"}), self.tikv_error_handler():
159
- keys = await self.connection.scan_keys(
160
- start=start_key,
161
- end=None,
162
- limit=limit,
163
- include_start=_include_start,
164
- )
165
- for key in keys:
166
- str_key = key.decode()
167
- if str_key.startswith(match):
168
- yield str_key
169
- else:
170
- break
171
- else:
172
- if len(keys) == limit and get_all_keys:
173
- # If all keys were requested and it may exist
174
- # some more keys to retrieve
175
- start_key = keys[-1]
176
- _include_start = False
177
- continue
178
-
179
- # If not all keys were requested
180
- # or the for loop found an unmatched key
181
- break
182
-
183
- async def count(self, match: str) -> int:
184
- """
185
- Count the number of keys that match the given prefix
186
- as efficiently as possible with the available API.
187
- """
188
- original_match = match.encode()
189
- start_key = original_match
190
- _include_start = True
191
- batch_size = 5000
192
-
193
- value = 0
194
- while True:
195
- with tikv_observer({"type": "scan_keys"}), self.tikv_error_handler():
196
- keys = await self.connection.scan_keys(
197
- start=start_key,
198
- end=None,
199
- limit=batch_size,
200
- include_start=_include_start,
201
- )
202
- if len(keys) == 0:
203
- break
204
-
205
- if not keys[-1].startswith(original_match):
206
- # done counting this range, find the correct size of the match
207
- # with a binary search and break out
208
- left, right = 0, len(keys) - 1
209
- result_index = 0
210
- match_found = False
211
- while left <= right:
212
- mid = left + (right - left) // 2
213
-
214
- if keys[mid].startswith(original_match):
215
- match_found = True
216
- left = mid + 1 # Move to the right half
217
- result_index = mid
218
- else:
219
- right = mid - 1 # Move to the left half
220
- if match_found:
221
- value += result_index + 1
222
- break
223
- else:
224
- value += len(keys)
225
-
226
- if len(keys) == batch_size:
227
- start_key = keys[-1]
228
- _include_start = False
229
- continue
230
- else:
231
- # done counting
232
- break
233
- return value
234
-
235
-
236
class TiKVTransaction(Transaction):
    """Read/write transaction that forwards each operation to a ``TiKVDataLayer``.

    ``open`` starts as ``True`` and is set to ``False`` by ``abort`` and
    ``commit``; every other operation asserts that it is still set.
    """

    driver: TiKVDriver

    def __init__(self, txn: Any, driver: TiKVDriver):
        self.txn = txn
        self.driver = driver
        self.open = True
        self.data_layer = TiKVDataLayer(txn)

    async def abort(self):
        """Abort through the data layer; a no-op once the transaction is closed."""
        if self.open:
            await self.data_layer.abort()
            self.open = False

    async def commit(self):
        """Commit through the data layer and mark the transaction as closed."""
        assert self.open
        await self.data_layer.commit()
        self.open = False

    async def batch_get(self, keys: list[str]) -> list[Optional[bytes]]:
        """Fetch several keys at once via the data layer."""
        assert self.open
        values = await self.data_layer.batch_get(keys)
        return values

    @backoff.on_exception(
        backoff.expo,
        (TimeoutError, LeaderNotFoundError),
        jitter=backoff.random_jitter,
        max_tries=2,
    )
    async def get(self, key: str) -> Optional[bytes]:
        """Fetch one key; the decorator retries on timeout / missing leader."""
        assert self.open
        value = await self.data_layer.get(key)
        return value

    async def set(self, key: str, value: bytes) -> None:
        """Write ``value`` under ``key`` via the data layer."""
        assert self.open
        return await self.data_layer.set(key, value)

    async def delete(self, key: str) -> None:
        """Delete ``key`` via the data layer."""
        assert self.open
        return await self.data_layer.delete(key)

    async def keys(
        self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
    ):
        """Yield keys for ``match``, scanning through a separate snapshot.

        The scan does not use ``self.txn``: a snapshot is requested from one of
        the driver's connection holders and wrapped in its own data layer.
        """
        assert self.open
        # XXX must have connection outside of current txn
        holder = self.driver.get_connection_holder()
        snapshot_layer = TiKVDataLayer(await holder.get_snapshot())

        async for found in snapshot_layer.keys(match, count, include_start):
            yield found

    async def count(self, match: str) -> int:
        """Return the data layer's count for ``match``."""
        assert self.open
        total = await self.data_layer.count(match)
        return total
293
-
294
-
295
class ReadOnlyTiKVTransaction(Transaction):
    """Transaction view over a snapshot: reads are delegated, writes are refused.

    ``commit``, ``set`` and ``delete`` always raise ``Exception``; the read
    operations assert that ``open`` is still ``True``.
    """

    driver: TiKVDriver

    def __init__(self, connection: asynchronous.Snapshot, driver: TiKVDriver):
        self.driver = driver
        self.open = True
        self.connection = connection
        self.data_layer = TiKVDataLayer(connection)

    async def abort(self):
        """Mark the transaction as closed; nothing is sent to the store."""
        # Read only transactions are implemented as snapshots, which
        # are read only and isolated, and they don't need to be aborted.
        self.open = False

    async def commit(self):
        """Always raises: there is nothing to commit in read only mode."""
        raise Exception("Cannot commit transaction in read only mode")

    async def batch_get(self, keys: list[str]) -> list[Optional[bytes]]:
        """Fetch several keys at once via the data layer."""
        assert self.open
        values = await self.data_layer.batch_get(keys)
        return values

    async def get(self, key: str) -> Optional[bytes]:
        """Fetch one key via the data layer."""
        assert self.open
        value = await self.data_layer.get(key)
        return value

    async def set(self, key: str, value: bytes) -> None:
        """Always raises: writes are not allowed."""
        raise Exception("Cannot set in read only transaction")

    async def delete(self, key: str) -> None:
        """Always raises: deletes are not allowed."""
        raise Exception("Cannot delete in read only transaction")

    async def keys(
        self, match: str, count: int = DEFAULT_SCAN_LIMIT, include_start: bool = True
    ):
        """Yield the keys the data layer produces for ``match``."""
        assert self.open
        async for found in self.data_layer.keys(match, count, include_start):
            yield found

    async def count(self, match: str) -> int:
        """Return the data layer's count for ``match``."""
        assert self.open
        total = await self.data_layer.count(match)
        return total
336
-
337
-
338
class ConnectionHolder:
    """Owns one ``TransactionClient`` and re-creates it when an operation fails.

    ``connect_lock`` is held while a reconnect is running; callers that find
    it locked wait for it to be released before touching the client.
    """

    _txn_connection: asynchronous.TransactionClient

    def __init__(self, url: list[str]):
        self.connect_lock = asyncio.Lock()
        self.url = url

    @backoff.on_exception(
        backoff.expo,
        PdClusterTimeout,
        jitter=backoff.random_jitter,
        max_tries=3,
    )
    async def initialize(self) -> None:
        """Connect to ``self.url`` and store the resulting client.

        An error whose text contains "PD cluster failed to respond" is
        re-raised as ``PdClusterTimeout`` (the exception the decorator retries
        on); any other error propagates unchanged.
        """
        try:
            client = await asynchronous.TransactionClient.connect(self.url)
        except Exception as exc:
            if "PD cluster failed to respond" not in str(exc):
                raise
            raise PdClusterTimeout from exc
        self._txn_connection = client

    async def _wait_while_reconnecting(self) -> None:
        """Block until a reconnect that is currently in progress has finished."""
        if self.connect_lock.locked():  # pragma: no cover
            async with self.connect_lock:
                pass

    async def get_snapshot(
        self, timestamp: Optional[float] = None, retried: bool = False
    ) -> asynchronous.Snapshot:
        """Return a snapshot at ``timestamp`` (current timestamp when ``None``).

        On failure the connection is re-initialized and the call is repeated
        once with ``retried=True``; a failure of that second call is raised.
        """
        await self._wait_while_reconnecting()
        try:
            if timestamp is None:
                with tikv_observer({"type": "current_timestamp"}):
                    timestamp = await self._txn_connection.current_timestamp()
            return self._txn_connection.snapshot(timestamp, pessimistic=False)
        except Exception:
            if not retried:
                logger.exception(
                    "Error getting snapshot for tikv. Retrying once and then failing."
                )
                await self.reinitialize()
                return await self.get_snapshot(timestamp, retried=True)
            raise

    async def begin_transaction(self) -> asynchronous.Transaction:
        """Begin a non-pessimistic transaction, reconnecting once on failure."""
        await self._wait_while_reconnecting()
        try:
            # pessimistic=False means faster but more conflicts
            with tikv_observer({"type": "begin"}):
                return await self._txn_connection.begin(pessimistic=False)
        except Exception:
            logger.exception(
                "Error getting transaction for tikv. Retrying once and then failing."
            )
            await self.reinitialize()
            return await self._txn_connection.begin(pessimistic=False)

    async def reinitialize(self) -> None:
        """Re-run ``initialize`` unless another caller is already doing so."""
        reconnect_in_progress = self.connect_lock.locked()
        async with self.connect_lock:
            if reconnect_in_progress:
                # wait for lock and then just continue because someone else is
                # establishing the connection
                return
            logger.warning("Reconnecting to TiKV")
            await self.initialize()
405
-
406
-
407
class TiKVDriver(Driver):
    """Driver that hands out transactions from a pool of ``ConnectionHolder``s."""

    def __init__(self, url: List[str], pool_size: int = 3):
        """Store the connection settings; the pool is filled by ``initialize``.

        Raises:
            ImportError: when the module-level ``TiKV`` flag is ``False``.
        """
        if TiKV is False:
            raise ImportError("TiKV is not installed")
        self.pool_size = pool_size
        self.url = url
        self.pool: list[ConnectionHolder] = []

    async def initialize(self):
        """Create ``pool_size`` holders and connect them one after another."""
        holders = []
        for _ in range(self.pool_size):
            holders.append(ConnectionHolder(self.url))
        self.pool = holders
        for holder in holders:
            await holder.reinitialize()

    async def finalize(self):
        """Empty the pool."""
        self.pool.clear()

    def get_connection_holder(self) -> ConnectionHolder:
        """Return a randomly chosen holder from the pool."""
        return random.choice(self.pool)

    async def begin(
        self, read_only: bool = False
    ) -> Union[TiKVTransaction, ReadOnlyTiKVTransaction]:
        """Begin a transaction on a randomly chosen connection holder.

        ``read_only`` is accepted but not consulted: the snapshot-backed
        branch below is disabled, so a ``TiKVTransaction`` is always returned.
        """
        holder = self.get_connection_holder()
        # if read_only:
        #     return ReadOnlyTiKVTransaction(await conn.get_snapshot(), self)
        # else:
        raw_txn = await holder.begin_transaction()
        return TiKVTransaction(raw_txn, self)
@@ -1,58 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- from nucliadb_protos.resources_pb2 import CloudFile, FieldLayout
21
-
22
- from nucliadb.ingest.fields.base import Field
23
- from nucliadb_utils.storages.storage import StorageField
24
-
25
-
26
class NotTheSameFormat(Exception):
    """Raised when an incoming layout's format differs from the stored one."""
28
-
29
-
30
class Layout(Field):
    """Field of type ``"l"`` whose stored value is a ``FieldLayout`` message."""

    pbklass = FieldLayout
    value: FieldLayout
    type: str = "l"

    async def set_value(self, payload: FieldLayout):
        """Merge ``payload`` into the stored layout and persist the result.

        Blocks listed in ``payload.body.deleted_blocks`` are removed from the
        stored layout, then every block in ``payload.body.blocks`` is copied
        over it. A block file that the storage reports as needing a move is
        normalized into this field's storage location first.

        Raises:
            NotTheSameFormat: a layout is already stored and its format is
                different from ``payload.format``.
        """
        # Diff support
        stored = await self.get_value()
        if stored is None:
            stored = FieldLayout()
            stored.format = payload.format
        elif stored and payload.format != stored.format:
            raise NotTheSameFormat()

        for removed in payload.body.deleted_blocks:
            if removed in stored.body.blocks:
                del stored.body.blocks[removed]

        for ident, pbblock in payload.body.blocks.items():
            must_move = self.storage.needs_move(pbblock.file, self.kbid)
            if must_move:
                sf: StorageField = self.storage.layout_field(
                    self.kbid, self.uuid, self.id, ident
                )
                cf: CloudFile = await self.storage.normalize_binary(pbblock.file, sf)
                pbblock.file.CopyFrom(cf)
            stored.body.blocks[ident].CopyFrom(pbblock)

        await self.db_set_value(stored)

    async def get_value(self) -> FieldLayout:
        """Return whatever ``db_get_value`` yields for this field."""
        current = await self.db_get_value()
        return current
@@ -1,30 +0,0 @@
1
- # Copyright (C) 2021 Bosutech XXI S.L.
2
- #
3
- # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
- # For commercial licensing, contact us at info@nuclia.com.
5
- #
6
- # AGPL:
7
- # This program is free software: you can redistribute it and/or modify
8
- # it under the terms of the GNU Affero General Public License as
9
- # published by the Free Software Foundation, either version 3 of the
10
- # License, or (at your option) any later version.
11
- #
12
- # This program is distributed in the hope that it will be useful,
13
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
- # GNU Affero General Public License for more details.
16
- #
17
- # You should have received a copy of the GNU Affero General Public License
18
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
- #
20
- pytest_plugins = [
21
- "pytest_docker_fixtures",
22
- "nucliadb_utils.tests.nats",
23
- "nucliadb.ingest.tests.fixtures",
24
- "nucliadb.tests.fixtures",
25
- "nucliadb.tests.tikv",
26
- "nucliadb_utils.tests.conftest",
27
- "nucliadb_utils.tests.gcs",
28
- "nucliadb_utils.tests.s3",
29
- "nucliadb_telemetry.tests.telemetry",
30
- ]