nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,10 +18,11 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import urllib.parse
21
- from typing import Optional
21
+ from typing import Sequence
22
22
 
23
+ from nucliadb.ingest.processing import PushPayload
24
+ from nucliadb_models.text import PushTextFormat, Text
23
25
  from nucliadb_protos.resources_pb2 import (
24
- Basic,
25
26
  ExtractedTextWrapper,
26
27
  FieldComputedMetadataWrapper,
27
28
  FieldType,
@@ -30,35 +31,6 @@ from nucliadb_protos.resources_pb2 import (
30
31
  )
31
32
  from nucliadb_protos.writer_pb2 import BrokerMessage
32
33
 
33
- from nucliadb.common.maindb.driver import Transaction
34
- from nucliadb.ingest.processing import PushPayload
35
- from nucliadb.ingest.settings import settings as ingest_settings
36
- from nucliadb_models.text import PushTextFormat, Text
37
-
38
- KB_RESOURCE_BASIC_FS = "/kbs/{kbid}/r/{uuid}/basic" # Only used on FS driver
39
- KB_RESOURCE_BASIC = "/kbs/{kbid}/r/{uuid}"
40
-
41
-
42
- async def set_basic(txn: Transaction, kbid: str, uuid: str, basic: Basic):
43
- if ingest_settings.driver == "local":
44
- await txn.set(
45
- KB_RESOURCE_BASIC_FS.format(kbid=kbid, uuid=uuid),
46
- basic.SerializeToString(),
47
- )
48
- else:
49
- await txn.set(
50
- KB_RESOURCE_BASIC.format(kbid=kbid, uuid=uuid),
51
- basic.SerializeToString(),
52
- )
53
-
54
-
55
- async def get_basic(txn: Transaction, kbid: str, uuid: str) -> Optional[bytes]:
56
- if ingest_settings.driver == "local":
57
- raw_basic = await txn.get(KB_RESOURCE_BASIC_FS.format(kbid=kbid, uuid=uuid))
58
- else:
59
- raw_basic = await txn.get(KB_RESOURCE_BASIC.format(kbid=kbid, uuid=uuid))
60
- return raw_basic
61
-
62
34
 
63
35
  def set_title(writer: BrokerMessage, toprocess: PushPayload, title: str):
64
36
  title = urllib.parse.unquote(title)
@@ -82,3 +54,25 @@ def set_title(writer: BrokerMessage, toprocess: PushPayload, title: str):
82
54
 
83
55
  def compute_paragraph_key(rid: str, paragraph_key: str) -> str:
84
56
  return paragraph_key.replace("N_RID", rid)
57
+
58
+
59
+ def choose_matryoshka_dimension(dimensions: Sequence[int]) -> int:
60
+ """Given a list of matryoshka embedding available dimensions, choose one to
61
+ set the vector dimension.
62
+ """
63
+ if len(dimensions) == 0:
64
+ raise ValueError("Can't choose matryoshka dimension from an empty list")
65
+
66
+ threshold = 2000
67
+ previous = None
68
+ for dimension in sorted(dimensions):
69
+ if dimension > threshold:
70
+ break
71
+ previous = dimension
72
+
73
+ if dimension > threshold:
74
+ if previous is None:
75
+ return dimension
76
+ else:
77
+ return previous
78
+ return dimension
@@ -38,9 +38,7 @@ def assign_partitions(settings: Settings):
38
38
  try:
39
39
  settings.replica_number = int(sts_values[-1])
40
40
  except Exception:
41
- logger.error(
42
- f"Could not extract replica number from hostname: {hostname}"
43
- )
41
+ logger.error(f"Could not extract replica number from hostname: {hostname}")
44
42
  pass
45
43
 
46
44
  if settings.replica_number == -1:
@@ -53,9 +51,5 @@ def assign_partitions(settings: Settings):
53
51
  # update settings AND Environment Varialbe (for this process and its childs) with partition list
54
52
  settings.partitions = partitions_list
55
53
  os.environ["PARTITIONS"] = json.dumps(partitions_list)
56
- logger.info(
57
- f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}"
58
- )
59
- logger.info(
60
- f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}"
61
- )
54
+ logger.info(f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}")
55
+ logger.info(f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}")
@@ -29,17 +29,22 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar
29
29
  import aiohttp
30
30
  import backoff
31
31
  import jwt
32
- from nucliadb_protos.resources_pb2 import CloudFile
33
- from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
34
32
  from pydantic import BaseModel, Field
35
33
 
36
34
  import nucliadb_models as models
37
35
  from nucliadb_models.resource import QueueType
36
+ from nucliadb_protos.resources_pb2 import CloudFile
37
+ from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
38
38
  from nucliadb_telemetry import metrics
39
39
  from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
40
- from nucliadb_utils.settings import nuclia_settings, storage_settings
40
+ from nucliadb_utils.settings import (
41
+ FileBackendConfig,
42
+ is_onprem_nucliadb,
43
+ nuclia_settings,
44
+ storage_settings,
45
+ )
41
46
  from nucliadb_utils.storages.storage import Storage
42
- from nucliadb_utils.utilities import Utility, set_utility
47
+ from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
43
48
 
44
49
  logger = logging.getLogger(__name__)
45
50
 
@@ -70,8 +75,8 @@ class Source(SourceValue, Enum): # type: ignore
70
75
 
71
76
 
72
77
  class ProcessingInfo(BaseModel):
73
- seqid: Optional[int]
74
- account_seq: Optional[int]
78
+ seqid: Optional[int] = None
79
+ account_seq: Optional[int] = None
75
80
  queue: Optional[QueueType] = None
76
81
 
77
82
 
@@ -96,9 +101,6 @@ class PushPayload(BaseModel):
96
101
  # Diff on Text Field
97
102
  textfield: dict[str, models.Text] = {}
98
103
 
99
- # Diff on a Layout Field
100
- layoutfield: dict[str, models.LayoutDiff] = {}
101
-
102
104
  # New conversations to process
103
105
  conversationfield: dict[str, models.PushConversation] = {}
104
106
 
@@ -111,11 +113,11 @@ class PushPayload(BaseModel):
111
113
  )
112
114
 
113
115
 
114
- class PushResponse(BaseModel):
115
- seqid: Optional[int] = None
116
-
117
-
118
116
  async def start_processing_engine():
117
+ processing_engine = get_utility(Utility.PROCESSING)
118
+ if processing_engine is not None:
119
+ return
120
+
119
121
  if nuclia_settings.dummy_processing:
120
122
  processing_engine = DummyProcessingEngine()
121
123
  else:
@@ -133,6 +135,43 @@ async def start_processing_engine():
133
135
  set_utility(Utility.PROCESSING, processing_engine)
134
136
 
135
137
 
138
+ async def stop_processing_engine():
139
+ utility = get_utility(Utility.PROCESSING)
140
+ if utility is not None:
141
+ await utility.finalize()
142
+ clean_utility(Utility.PROCESSING)
143
+
144
+
145
+ class ProcessingDriverType(Enum):
146
+ # XXX IMPORTANT XXX: Make sure the values are in sync with
147
+ # the ones defined in nuclia/learning/processing repository
148
+ GCS = 0
149
+ S3 = 1
150
+ LOCAL = 2
151
+
152
+
153
+ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> ProcessingDriverType:
154
+ """
155
+ Outputs a nuclia-internal backend driver identifier that is used by processing
156
+ to store the blobs of processed metadata in the right bucket folder.
157
+ """
158
+ if is_onprem_nucliadb():
159
+ # On-prem installations are always regarded as local storage from the processing perspective,
160
+ # as Nuclia processing engine will not have direct access to the storage.
161
+ return ProcessingDriverType.LOCAL
162
+
163
+ try:
164
+ return {
165
+ FileBackendConfig.GCS: ProcessingDriverType.GCS,
166
+ FileBackendConfig.S3: ProcessingDriverType.S3,
167
+ }[file_backend_driver]
168
+ except KeyError:
169
+ logger.error(
170
+ f"Not a valid file backend driver to processing, fallback to local: {file_backend_driver}"
171
+ )
172
+ return ProcessingDriverType.LOCAL
173
+
174
+
136
175
  class ProcessingEngine:
137
176
  def __init__(
138
177
  self,
@@ -143,52 +182,30 @@ class ProcessingEngine:
143
182
  onprem: Optional[bool] = False,
144
183
  nuclia_jwt_key: Optional[str] = None,
145
184
  days_to_keep: int = 3,
146
- driver: str = "gcs",
185
+ driver: FileBackendConfig = FileBackendConfig.GCS,
147
186
  ):
148
187
  self.nuclia_service_account = nuclia_service_account
149
188
  self.nuclia_zone = nuclia_zone
150
189
  if nuclia_public_url is not None:
151
- self.nuclia_public_url: Optional[str] = nuclia_public_url.format(
152
- zone=nuclia_zone
153
- )
190
+ self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
154
191
  else:
155
192
  self.nuclia_public_url = None
156
193
 
157
194
  self.onprem = onprem
158
195
  if self.onprem:
159
- self.nuclia_upload_url = (
160
- f"{self.nuclia_public_url}/api/v1/processing/upload"
161
- )
196
+ self.nuclia_upload_url = f"{self.nuclia_public_url}/api/v1/processing/upload"
162
197
  else:
163
- self.nuclia_upload_url = (
164
- f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
165
- )
166
- self.nuclia_internal_push = (
167
- f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
168
- )
198
+ self.nuclia_upload_url = f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
199
+ self.nuclia_internal_push = f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
169
200
  self.nuclia_internal_delete = (
170
201
  f"{nuclia_processing_cluster_url}/api/v1/internal/processing/requests"
171
202
  )
172
- self.nuclia_external_push_v2 = (
173
- f"{self.nuclia_public_url}/api/v1/processing/push"
174
- )
175
- self.nuclia_external_delete = (
176
- f"{self.nuclia_public_url}/api/v1/processing/requests"
177
- )
203
+ self.nuclia_external_push_v2 = f"{self.nuclia_public_url}/api/v1/processing/push"
204
+ self.nuclia_external_delete = f"{self.nuclia_public_url}/api/v1/processing/requests"
178
205
 
179
206
  self.nuclia_jwt_key = nuclia_jwt_key
180
207
  self.days_to_keep = days_to_keep
181
- if driver == "gcs":
182
- self.driver = 0
183
- elif driver == "s3":
184
- self.driver = 1
185
- elif driver in ("local", "pg"):
186
- self.driver = 2
187
- else:
188
- logger.error(
189
- f"Not a valid driver to processing, fallback to local: {driver}"
190
- )
191
- self.driver = 2
208
+ self.driver: ProcessingDriverType = to_processing_driver_type(driver)
192
209
  self._exit_stack = AsyncExitStack()
193
210
 
194
211
  async def initialize(self):
@@ -211,7 +228,7 @@ class ProcessingEngine:
211
228
  "iat": now,
212
229
  "md5": cf.md5,
213
230
  "source": 1, # To indicate that this file comes internally
214
- "driver": self.driver,
231
+ "driver": self.driver.value,
215
232
  "jti": uuid.uuid4().hex,
216
233
  "bucket_name": cf.bucket_name,
217
234
  "filename": cf.filename,
@@ -235,7 +252,7 @@ class ProcessingEngine:
235
252
  "iat": now,
236
253
  "md5": file.file.md5,
237
254
  "source": 1, # To indicate that this file comes internally
238
- "driver": self.driver,
255
+ "driver": self.driver.value,
239
256
  "jti": uuid.uuid4().hex,
240
257
  "bucket_name": file.file.bucket_name,
241
258
  "filename": file.file.filename,
@@ -310,9 +327,7 @@ class ProcessingEngine:
310
327
  max_tries=MAX_TRIES,
311
328
  )
312
329
  @processing_observer.wrap({"type": "file_field_upload_internal"})
313
- async def convert_internal_filefield_to_str(
314
- self, file: FieldFilePB, storage: Storage
315
- ) -> str:
330
+ async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
316
331
  """It's already an internal file that needs to be uploaded"""
317
332
  if self.onprem is False:
318
333
  # Upload the file to processing upload
@@ -321,9 +336,7 @@ class ProcessingEngine:
321
336
  headers = {}
322
337
  headers["X-PASSWORD"] = file.password
323
338
  headers["X-LANGUAGE"] = file.language
324
- headers["X-FILENAME"] = base64.b64encode(
325
- file.file.filename.encode()
326
- ).decode()
339
+ headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()
327
340
  headers["X-MD5"] = file.file.md5
328
341
  headers["CONTENT-TYPE"] = file.file.content_type
329
342
  if file.file.size:
@@ -331,9 +344,7 @@ class ProcessingEngine:
331
344
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
332
345
 
333
346
  iterator = storage.downloadbytescf_iterator(file.file)
334
- async with self.session.post(
335
- self.nuclia_upload_url, data=iterator, headers=headers
336
- ) as resp:
347
+ async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
337
348
  if resp.status == 200:
338
349
  jwttoken = await resp.text()
339
350
  elif resp.status == 402:
@@ -367,9 +378,7 @@ class ProcessingEngine:
367
378
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
368
379
 
369
380
  iterator = storage.downloadbytescf_iterator(cf)
370
- async with self.session.post(
371
- self.nuclia_upload_url, data=iterator, headers=headers
372
- ) as resp:
381
+ async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
373
382
  if resp.status == 200:
374
383
  jwttoken = await resp.text()
375
384
  elif resp.status == 402:
@@ -389,9 +398,7 @@ class ProcessingEngine:
389
398
  jitter=backoff.random_jitter,
390
399
  max_tries=MAX_TRIES,
391
400
  )
392
- async def send_to_process(
393
- self, item: PushPayload, partition: int
394
- ) -> ProcessingInfo:
401
+ async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
395
402
  op_type = "process_external" if self.onprem else "process_internal"
396
403
  with processing_observer({"type": op_type}):
397
404
  headers = {"CONTENT-TYPE": "application/json"}
@@ -399,15 +406,13 @@ class ProcessingEngine:
399
406
  # Upload the payload
400
407
  item.partition = partition
401
408
  resp = await self.session.post(
402
- url=self.nuclia_internal_push, data=item.json(), headers=headers
409
+ url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
403
410
  )
404
411
  else:
405
- headers.update(
406
- {"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"}
407
- )
412
+ headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
408
413
  # Upload the payload
409
414
  resp = await self.session.post(
410
- url=self.nuclia_external_push_v2, data=item.json(), headers=headers
415
+ url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
411
416
  )
412
417
  if resp.status == 200:
413
418
  data = await resp.json()
@@ -437,9 +442,7 @@ class ProcessingEngine:
437
442
  queue=QueueType(queue_type) if queue_type is not None else None,
438
443
  )
439
444
 
440
- async def delete_from_processing(
441
- self, *, kbid: str, resource_id: Optional[str] = None
442
- ) -> None:
445
+ async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
443
446
  """
444
447
  Delete a resource from processing. This prevents inflight resources from being processed
445
448
  and wasting resources.
@@ -469,7 +472,7 @@ class ProcessingEngine:
469
472
 
470
473
  class DummyProcessingEngine(ProcessingEngine):
471
474
  def __init__(self):
472
- self.calls: list[list[Any]] = [] # type: ignore
475
+ self.calls: list[list[Any]] = []
473
476
  self.values = defaultdict(list)
474
477
  self.onprem = True
475
478
 
@@ -491,9 +494,7 @@ class DummyProcessingEngine(ProcessingEngine):
491
494
  self.values["convert_external_filefield_to_str"].append(file_field)
492
495
  return f"convert_external_filefield_to_str,{index}"
493
496
 
494
- async def convert_internal_filefield_to_str(
495
- self, file: FieldFilePB, storage: Storage
496
- ) -> str:
497
+ async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
497
498
  self.calls.append([file, storage])
498
499
  index = len(self.values["convert_internal_filefield_to_str"])
499
500
  self.values["convert_internal_filefield_to_str"].append([file, storage])
@@ -505,16 +506,10 @@ class DummyProcessingEngine(ProcessingEngine):
505
506
  self.values["convert_internal_cf_to_str"].append([cf, storage])
506
507
  return f"convert_internal_cf_to_str,{index}"
507
508
 
508
- async def send_to_process(
509
- self, item: PushPayload, partition: int
510
- ) -> ProcessingInfo:
509
+ async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
511
510
  self.calls.append([item, partition])
512
511
  self.values["send_to_process"].append([item, partition])
513
- return ProcessingInfo(
514
- seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED
515
- )
512
+ return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
516
513
 
517
- async def delete_from_processing(
518
- self, *, kbid: str, resource_id: Optional[str] = None
519
- ) -> None:
514
+ async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
520
515
  self.calls.append([kbid, resource_id])
File without changes