nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,7 +18,10 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import urllib.parse
21
+ from typing import Sequence
21
22
 
23
+ from nucliadb.ingest.processing import PushPayload
24
+ from nucliadb_models.text import PushTextFormat, Text
22
25
  from nucliadb_protos.resources_pb2 import (
23
26
  ExtractedTextWrapper,
24
27
  FieldComputedMetadataWrapper,
@@ -28,9 +31,6 @@ from nucliadb_protos.resources_pb2 import (
28
31
  )
29
32
  from nucliadb_protos.writer_pb2 import BrokerMessage
30
33
 
31
- from nucliadb.ingest.processing import PushPayload
32
- from nucliadb_models.text import PushTextFormat, Text
33
-
34
34
 
35
35
  def set_title(writer: BrokerMessage, toprocess: PushPayload, title: str):
36
36
  title = urllib.parse.unquote(title)
@@ -56,7 +56,7 @@ def compute_paragraph_key(rid: str, paragraph_key: str) -> str:
56
56
  return paragraph_key.replace("N_RID", rid)
57
57
 
58
58
 
59
- def choose_matryoshka_dimension(dimensions: list[int]) -> int:
59
+ def choose_matryoshka_dimension(dimensions: Sequence[int]) -> int:
60
60
  """Given a list of matryoshka embedding available dimensions, choose one to
61
61
  set the vector dimension.
62
62
  """
@@ -38,9 +38,7 @@ def assign_partitions(settings: Settings):
38
38
  try:
39
39
  settings.replica_number = int(sts_values[-1])
40
40
  except Exception:
41
- logger.error(
42
- f"Could not extract replica number from hostname: {hostname}"
43
- )
41
+ logger.error(f"Could not extract replica number from hostname: {hostname}")
44
42
  pass
45
43
 
46
44
  if settings.replica_number == -1:
@@ -53,9 +51,5 @@ def assign_partitions(settings: Settings):
53
51
  # update settings AND Environment Varialbe (for this process and its childs) with partition list
54
52
  settings.partitions = partitions_list
55
53
  os.environ["PARTITIONS"] = json.dumps(partitions_list)
56
- logger.info(
57
- f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}"
58
- )
59
- logger.info(
60
- f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}"
61
- )
54
+ logger.info(f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}")
55
+ logger.info(f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}")
@@ -29,17 +29,22 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar
29
29
  import aiohttp
30
30
  import backoff
31
31
  import jwt
32
- from nucliadb_protos.resources_pb2 import CloudFile
33
- from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
34
32
  from pydantic import BaseModel, Field
35
33
 
36
34
  import nucliadb_models as models
37
35
  from nucliadb_models.resource import QueueType
36
+ from nucliadb_protos.resources_pb2 import CloudFile
37
+ from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
38
38
  from nucliadb_telemetry import metrics
39
39
  from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
40
- from nucliadb_utils.settings import FileBackendConfig, nuclia_settings, storage_settings
40
+ from nucliadb_utils.settings import (
41
+ FileBackendConfig,
42
+ is_onprem_nucliadb,
43
+ nuclia_settings,
44
+ storage_settings,
45
+ )
41
46
  from nucliadb_utils.storages.storage import Storage
42
- from nucliadb_utils.utilities import Utility, set_utility
47
+ from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
43
48
 
44
49
  logger = logging.getLogger(__name__)
45
50
 
@@ -96,9 +101,6 @@ class PushPayload(BaseModel):
96
101
  # Diff on Text Field
97
102
  textfield: dict[str, models.Text] = {}
98
103
 
99
- # Diff on a Layout Field
100
- layoutfield: dict[str, models.LayoutDiff] = {}
101
-
102
104
  # New conversations to process
103
105
  conversationfield: dict[str, models.PushConversation] = {}
104
106
 
@@ -112,6 +114,10 @@ class PushPayload(BaseModel):
112
114
 
113
115
 
114
116
  async def start_processing_engine():
117
+ processing_engine = get_utility(Utility.PROCESSING)
118
+ if processing_engine is not None:
119
+ return
120
+
115
121
  if nuclia_settings.dummy_processing:
116
122
  processing_engine = DummyProcessingEngine()
117
123
  else:
@@ -129,22 +135,41 @@ async def start_processing_engine():
129
135
  set_utility(Utility.PROCESSING, processing_engine)
130
136
 
131
137
 
132
- def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> int:
138
+ async def stop_processing_engine():
139
+ utility = get_utility(Utility.PROCESSING)
140
+ if utility is not None:
141
+ await utility.finalize()
142
+ clean_utility(Utility.PROCESSING)
143
+
144
+
145
+ class ProcessingDriverType(Enum):
146
+ # XXX IMPORTANT XXX: Make sure the values are in sync with
147
+ # the ones defined in nuclia/learning/processing repository
148
+ GCS = 0
149
+ S3 = 1
150
+ LOCAL = 2
151
+
152
+
153
+ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> ProcessingDriverType:
133
154
  """
134
155
  Outputs a nuclia-internal backend driver identifier that is used by processing
135
156
  to store the blobs of processed metadata in the right bucket folder.
136
157
  """
137
- if file_backend_driver == FileBackendConfig.GCS:
138
- return 0
139
- elif file_backend_driver == FileBackendConfig.S3:
140
- return 1
141
- elif file_backend_driver in (FileBackendConfig.LOCAL, FileBackendConfig.PG):
142
- return 2
143
- else:
158
+ if is_onprem_nucliadb():
159
+ # On-prem installations are always regarded as local storage from the processing perspective,
160
+ # as Nuclia processing engine will not have direct access to the storage.
161
+ return ProcessingDriverType.LOCAL
162
+
163
+ try:
164
+ return {
165
+ FileBackendConfig.GCS: ProcessingDriverType.GCS,
166
+ FileBackendConfig.S3: ProcessingDriverType.S3,
167
+ }[file_backend_driver]
168
+ except KeyError:
144
169
  logger.error(
145
170
  f"Not a valid file backend driver to processing, fallback to local: {file_backend_driver}"
146
171
  )
147
- return 2
172
+ return ProcessingDriverType.LOCAL
148
173
 
149
174
 
150
175
  class ProcessingEngine:
@@ -162,37 +187,25 @@ class ProcessingEngine:
162
187
  self.nuclia_service_account = nuclia_service_account
163
188
  self.nuclia_zone = nuclia_zone
164
189
  if nuclia_public_url is not None:
165
- self.nuclia_public_url: Optional[str] = nuclia_public_url.format(
166
- zone=nuclia_zone
167
- )
190
+ self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
168
191
  else:
169
192
  self.nuclia_public_url = None
170
193
 
171
194
  self.onprem = onprem
172
195
  if self.onprem:
173
- self.nuclia_upload_url = (
174
- f"{self.nuclia_public_url}/api/v1/processing/upload"
175
- )
196
+ self.nuclia_upload_url = f"{self.nuclia_public_url}/api/v1/processing/upload"
176
197
  else:
177
- self.nuclia_upload_url = (
178
- f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
179
- )
180
- self.nuclia_internal_push = (
181
- f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
182
- )
198
+ self.nuclia_upload_url = f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
199
+ self.nuclia_internal_push = f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
183
200
  self.nuclia_internal_delete = (
184
201
  f"{nuclia_processing_cluster_url}/api/v1/internal/processing/requests"
185
202
  )
186
- self.nuclia_external_push_v2 = (
187
- f"{self.nuclia_public_url}/api/v1/processing/push"
188
- )
189
- self.nuclia_external_delete = (
190
- f"{self.nuclia_public_url}/api/v1/processing/requests"
191
- )
203
+ self.nuclia_external_push_v2 = f"{self.nuclia_public_url}/api/v1/processing/push"
204
+ self.nuclia_external_delete = f"{self.nuclia_public_url}/api/v1/processing/requests"
192
205
 
193
206
  self.nuclia_jwt_key = nuclia_jwt_key
194
207
  self.days_to_keep = days_to_keep
195
- self.driver = to_processing_driver_type(driver)
208
+ self.driver: ProcessingDriverType = to_processing_driver_type(driver)
196
209
  self._exit_stack = AsyncExitStack()
197
210
 
198
211
  async def initialize(self):
@@ -215,7 +228,7 @@ class ProcessingEngine:
215
228
  "iat": now,
216
229
  "md5": cf.md5,
217
230
  "source": 1, # To indicate that this files comes internally
218
- "driver": self.driver,
231
+ "driver": self.driver.value,
219
232
  "jti": uuid.uuid4().hex,
220
233
  "bucket_name": cf.bucket_name,
221
234
  "filename": cf.filename,
@@ -239,7 +252,7 @@ class ProcessingEngine:
239
252
  "iat": now,
240
253
  "md5": file.file.md5,
241
254
  "source": 1, # To indicate that this files comes internally
242
- "driver": self.driver,
255
+ "driver": self.driver.value,
243
256
  "jti": uuid.uuid4().hex,
244
257
  "bucket_name": file.file.bucket_name,
245
258
  "filename": file.file.filename,
@@ -314,9 +327,7 @@ class ProcessingEngine:
314
327
  max_tries=MAX_TRIES,
315
328
  )
316
329
  @processing_observer.wrap({"type": "file_field_upload_internal"})
317
- async def convert_internal_filefield_to_str(
318
- self, file: FieldFilePB, storage: Storage
319
- ) -> str:
330
+ async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
320
331
  """It's already an internal file that needs to be uploaded"""
321
332
  if self.onprem is False:
322
333
  # Upload the file to processing upload
@@ -325,9 +336,7 @@ class ProcessingEngine:
325
336
  headers = {}
326
337
  headers["X-PASSWORD"] = file.password
327
338
  headers["X-LANGUAGE"] = file.language
328
- headers["X-FILENAME"] = base64.b64encode(
329
- file.file.filename.encode()
330
- ).decode()
339
+ headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()
331
340
  headers["X-MD5"] = file.file.md5
332
341
  headers["CONTENT-TYPE"] = file.file.content_type
333
342
  if file.file.size:
@@ -335,9 +344,7 @@ class ProcessingEngine:
335
344
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
336
345
 
337
346
  iterator = storage.downloadbytescf_iterator(file.file)
338
- async with self.session.post(
339
- self.nuclia_upload_url, data=iterator, headers=headers
340
- ) as resp:
347
+ async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
341
348
  if resp.status == 200:
342
349
  jwttoken = await resp.text()
343
350
  elif resp.status == 402:
@@ -371,9 +378,7 @@ class ProcessingEngine:
371
378
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
372
379
 
373
380
  iterator = storage.downloadbytescf_iterator(cf)
374
- async with self.session.post(
375
- self.nuclia_upload_url, data=iterator, headers=headers
376
- ) as resp:
381
+ async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
377
382
  if resp.status == 200:
378
383
  jwttoken = await resp.text()
379
384
  elif resp.status == 402:
@@ -393,9 +398,7 @@ class ProcessingEngine:
393
398
  jitter=backoff.random_jitter,
394
399
  max_tries=MAX_TRIES,
395
400
  )
396
- async def send_to_process(
397
- self, item: PushPayload, partition: int
398
- ) -> ProcessingInfo:
401
+ async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
399
402
  op_type = "process_external" if self.onprem else "process_internal"
400
403
  with processing_observer({"type": op_type}):
401
404
  headers = {"CONTENT-TYPE": "application/json"}
@@ -403,15 +406,13 @@ class ProcessingEngine:
403
406
  # Upload the payload
404
407
  item.partition = partition
405
408
  resp = await self.session.post(
406
- url=self.nuclia_internal_push, data=item.json(), headers=headers
409
+ url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
407
410
  )
408
411
  else:
409
- headers.update(
410
- {"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"}
411
- )
412
+ headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
412
413
  # Upload the payload
413
414
  resp = await self.session.post(
414
- url=self.nuclia_external_push_v2, data=item.json(), headers=headers
415
+ url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
415
416
  )
416
417
  if resp.status == 200:
417
418
  data = await resp.json()
@@ -441,9 +442,7 @@ class ProcessingEngine:
441
442
  queue=QueueType(queue_type) if queue_type is not None else None,
442
443
  )
443
444
 
444
- async def delete_from_processing(
445
- self, *, kbid: str, resource_id: Optional[str] = None
446
- ) -> None:
445
+ async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
447
446
  """
448
447
  Delete a resource from processing. This prevents inflight resources from being processed
449
448
  and wasting resources.
@@ -473,7 +472,7 @@ class ProcessingEngine:
473
472
 
474
473
  class DummyProcessingEngine(ProcessingEngine):
475
474
  def __init__(self):
476
- self.calls: list[list[Any]] = [] # type: ignore
475
+ self.calls: list[list[Any]] = []
477
476
  self.values = defaultdict(list)
478
477
  self.onprem = True
479
478
 
@@ -495,9 +494,7 @@ class DummyProcessingEngine(ProcessingEngine):
495
494
  self.values["convert_external_filefield_to_str"].append(file_field)
496
495
  return f"convert_external_filefield_to_str,{index}"
497
496
 
498
- async def convert_internal_filefield_to_str(
499
- self, file: FieldFilePB, storage: Storage
500
- ) -> str:
497
+ async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
501
498
  self.calls.append([file, storage])
502
499
  index = len(self.values["convert_internal_filefield_to_str"])
503
500
  self.values["convert_internal_filefield_to_str"].append([file, storage])
@@ -509,16 +506,10 @@ class DummyProcessingEngine(ProcessingEngine):
509
506
  self.values["convert_internal_cf_to_str"].append([cf, storage])
510
507
  return f"convert_internal_cf_to_str,{index}"
511
508
 
512
- async def send_to_process(
513
- self, item: PushPayload, partition: int
514
- ) -> ProcessingInfo:
509
+ async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
515
510
  self.calls.append([item, partition])
516
511
  self.values["send_to_process"].append([item, partition])
517
- return ProcessingInfo(
518
- seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED
519
- )
512
+ return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
520
513
 
521
- async def delete_from_processing(
522
- self, *, kbid: str, resource_id: Optional[str] = None
523
- ) -> None:
514
+ async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
524
515
  self.calls.append([kbid, resource_id])
File without changes