nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -18,7 +18,10 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import urllib.parse
21
+ from typing import Sequence
21
22
 
23
+ from nucliadb.ingest.processing import PushPayload
24
+ from nucliadb_models.text import PushTextFormat, Text
22
25
  from nucliadb_protos.resources_pb2 import (
23
26
  ExtractedTextWrapper,
24
27
  FieldComputedMetadataWrapper,
@@ -28,9 +31,6 @@ from nucliadb_protos.resources_pb2 import (
28
31
  )
29
32
  from nucliadb_protos.writer_pb2 import BrokerMessage
30
33
 
31
- from nucliadb.ingest.processing import PushPayload
32
- from nucliadb_models.text import PushTextFormat, Text
33
-
34
34
 
35
35
  def set_title(writer: BrokerMessage, toprocess: PushPayload, title: str):
36
36
  title = urllib.parse.unquote(title)
@@ -56,7 +56,7 @@ def compute_paragraph_key(rid: str, paragraph_key: str) -> str:
56
56
  return paragraph_key.replace("N_RID", rid)
57
57
 
58
58
 
59
- def choose_matryoshka_dimension(dimensions: list[int]) -> int:
59
+ def choose_matryoshka_dimension(dimensions: Sequence[int]) -> int:
60
60
  """Given a list of matryoshka embedding available dimensions, choose one to
61
61
  set the vector dimension.
62
62
  """
@@ -38,9 +38,7 @@ def assign_partitions(settings: Settings):
38
38
  try:
39
39
  settings.replica_number = int(sts_values[-1])
40
40
  except Exception:
41
- logger.error(
42
- f"Could not extract replica number from hostname: {hostname}"
43
- )
41
+ logger.error(f"Could not extract replica number from hostname: {hostname}")
44
42
  pass
45
43
 
46
44
  if settings.replica_number == -1:
@@ -53,9 +51,5 @@ def assign_partitions(settings: Settings):
53
51
  # update settings AND Environment Varialbe (for this process and its childs) with partition list
54
52
  settings.partitions = partitions_list
55
53
  os.environ["PARTITIONS"] = json.dumps(partitions_list)
56
- logger.info(
57
- f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}"
58
- )
59
- logger.info(
60
- f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}"
61
- )
54
+ logger.info(f"PARTITIONS: Assigned Partitions (in settings) = {settings.partitions}")
55
+ logger.info(f"PARTITIONS: Assigned Partitions (in environment) = {os.environ['PARTITIONS']}")
@@ -29,17 +29,22 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar
29
29
  import aiohttp
30
30
  import backoff
31
31
  import jwt
32
- from nucliadb_protos.resources_pb2 import CloudFile
33
- from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
34
32
  from pydantic import BaseModel, Field
35
33
 
36
34
  import nucliadb_models as models
37
35
  from nucliadb_models.resource import QueueType
36
+ from nucliadb_protos.resources_pb2 import CloudFile
37
+ from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
38
38
  from nucliadb_telemetry import metrics
39
39
  from nucliadb_utils.exceptions import LimitsExceededError, SendToProcessError
40
- from nucliadb_utils.settings import FileBackendConfig, nuclia_settings, storage_settings
40
+ from nucliadb_utils.settings import (
41
+ FileBackendConfig,
42
+ is_onprem_nucliadb,
43
+ nuclia_settings,
44
+ storage_settings,
45
+ )
41
46
  from nucliadb_utils.storages.storage import Storage
42
- from nucliadb_utils.utilities import Utility, set_utility
47
+ from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
43
48
 
44
49
  logger = logging.getLogger(__name__)
45
50
 
@@ -96,9 +101,6 @@ class PushPayload(BaseModel):
96
101
  # Diff on Text Field
97
102
  textfield: dict[str, models.Text] = {}
98
103
 
99
- # Diff on a Layout Field
100
- layoutfield: dict[str, models.LayoutDiff] = {}
101
-
102
104
  # New conversations to process
103
105
  conversationfield: dict[str, models.PushConversation] = {}
104
106
 
@@ -112,6 +114,10 @@ class PushPayload(BaseModel):
112
114
 
113
115
 
114
116
  async def start_processing_engine():
117
+ processing_engine = get_utility(Utility.PROCESSING)
118
+ if processing_engine is not None:
119
+ return
120
+
115
121
  if nuclia_settings.dummy_processing:
116
122
  processing_engine = DummyProcessingEngine()
117
123
  else:
@@ -129,22 +135,41 @@ async def start_processing_engine():
129
135
  set_utility(Utility.PROCESSING, processing_engine)
130
136
 
131
137
 
132
- def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> int:
138
+ async def stop_processing_engine():
139
+ utility = get_utility(Utility.PROCESSING)
140
+ if utility is not None:
141
+ await utility.finalize()
142
+ clean_utility(Utility.PROCESSING)
143
+
144
+
145
+ class ProcessingDriverType(Enum):
146
+ # XXX IMPORTANT XXX: Make sure the values are in sync with
147
+ # the ones defined in nuclia/learning/processing repository
148
+ GCS = 0
149
+ S3 = 1
150
+ LOCAL = 2
151
+
152
+
153
+ def to_processing_driver_type(file_backend_driver: FileBackendConfig) -> ProcessingDriverType:
133
154
  """
134
155
  Outputs a nuclia-internal backend driver identifier that is used by processing
135
156
  to store the blobs of processed metadata in the right bucket folder.
136
157
  """
137
- if file_backend_driver == FileBackendConfig.GCS:
138
- return 0
139
- elif file_backend_driver == FileBackendConfig.S3:
140
- return 1
141
- elif file_backend_driver in (FileBackendConfig.LOCAL, FileBackendConfig.PG):
142
- return 2
143
- else:
158
+ if is_onprem_nucliadb():
159
+ # On-prem installations are always regarded as local storage from the processing perspective,
160
+ # as Nuclia processing engine will not have direct access to the storage.
161
+ return ProcessingDriverType.LOCAL
162
+
163
+ try:
164
+ return {
165
+ FileBackendConfig.GCS: ProcessingDriverType.GCS,
166
+ FileBackendConfig.S3: ProcessingDriverType.S3,
167
+ }[file_backend_driver]
168
+ except KeyError:
144
169
  logger.error(
145
170
  f"Not a valid file backend driver to processing, fallback to local: {file_backend_driver}"
146
171
  )
147
- return 2
172
+ return ProcessingDriverType.LOCAL
148
173
 
149
174
 
150
175
  class ProcessingEngine:
@@ -162,37 +187,25 @@ class ProcessingEngine:
162
187
  self.nuclia_service_account = nuclia_service_account
163
188
  self.nuclia_zone = nuclia_zone
164
189
  if nuclia_public_url is not None:
165
- self.nuclia_public_url: Optional[str] = nuclia_public_url.format(
166
- zone=nuclia_zone
167
- )
190
+ self.nuclia_public_url: Optional[str] = nuclia_public_url.format(zone=nuclia_zone)
168
191
  else:
169
192
  self.nuclia_public_url = None
170
193
 
171
194
  self.onprem = onprem
172
195
  if self.onprem:
173
- self.nuclia_upload_url = (
174
- f"{self.nuclia_public_url}/api/v1/processing/upload"
175
- )
196
+ self.nuclia_upload_url = f"{self.nuclia_public_url}/api/v1/processing/upload"
176
197
  else:
177
- self.nuclia_upload_url = (
178
- f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
179
- )
180
- self.nuclia_internal_push = (
181
- f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
182
- )
198
+ self.nuclia_upload_url = f"{nuclia_processing_cluster_url}/api/v1/processing/upload"
199
+ self.nuclia_internal_push = f"{nuclia_processing_cluster_url}/api/v1/internal/processing/push"
183
200
  self.nuclia_internal_delete = (
184
201
  f"{nuclia_processing_cluster_url}/api/v1/internal/processing/requests"
185
202
  )
186
- self.nuclia_external_push_v2 = (
187
- f"{self.nuclia_public_url}/api/v1/processing/push"
188
- )
189
- self.nuclia_external_delete = (
190
- f"{self.nuclia_public_url}/api/v1/processing/requests"
191
- )
203
+ self.nuclia_external_push_v2 = f"{self.nuclia_public_url}/api/v1/processing/push"
204
+ self.nuclia_external_delete = f"{self.nuclia_public_url}/api/v1/processing/requests"
192
205
 
193
206
  self.nuclia_jwt_key = nuclia_jwt_key
194
207
  self.days_to_keep = days_to_keep
195
- self.driver = to_processing_driver_type(driver)
208
+ self.driver: ProcessingDriverType = to_processing_driver_type(driver)
196
209
  self._exit_stack = AsyncExitStack()
197
210
 
198
211
  async def initialize(self):
@@ -215,7 +228,7 @@ class ProcessingEngine:
215
228
  "iat": now,
216
229
  "md5": cf.md5,
217
230
  "source": 1, # To indicate that this files comes internally
218
- "driver": self.driver,
231
+ "driver": self.driver.value,
219
232
  "jti": uuid.uuid4().hex,
220
233
  "bucket_name": cf.bucket_name,
221
234
  "filename": cf.filename,
@@ -239,7 +252,7 @@ class ProcessingEngine:
239
252
  "iat": now,
240
253
  "md5": file.file.md5,
241
254
  "source": 1, # To indicate that this files comes internally
242
- "driver": self.driver,
255
+ "driver": self.driver.value,
243
256
  "jti": uuid.uuid4().hex,
244
257
  "bucket_name": file.file.bucket_name,
245
258
  "filename": file.file.filename,
@@ -248,6 +261,7 @@ class ProcessingEngine:
248
261
  "content_type": file.file.content_type,
249
262
  "password": file.password,
250
263
  "language": file.language,
264
+ "extract_strategy": file.extract_strategy,
251
265
  }
252
266
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
253
267
 
@@ -265,6 +279,8 @@ class ProcessingEngine:
265
279
  headers["X-LANGUAGE"] = file.language
266
280
  headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode() # type: ignore
267
281
  headers["X-MD5"] = file.file.md5
282
+ if file.extract_strategy is not None:
283
+ headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
268
284
  headers["CONTENT_TYPE"] = file.file.content_type
269
285
  headers["CONTENT-LENGTH"] = str(len(file.file.payload)) # type: ignore
270
286
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
@@ -304,6 +320,7 @@ class ProcessingEngine:
304
320
  "content_type": file_field.file.content_type,
305
321
  "language": file_field.language,
306
322
  "password": file_field.password,
323
+ "extract_strategy": file_field.extract_strategy,
307
324
  }
308
325
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
309
326
 
@@ -314,9 +331,7 @@ class ProcessingEngine:
314
331
  max_tries=MAX_TRIES,
315
332
  )
316
333
  @processing_observer.wrap({"type": "file_field_upload_internal"})
317
- async def convert_internal_filefield_to_str(
318
- self, file: FieldFilePB, storage: Storage
319
- ) -> str:
334
+ async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
320
335
  """It's already an internal file that needs to be uploaded"""
321
336
  if self.onprem is False:
322
337
  # Upload the file to processing upload
@@ -325,19 +340,17 @@ class ProcessingEngine:
325
340
  headers = {}
326
341
  headers["X-PASSWORD"] = file.password
327
342
  headers["X-LANGUAGE"] = file.language
328
- headers["X-FILENAME"] = base64.b64encode(
329
- file.file.filename.encode()
330
- ).decode()
343
+ headers["X-FILENAME"] = base64.b64encode(file.file.filename.encode()).decode()
331
344
  headers["X-MD5"] = file.file.md5
332
345
  headers["CONTENT-TYPE"] = file.file.content_type
333
346
  if file.file.size:
334
347
  headers["CONTENT-LENGTH"] = str(file.file.size)
348
+ if file.extract_strategy != "":
349
+ headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
335
350
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
336
351
 
337
352
  iterator = storage.downloadbytescf_iterator(file.file)
338
- async with self.session.post(
339
- self.nuclia_upload_url, data=iterator, headers=headers
340
- ) as resp:
353
+ async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
341
354
  if resp.status == 200:
342
355
  jwttoken = await resp.text()
343
356
  elif resp.status == 402:
@@ -371,9 +384,7 @@ class ProcessingEngine:
371
384
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
372
385
 
373
386
  iterator = storage.downloadbytescf_iterator(cf)
374
- async with self.session.post(
375
- self.nuclia_upload_url, data=iterator, headers=headers
376
- ) as resp:
387
+ async with self.session.post(self.nuclia_upload_url, data=iterator, headers=headers) as resp:
377
388
  if resp.status == 200:
378
389
  jwttoken = await resp.text()
379
390
  elif resp.status == 402:
@@ -393,9 +404,7 @@ class ProcessingEngine:
393
404
  jitter=backoff.random_jitter,
394
405
  max_tries=MAX_TRIES,
395
406
  )
396
- async def send_to_process(
397
- self, item: PushPayload, partition: int
398
- ) -> ProcessingInfo:
407
+ async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
399
408
  op_type = "process_external" if self.onprem else "process_internal"
400
409
  with processing_observer({"type": op_type}):
401
410
  headers = {"CONTENT-TYPE": "application/json"}
@@ -403,15 +412,13 @@ class ProcessingEngine:
403
412
  # Upload the payload
404
413
  item.partition = partition
405
414
  resp = await self.session.post(
406
- url=self.nuclia_internal_push, data=item.json(), headers=headers
415
+ url=self.nuclia_internal_push, data=item.model_dump_json(), headers=headers
407
416
  )
408
417
  else:
409
- headers.update(
410
- {"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"}
411
- )
418
+ headers.update({"X-STF-NUAKEY": f"Bearer {self.nuclia_service_account}"})
412
419
  # Upload the payload
413
420
  resp = await self.session.post(
414
- url=self.nuclia_external_push_v2, data=item.json(), headers=headers
421
+ url=self.nuclia_external_push_v2, data=item.model_dump_json(), headers=headers
415
422
  )
416
423
  if resp.status == 200:
417
424
  data = await resp.json()
@@ -441,9 +448,7 @@ class ProcessingEngine:
441
448
  queue=QueueType(queue_type) if queue_type is not None else None,
442
449
  )
443
450
 
444
- async def delete_from_processing(
445
- self, *, kbid: str, resource_id: Optional[str] = None
446
- ) -> None:
451
+ async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
447
452
  """
448
453
  Delete a resource from processing. This prevents inflight resources from being processed
449
454
  and wasting resources.
@@ -473,7 +478,7 @@ class ProcessingEngine:
473
478
 
474
479
  class DummyProcessingEngine(ProcessingEngine):
475
480
  def __init__(self):
476
- self.calls: list[list[Any]] = [] # type: ignore
481
+ self.calls: list[list[Any]] = []
477
482
  self.values = defaultdict(list)
478
483
  self.onprem = True
479
484
 
@@ -495,9 +500,7 @@ class DummyProcessingEngine(ProcessingEngine):
495
500
  self.values["convert_external_filefield_to_str"].append(file_field)
496
501
  return f"convert_external_filefield_to_str,{index}"
497
502
 
498
- async def convert_internal_filefield_to_str(
499
- self, file: FieldFilePB, storage: Storage
500
- ) -> str:
503
+ async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
501
504
  self.calls.append([file, storage])
502
505
  index = len(self.values["convert_internal_filefield_to_str"])
503
506
  self.values["convert_internal_filefield_to_str"].append([file, storage])
@@ -509,16 +512,10 @@ class DummyProcessingEngine(ProcessingEngine):
509
512
  self.values["convert_internal_cf_to_str"].append([cf, storage])
510
513
  return f"convert_internal_cf_to_str,{index}"
511
514
 
512
- async def send_to_process(
513
- self, item: PushPayload, partition: int
514
- ) -> ProcessingInfo:
515
+ async def send_to_process(self, item: PushPayload, partition: int) -> ProcessingInfo:
515
516
  self.calls.append([item, partition])
516
517
  self.values["send_to_process"].append([item, partition])
517
- return ProcessingInfo(
518
- seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED
519
- )
518
+ return ProcessingInfo(seqid=len(self.calls), account_seq=0, queue=QueueType.SHARED)
520
519
 
521
- async def delete_from_processing(
522
- self, *, kbid: str, resource_id: Optional[str] = None
523
- ) -> None:
520
+ async def delete_from_processing(self, *, kbid: str, resource_id: Optional[str] = None) -> None:
524
521
  self.calls.append([kbid, resource_id])
File without changes