nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -45,9 +45,7 @@ from .shard_creator import ShardCreatorHandler
45
45
  def _handle_task_result(task: asyncio.Task) -> None:
46
46
  e = task.exception()
47
47
  if e:
48
- logger.exception(
49
- "Loop stopped by exception. This should not happen. Exiting.", exc_info=e
50
- )
48
+ logger.exception("Loop stopped by exception. This should not happen. Exiting.", exc_info=e)
51
49
  sys.exit(1)
52
50
 
53
51
 
@@ -87,9 +85,7 @@ async def start_ingest_consumers(
87
85
  if transaction_settings.transaction_local:
88
86
  raise ConfigurationError("Can not start ingest consumers in local mode")
89
87
 
90
- while len(
91
- manager.get_index_nodes()
92
- ) == 0 and running_settings.running_environment not in (
88
+ while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
93
89
  "local",
94
90
  "test",
95
91
  ):
@@ -101,9 +97,9 @@ async def start_ingest_consumers(
101
97
  storage = await get_storage(service_name=service_name or SERVICE_NAME)
102
98
  nats_connection_manager = get_nats_manager()
103
99
 
104
- max_concurrent_processing = asyncio.Semaphore(
105
- settings.max_concurrent_ingest_processing
106
- )
100
+ max_concurrent_processing = asyncio.Semaphore(settings.max_concurrent_ingest_processing)
101
+
102
+ consumer_finalizers = []
107
103
 
108
104
  for partition in settings.partitions:
109
105
  consumer = IngestConsumer(
@@ -115,8 +111,15 @@ async def start_ingest_consumers(
115
111
  lock=max_concurrent_processing,
116
112
  )
117
113
  await consumer.initialize()
114
+ consumer_finalizers.append(consumer.finalize)
118
115
 
119
- return nats_connection_manager.finalize
116
+ async def _finalize():
117
+ # Finalize all the consumers and the nats connection manager
118
+ for consumer_finalize in consumer_finalizers:
119
+ await consumer_finalize()
120
+ await nats_connection_manager.finalize()
121
+
122
+ return _finalize
120
123
 
121
124
 
122
125
  async def start_ingest_processed_consumer(
@@ -132,9 +135,7 @@ async def start_ingest_processed_consumer(
132
135
  if transaction_settings.transaction_local:
133
136
  raise ConfigurationError("Can not start ingest consumers in local mode")
134
137
 
135
- while len(
136
- manager.get_index_nodes()
137
- ) == 0 and running_settings.running_environment not in (
138
+ while len(manager.get_index_nodes()) == 0 and running_settings.running_environment not in (
138
139
  "local",
139
140
  "test",
140
141
  ):
@@ -161,19 +162,20 @@ async def start_ingest_processed_consumer(
161
162
  async def start_auditor() -> Callable[[], Awaitable[None]]:
162
163
  audit = get_audit()
163
164
  assert audit is not None
165
+
164
166
  pubsub = await get_pubsub()
165
167
  assert pubsub is not None, "Pubsub is not configured"
166
168
  storage = await get_storage(service_name=SERVICE_NAME)
167
169
  index_auditor = IndexAuditHandler(audit=audit, pubsub=pubsub)
168
- resource_writes_auditor = ResourceWritesAuditHandler(
169
- storage=storage, audit=audit, pubsub=pubsub
170
- )
170
+ resource_writes_auditor = ResourceWritesAuditHandler(storage=storage, audit=audit, pubsub=pubsub)
171
171
 
172
172
  await index_auditor.initialize()
173
173
  await resource_writes_auditor.initialize()
174
174
 
175
175
  return partial(
176
- asyncio.gather, index_auditor.finalize(), resource_writes_auditor.finalize() # type: ignore
176
+ asyncio.gather,
177
+ index_auditor.finalize(),
178
+ resource_writes_auditor.finalize(), # type: ignore
177
179
  )
178
180
 
179
181
 
@@ -82,9 +82,7 @@ class ShardCreatorHandler:
82
82
  metrics.total_messages.inc({"type": "shard_creator", "action": "ignored"})
83
83
  return
84
84
 
85
- self.task_handler.schedule(
86
- notification.kbid, partial(self.process_kb, notification.kbid)
87
- )
85
+ self.task_handler.schedule(notification.kbid, partial(self.process_kb, notification.kbid))
88
86
  metrics.total_messages.inc({"type": "shard_creator", "action": "scheduled"})
89
87
 
90
88
  @metrics.handler_histo.wrap({"type": "shard_creator"})
@@ -105,7 +103,7 @@ class ShardCreatorHandler:
105
103
  async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
106
104
  # remember, a lock will do at least 1+ reads and 1 write.
107
105
  # with heavy writes, this adds some simple k/v pressure
108
- node, shard_id = choose_node(current_shard)
106
+ node, shard_id = choose_node(current_shard, use_nidx=True)
109
107
  shard: nodereader_pb2.Shard = await node.reader.GetShard(
110
108
  nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id)) # type: ignore
111
109
  )
@@ -48,9 +48,7 @@ class DelayedTaskHandler:
48
48
  for task in list(self.outstanding_tasks.values()):
49
49
  await task
50
50
 
51
- def schedule(
52
- self, key: str, handler: Callable[[], Coroutine[None, None, None]]
53
- ) -> None:
51
+ def schedule(self, key: str, handler: Callable[[], Coroutine[None, None, None]]) -> None:
54
52
  if key in self.to_process:
55
53
  # already waiting to process this key, ignore
56
54
  return
@@ -21,14 +21,20 @@ from __future__ import annotations
21
21
 
22
22
  import enum
23
23
  from datetime import datetime
24
- from typing import Any, Optional, Type
24
+ from typing import Any, Generic, Optional, Type, TypeVar
25
25
 
26
+ from google.protobuf.message import DecodeError, Message
27
+
28
+ from nucliadb.common import datamanagers
29
+ from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
26
30
  from nucliadb_protos.resources_pb2 import (
27
31
  CloudFile,
28
32
  ExtractedTextWrapper,
29
33
  ExtractedVectorsWrapper,
34
+ FieldAuthor,
30
35
  FieldComputedMetadata,
31
36
  FieldComputedMetadataWrapper,
37
+ FieldQuestionAnswers,
32
38
  FieldQuestionAnswerWrapper,
33
39
  LargeComputedMetadata,
34
40
  LargeComputedMetadataWrapper,
@@ -36,34 +42,33 @@ from nucliadb_protos.resources_pb2 import (
36
42
  )
37
43
  from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
38
44
  from nucliadb_protos.writer_pb2 import Error
39
-
40
- from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
41
45
  from nucliadb_utils.storages.storage import Storage, StorageField
42
46
 
43
- KB_RESOURCE_FIELD = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
44
- KB_RESOURCE_ERROR = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/error"
45
-
46
- SUBFIELDFIELDS = ["l", "c"]
47
+ SUBFIELDFIELDS = ("c",)
47
48
 
48
49
 
49
50
  class FieldTypes(str, enum.Enum):
50
51
  FIELD_TEXT = "extracted_text"
51
52
  FIELD_VECTORS = "extracted_vectors"
53
+ FIELD_VECTORSET = "{vectorset}/extracted_vectors"
52
54
  FIELD_METADATA = "metadata"
53
55
  FIELD_LARGE_METADATA = "large_metadata"
54
56
  THUMBNAIL = "thumbnail"
55
57
  QUESTION_ANSWERS = "question_answers"
56
58
 
57
59
 
58
- class Field:
59
- pbklass: Optional[Type] = None
60
+ PbType = TypeVar("PbType", bound=Message)
61
+
62
+
63
+ class Field(Generic[PbType]):
64
+ pbklass: Type[PbType]
60
65
  type: str = "x"
61
66
  value: Optional[Any]
62
67
  extracted_text: Optional[ExtractedText]
63
- extracted_vectors: Optional[VectorObject]
68
+ extracted_vectors: dict[Optional[str], VectorObject]
64
69
  computed_metadata: Optional[FieldComputedMetadata]
65
70
  large_computed_metadata: Optional[LargeComputedMetadata]
66
- question_answers: Optional[QuestionAnswers]
71
+ question_answers: Optional[FieldQuestionAnswers]
67
72
 
68
73
  def __init__(
69
74
  self,
@@ -77,7 +82,7 @@ class Field:
77
82
 
78
83
  self.value = None
79
84
  self.extracted_text: Optional[ExtractedText] = None
80
- self.extracted_vectors = None
85
+ self.extracted_vectors = {}
81
86
  self.computed_metadata = None
82
87
  self.large_computed_metadata = None
83
88
  self.question_answers = None
@@ -112,44 +117,51 @@ class Field:
112
117
  return f"{self.uuid}/{self.type}/{self.id}"
113
118
 
114
119
  def get_storage_field(self, field_type: FieldTypes) -> StorageField:
115
- return self.storage.file_extracted(
116
- self.kbid, self.uuid, self.type, self.id, field_type.value
117
- )
120
+ return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)
121
+
122
+ def _get_extracted_vectors_storage_field(self, vectorset: Optional[str] = None) -> StorageField:
123
+ if vectorset:
124
+ key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
125
+ else:
126
+ key = FieldTypes.FIELD_VECTORS.value
127
+ return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)
118
128
 
119
- async def db_get_value(self):
129
+ async def db_get_value(self) -> Optional[PbType]:
120
130
  if self.value is None:
121
- payload = await self.resource.txn.get(
122
- KB_RESOURCE_FIELD.format(
123
- kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
124
- )
131
+ payload = await datamanagers.fields.get_raw(
132
+ self.resource.txn,
133
+ kbid=self.kbid,
134
+ rid=self.uuid,
135
+ field_type=self.type,
136
+ field_id=self.id,
125
137
  )
126
138
  if payload is None:
127
- return
139
+ return None
128
140
 
129
141
  self.value = self.pbklass()
130
142
  self.value.ParseFromString(payload)
131
143
  return self.value
132
144
 
133
145
  async def db_set_value(self, payload: Any):
134
- await self.resource.txn.set(
135
- KB_RESOURCE_FIELD.format(
136
- kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
137
- ),
138
- payload.SerializeToString(),
146
+ await datamanagers.fields.set(
147
+ self.resource.txn,
148
+ kbid=self.kbid,
149
+ rid=self.uuid,
150
+ field_type=self.type,
151
+ field_id=self.id,
152
+ value=payload,
139
153
  )
140
154
  self.value = payload
141
155
  self.resource.modified = True
142
156
 
143
157
  async def delete(self):
144
- field_base_key = KB_RESOURCE_FIELD.format(
145
- kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
158
+ await datamanagers.fields.delete(
159
+ self.resource.txn,
160
+ kbid=self.kbid,
161
+ rid=self.uuid,
162
+ field_type=self.type,
163
+ field_id=self.id,
146
164
  )
147
- # Make sure we explicitly delete the field and any nested key
148
- keys_to_delete = []
149
- async for key in self.resource.txn.keys(field_base_key):
150
- keys_to_delete.append(key)
151
- for key in keys_to_delete:
152
- await self.resource.txn.delete(key)
153
165
  await self.delete_extracted_text()
154
166
  await self.delete_vectors()
155
167
  await self.delete_metadata()
@@ -169,9 +181,9 @@ class Field:
169
181
  except KeyError:
170
182
  pass
171
183
 
172
- async def delete_vectors(self) -> None:
184
+ async def delete_vectors(self, vectorset: Optional[str] = None) -> None:
173
185
  # Try delete vectors
174
- sf = self.get_storage_field(FieldTypes.FIELD_VECTORS)
186
+ sf = self._get_extracted_vectors_storage_field(vectorset)
175
187
  try:
176
188
  await self.storage.delete_upload(sf.key, sf.bucket)
177
189
  except KeyError:
@@ -185,53 +197,79 @@ class Field:
185
197
  pass
186
198
 
187
199
  async def get_error(self) -> Optional[Error]:
188
- payload = await self.resource.txn.get(
189
- KB_RESOURCE_ERROR.format(
190
- kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
191
- )
200
+ return await datamanagers.fields.get_error(
201
+ self.resource.txn,
202
+ kbid=self.kbid,
203
+ rid=self.uuid,
204
+ field_type=self.type,
205
+ field_id=self.id,
192
206
  )
193
- if payload is None:
194
- return None
195
- pberror = Error()
196
- pberror.ParseFromString(payload)
197
- return pberror
198
207
 
199
208
  async def set_error(self, error: Error) -> None:
200
- await self.resource.txn.set(
201
- KB_RESOURCE_ERROR.format(
202
- kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
203
- ),
204
- error.SerializeToString(),
209
+ await datamanagers.fields.set_error(
210
+ self.resource.txn,
211
+ kbid=self.kbid,
212
+ rid=self.uuid,
213
+ field_type=self.type,
214
+ field_id=self.id,
215
+ error=error,
205
216
  )
206
217
 
207
- async def get_question_answers(self) -> Optional[QuestionAnswers]:
208
- if self.question_answers is None:
218
+ async def get_question_answers(self, force=False) -> Optional[FieldQuestionAnswers]:
219
+ if self.question_answers is None or force:
209
220
  sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
210
- payload = await self.storage.download_pb(sf, QuestionAnswers)
221
+ try:
222
+ payload = await self.storage.download_pb(sf, FieldQuestionAnswers)
223
+ except DecodeError:
224
+ deprecated_payload = await self.storage.download_pb(sf, QuestionAnswers)
225
+ if deprecated_payload is not None:
226
+ payload = FieldQuestionAnswers()
227
+ payload.question_answers.CopyFrom(deprecated_payload)
211
228
  if payload is not None:
212
229
  self.question_answers = payload
213
230
  return self.question_answers
214
231
 
215
232
  async def set_question_answers(self, payload: FieldQuestionAnswerWrapper) -> None:
233
+ if self.type in SUBFIELDFIELDS:
234
+ try:
235
+ actual_payload: Optional[FieldQuestionAnswers] = await self.get_question_answers(
236
+ force=True
237
+ )
238
+ except KeyError:
239
+ actual_payload = None
240
+ else:
241
+ actual_payload = None
216
242
  sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
217
243
 
218
- if payload.HasField("file"):
219
- raw_payload = await self.storage.downloadbytescf(payload.file)
220
- pb = QuestionAnswers()
221
- pb.ParseFromString(raw_payload.read())
222
- raw_payload.flush()
223
- self.question_answers = pb
244
+ if actual_payload is None:
245
+ # Its first question answer
246
+ if payload.HasField("file"):
247
+ await self.storage.normalize_binary(payload.file, sf)
248
+ else:
249
+ await self.storage.upload_pb(sf, payload.question_answers)
250
+ self.question_answers = payload.question_answers
224
251
  else:
225
- self.question_answers = payload.question_answers
226
-
227
- await self.storage.upload_pb(sf, self.question_answers)
252
+ if payload.HasField("file"):
253
+ raw_payload = await self.storage.downloadbytescf(payload.file)
254
+ pb = FieldQuestionAnswers()
255
+ pb.ParseFromString(raw_payload.read())
256
+ raw_payload.flush()
257
+ payload.question_answers.CopyFrom(pb)
258
+ # We know its payload.question_answers
259
+ for key, value in payload.question_answers.split_question_answers.items():
260
+ actual_payload.split_question_answers[key] = value
261
+ for key in payload.question_answers.deleted_splits:
262
+ if key in actual_payload.split_question_answers:
263
+ del actual_payload.split_question_answers[key]
264
+ if payload.question_answers.HasField("question_answers") != "":
265
+ actual_payload.question_answers.CopyFrom(payload.question_answers.question_answers)
266
+ await self.storage.upload_pb(sf, actual_payload)
267
+ self.question_answers = actual_payload
228
268
 
229
269
  async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
230
270
  if self.type in SUBFIELDFIELDS:
231
271
  try:
232
- actual_payload: Optional[ExtractedText] = await self.get_extracted_text(
233
- force=True
234
- )
272
+ actual_payload: Optional[ExtractedText] = await self.get_extracted_text(force=True)
235
273
  except KeyError:
236
274
  actual_payload = None
237
275
  else:
@@ -271,23 +309,21 @@ class Field:
271
309
  self.extracted_text = payload
272
310
  return self.extracted_text
273
311
 
274
- async def set_vectors(
275
- self, payload: ExtractedVectorsWrapper
276
- ) -> tuple[Optional[VectorObject], bool, list[str]]:
312
+ async def set_vectors(self, payload: ExtractedVectorsWrapper) -> Optional[VectorObject]:
313
+ vectorset = payload.vectorset_id or None
277
314
  if self.type in SUBFIELDFIELDS:
278
315
  try:
279
316
  actual_payload: Optional[VectorObject] = await self.get_vectors(
280
- force=True
317
+ vectorset=vectorset,
318
+ force=True,
281
319
  )
282
320
  except KeyError:
283
321
  actual_payload = None
284
322
  else:
285
323
  actual_payload = None
286
324
 
287
- sf = self.get_storage_field(FieldTypes.FIELD_VECTORS)
325
+ sf = self._get_extracted_vectors_storage_field(vectorset)
288
326
  vo: Optional[VectorObject] = None
289
- replace_field: bool = True
290
- replace_splits = []
291
327
  if actual_payload is None:
292
328
  # Its first extracted text
293
329
  if payload.HasField("file"):
@@ -296,7 +332,7 @@ class Field:
296
332
  else:
297
333
  await self.storage.upload_pb(sf, payload.vectors)
298
334
  vo = payload.vectors
299
- self.extracted_vectors = payload.vectors
335
+ self.extracted_vectors[vectorset] = payload.vectors
300
336
  else:
301
337
  if payload.HasField("file"):
302
338
  raw_payload = await self.storage.downloadbytescf(payload.file)
@@ -304,36 +340,38 @@ class Field:
304
340
  pb.ParseFromString(raw_payload.read())
305
341
  raw_payload.flush()
306
342
  payload.vectors.CopyFrom(pb)
307
- vo = payload.vectors
343
+ vo = actual_payload
308
344
  # We know its payload.body
309
345
  for key, value in payload.vectors.split_vectors.items():
310
346
  actual_payload.split_vectors[key].CopyFrom(value)
311
347
  for key in payload.vectors.deleted_splits:
312
348
  if key in actual_payload.split_vectors:
313
- replace_splits.append(key)
314
349
  del actual_payload.split_vectors[key]
315
350
  if len(payload.vectors.vectors.vectors) > 0:
316
- replace_field = True
317
351
  actual_payload.vectors.CopyFrom(payload.vectors.vectors)
318
352
  await self.storage.upload_pb(sf, actual_payload)
319
- self.extracted_vectors = actual_payload
320
- return vo, replace_field, replace_splits
321
-
322
- async def get_vectors(self, force=False) -> Optional[VectorObject]:
323
- if self.extracted_vectors is None or force:
324
- sf = self.get_storage_field(FieldTypes.FIELD_VECTORS)
353
+ self.extracted_vectors[vectorset] = actual_payload
354
+ return vo
355
+
356
+ async def get_vectors(
357
+ self, vectorset: Optional[str] = None, force: bool = False
358
+ ) -> Optional[VectorObject]:
359
+ # compat with vectorsets coming from protobuffers where no value is
360
+ # empty string instead of None. This shouldn't be handled here but we
361
+ # have to make sure it gets the correct vectorset
362
+ vectorset = vectorset or None
363
+ if self.extracted_vectors.get(vectorset, None) is None or force:
364
+ sf = self._get_extracted_vectors_storage_field(vectorset)
325
365
  payload = await self.storage.download_pb(sf, VectorObject)
326
366
  if payload is not None:
327
- self.extracted_vectors = payload
328
- return self.extracted_vectors
367
+ self.extracted_vectors[vectorset] = payload
368
+ return self.extracted_vectors.get(vectorset, None)
329
369
 
330
- async def set_field_metadata(
331
- self, payload: FieldComputedMetadataWrapper
332
- ) -> tuple[FieldComputedMetadata, list[str], dict[str, list[str]]]:
370
+ async def set_field_metadata(self, payload: FieldComputedMetadataWrapper) -> FieldComputedMetadata:
333
371
  if self.type in SUBFIELDFIELDS:
334
372
  try:
335
- actual_payload: Optional[FieldComputedMetadata] = (
336
- await self.get_field_metadata(force=True)
373
+ actual_payload: Optional[FieldComputedMetadata] = await self.get_field_metadata(
374
+ force=True
337
375
  )
338
376
  except KeyError:
339
377
  actual_payload = None
@@ -359,8 +397,6 @@ class Field:
359
397
  metadata.thumbnail.CopyFrom(cf_split)
360
398
  metadata.last_index.FromDatetime(datetime.now())
361
399
 
362
- replace_field = []
363
- replace_splits = {}
364
400
  if actual_payload is None:
365
401
  # Its first metadata
366
402
  await self.storage.upload_pb(sf, payload.metadata)
@@ -371,22 +407,15 @@ class Field:
371
407
  actual_payload.split_metadata[key].CopyFrom(value)
372
408
  for key in payload.metadata.deleted_splits:
373
409
  if key in actual_payload.split_metadata:
374
- replace_splits[key] = [
375
- f"{x.start}-{x.end}"
376
- for x in actual_payload.split_metadata[key].paragraphs
377
- ]
378
410
  del actual_payload.split_metadata[key]
379
411
  if payload.metadata.metadata:
380
412
  actual_payload.metadata.CopyFrom(payload.metadata.metadata)
381
- replace_field = [f"{x.start}-{x.end}" for x in metadata.paragraphs]
382
413
  await self.storage.upload_pb(sf, actual_payload)
383
414
  self.computed_metadata = actual_payload
384
415
 
385
- return self.computed_metadata, replace_field, replace_splits
416
+ return self.computed_metadata
386
417
 
387
- async def get_field_metadata(
388
- self, force: bool = False
389
- ) -> Optional[FieldComputedMetadata]:
418
+ async def get_field_metadata(self, force: bool = False) -> Optional[FieldComputedMetadata]:
390
419
  if self.computed_metadata is None or force:
391
420
  sf = self.get_storage_field(FieldTypes.FIELD_METADATA)
392
421
  payload = await self.storage.download_pb(sf, FieldComputedMetadata)
@@ -397,8 +426,8 @@ class Field:
397
426
  async def set_large_field_metadata(self, payload: LargeComputedMetadataWrapper):
398
427
  if self.type in SUBFIELDFIELDS:
399
428
  try:
400
- actual_payload: Optional[LargeComputedMetadata] = (
401
- await self.get_large_field_metadata(force=True)
429
+ actual_payload: Optional[LargeComputedMetadata] = await self.get_large_field_metadata(
430
+ force=True
402
431
  )
403
432
  except KeyError:
404
433
  actual_payload = None
@@ -434,9 +463,7 @@ class Field:
434
463
 
435
464
  return self.large_computed_metadata
436
465
 
437
- async def get_large_field_metadata(
438
- self, force: bool = False
439
- ) -> Optional[LargeComputedMetadata]:
466
+ async def get_large_field_metadata(self, force: bool = False) -> Optional[LargeComputedMetadata]:
440
467
  if self.large_computed_metadata is None or force:
441
468
  sf = self.get_storage_field(FieldTypes.FIELD_LARGE_METADATA)
442
469
  payload = await self.storage.download_pb(
@@ -447,6 +474,11 @@ class Field:
447
474
  self.large_computed_metadata = payload
448
475
  return self.large_computed_metadata
449
476
 
477
+ async def generated_by(self) -> FieldAuthor:
478
+ author = FieldAuthor()
479
+ author.user.SetInParent()
480
+ return author
481
+
450
482
  def serialize(self):
451
483
  return self.value.SerializeToString()
452
484
 
@@ -20,11 +20,9 @@
20
20
  import uuid
21
21
  from typing import Any, Optional
22
22
 
23
- from nucliadb_protos.resources_pb2 import CloudFile
24
- from nucliadb_protos.resources_pb2 import Conversation as PBConversation
25
- from nucliadb_protos.resources_pb2 import FieldConversation
26
-
27
23
  from nucliadb.ingest.fields.base import Field
24
+ from nucliadb_protos.resources_pb2 import CloudFile, FieldConversation
25
+ from nucliadb_protos.resources_pb2 import Conversation as PBConversation
28
26
  from nucliadb_utils.storages.storage import StorageField
29
27
 
30
28
  PAGE_SIZE = 200
@@ -36,7 +34,7 @@ class PageNotFound(Exception):
36
34
  pass
37
35
 
38
36
 
39
- class Conversation(Field):
37
+ class Conversation(Field[PBConversation]):
40
38
  pbklass = PBConversation
41
39
  type: str = "c"
42
40
  value: dict[int, PBConversation]
@@ -120,6 +118,21 @@ class Conversation(Field):
120
118
  except PageNotFound:
121
119
  return None
122
120
 
121
+ async def get_full_conversation(self) -> Optional[PBConversation]:
122
+ """
123
+ Messages of a conversations may be stored across several pages.
124
+ This method fetches them all and returns a single complete conversation.
125
+ """
126
+ full_conv = PBConversation()
127
+ n_page = 1
128
+ while True:
129
+ page = await self.get_value(page=n_page)
130
+ if page is None:
131
+ break
132
+ full_conv.messages.extend(page.messages)
133
+ n_page += 1
134
+ return full_conv
135
+
123
136
  async def get_metadata(self) -> FieldConversation:
124
137
  if self.metadata is None:
125
138
  payload = await self.resource.txn.get(
@@ -28,7 +28,4 @@ class InvalidPBClass(Exception):
28
28
  def __init__(self, source: Type, destination: Type):
29
29
  self.source = source
30
30
  self.destination = destination
31
- super().__init__(
32
- "Source and destination does not match "
33
- f"{self.source} - {self.destination}"
34
- )
31
+ super().__init__("Source and destination does not match " f"{self.source} - {self.destination}")
@@ -19,15 +19,14 @@
19
19
  #
20
20
  from typing import Any, Optional
21
21
 
22
- from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
23
-
24
22
  from nucliadb.ingest.fields.base import Field
23
+ from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FileExtractedData
25
24
  from nucliadb_utils.storages.storage import StorageField
26
25
 
27
26
  FILE_METADATA = "file_metadata"
28
27
 
29
28
 
30
- class File(Field):
29
+ class File(Field[FieldFile]):
31
30
  pbklass = FieldFile
32
31
  value: FieldFile
33
32
  type: str = "f"
@@ -52,15 +51,13 @@ class File(Field):
52
51
 
53
52
  is_external_file = payload.file.source == CloudFile.Source.EXTERNAL
54
53
  if not is_external_file:
55
- sf: StorageField = self.storage.file_field(
56
- self.kbid, self.uuid, self.id, old_cf
57
- )
54
+ sf: StorageField = self.storage.file_field(self.kbid, self.uuid, self.id, old_cf)
58
55
  cf: CloudFile = await self.storage.normalize_binary(payload.file, sf)
59
56
  payload.file.CopyFrom(cf)
60
57
 
61
58
  await self.db_set_value(payload)
62
59
 
63
- async def get_value(self) -> FieldFile:
60
+ async def get_value(self) -> Optional[FieldFile]:
64
61
  return await self.db_get_value()
65
62
 
66
63
  async def set_file_extracted_data(self, file_extracted_data: FileExtractedData):
@@ -80,17 +77,13 @@ class File(Field):
80
77
  cf_file_page_preview: CloudFile = await self.storage.normalize_binary(
81
78
  preview, sf_file_page_preview
82
79
  )
83
- file_extracted_data.file_pages_previews.pages[page].CopyFrom(
84
- cf_file_page_preview
85
- )
80
+ file_extracted_data.file_pages_previews.pages[page].CopyFrom(cf_file_page_preview)
86
81
 
87
82
  for fileid, origincf in file_extracted_data.file_generated.items():
88
83
  sf_generated: StorageField = self.storage.file_extracted(
89
84
  self.kbid, self.uuid, self.type, self.id, f"generated/{fileid}"
90
85
  )
91
- cf_generated: CloudFile = await self.storage.normalize_binary(
92
- origincf, sf_generated
93
- )
86
+ cf_generated: CloudFile = await self.storage.normalize_binary(origincf, sf_generated)
94
87
  file_extracted_data.file_generated[fileid].CopyFrom(cf_generated)
95
88
 
96
89
  if file_extracted_data.HasField("file_thumbnail"):
@@ -113,7 +106,5 @@ class File(Field):
113
106
  sf: StorageField = self.storage.file_extracted(
114
107
  self.kbid, self.uuid, self.type, self.id, FILE_METADATA
115
108
  )
116
- self.file_extracted_data = await self.storage.download_pb(
117
- sf, FileExtractedData
118
- )
109
+ self.file_extracted_data = await self.storage.download_pb(sf, FileExtractedData)
119
110
  return self.file_extracted_data