nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/resource.py

@@ -23,19 +23,33 @@ import asyncio
 import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
-from typing import TYPE_CHECKING, Any, AsyncIterator, Optional, Type
+from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
 
+from nucliadb.common import datamanagers
+from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
+from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
+from nucliadb.common.maindb.driver import Transaction
+from nucliadb.ingest.fields.base import Field
+from nucliadb.ingest.fields.conversation import Conversation
+from nucliadb.ingest.fields.file import File
+from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
+from nucliadb.ingest.fields.link import Link
+from nucliadb.ingest.fields.text import Text
+from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
+from nucliadb.ingest.orm.metrics import processor_observer
+from nucliadb_models import content_types
+from nucliadb_models.common import CloudLink
+from nucliadb_models.content_types import GENERIC_MIME_TYPE
+from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
-from nucliadb_protos.resources_pb2 import Basic
-from nucliadb_protos.resources_pb2 import Basic as PBBasic
-from nucliadb_protos.resources_pb2 import CloudFile
-from nucliadb_protos.resources_pb2 import Conversation as PBConversation
-from nucliadb_protos.resources_pb2 import Extra as PBExtra
 from nucliadb_protos.resources_pb2 import (
+    Basic,
+    CloudFile,
     ExtractedTextWrapper,
     ExtractedVectorsWrapper,
     FieldClassifications,
     FieldComputedMetadataWrapper,
+    FieldFile,
     FieldID,
     FieldMetadata,
     FieldQuestionAnswerWrapper,
@@ -44,41 +58,27 @@ from nucliadb_protos.resources_pb2 import (
     FileExtractedData,
     LargeComputedMetadataWrapper,
     LinkExtractedData,
+    Metadata,
+    Paragraph,
+    ParagraphAnnotation,
 )
-from nucliadb_protos.resources_pb2 import Metadata
+from nucliadb_protos.resources_pb2 import Basic as PBBasic
+from nucliadb_protos.resources_pb2 import Conversation as PBConversation
+from nucliadb_protos.resources_pb2 import Extra as PBExtra
 from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
 from nucliadb_protos.resources_pb2 import Origin as PBOrigin
-from nucliadb_protos.resources_pb2 import Paragraph, ParagraphAnnotation
 from nucliadb_protos.resources_pb2 import Relations as PBRelations
-from nucliadb_protos.resources_pb2 import UserVectorsWrapper
-from nucliadb_protos.train_pb2 import EnabledMetadata
-from nucliadb_protos.train_pb2 import Position as TrainPosition
 from nucliadb_protos.train_pb2 import (
+    EnabledMetadata,
     TrainField,
     TrainMetadata,
     TrainParagraph,
     TrainResource,
     TrainSentence,
 )
+from nucliadb_protos.train_pb2 import Position as TrainPosition
 from nucliadb_protos.utils_pb2 import Relation as PBRelation
 from nucliadb_protos.writer_pb2 import BrokerMessage
-
-from nucliadb.common.maindb.driver import Transaction
-from nucliadb.ingest.fields.base import Field
-from nucliadb.ingest.fields.conversation import Conversation
-from nucliadb.ingest.fields.date import Datetime
-from nucliadb.ingest.fields.file import File
-from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
-from nucliadb.ingest.fields.keywordset import Keywordset
-from nucliadb.ingest.fields.layout import Layout
-from nucliadb.ingest.fields.link import Link
-from nucliadb.ingest.fields.text import Text
-from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
-from nucliadb.ingest.orm.metrics import processor_observer
-from nucliadb.ingest.orm.utils import get_basic, set_basic
-from nucliadb_models.common import CloudLink
-from nucliadb_models.writer import GENERIC_MIME_TYPE
-from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_utils.storages.storage import Storage
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -86,41 +86,14 @@ if TYPE_CHECKING:  # pragma: no cover
 
 logger = logging.getLogger(__name__)
 
-KB_RESOURCE_ORIGIN = "/kbs/{kbid}/r/{uuid}/origin"
-KB_RESOURCE_EXTRA = "/kbs/{kbid}/r/{uuid}/extra"
-KB_RESOURCE_SECURITY = "/kbs/{kbid}/r/{uuid}/security"
-KB_RESOURCE_METADATA = "/kbs/{kbid}/r/{uuid}/metadata"
-KB_RESOURCE_RELATIONS = "/kbs/{kbid}/r/{uuid}/relations"
-KB_RESOURCE_FIELDS = "/kbs/{kbid}/r/{uuid}/f/"
-KB_RESOURCE_ALL_FIELDS = "/kbs/{kbid}/r/{uuid}/allfields"
-KB_RESOURCE_SLUG_BASE = "/kbs/{kbid}/s/"
-KB_RESOURCE_SLUG = f"{KB_RESOURCE_SLUG_BASE}{{slug}}"
-KB_RESOURCE_CONVERSATION = "/kbs/{kbid}/r/{uuid}/c/{page}"
-GLOBAL_FIELD = "a"
 KB_FIELDS: dict[int, Type] = {
-    FieldType.LAYOUT: Layout,
     FieldType.TEXT: Text,
     FieldType.FILE: File,
     FieldType.LINK: Link,
-    FieldType.DATETIME: Datetime,
-    FieldType.KEYWORDSET: Keywordset,
     FieldType.GENERIC: Generic,
     FieldType.CONVERSATION: Conversation,
 }
 
-KB_REVERSE: dict[str, FieldType.ValueType] = {
-    "l": FieldType.LAYOUT,
-    "t": FieldType.TEXT,
-    "f": FieldType.FILE,
-    "u": FieldType.LINK,
-    "d": FieldType.DATETIME,
-    "k": FieldType.KEYWORDSET,
-    "a": FieldType.GENERIC,
-    "c": FieldType.CONVERSATION,
-}
-
-FIELD_TYPE_TO_ID = {v: k for k, v in KB_REVERSE.items()}
-
 _executor = ThreadPoolExecutor(10)
 
 
@@ -131,6 +104,8 @@ PB_TEXT_FORMAT_TO_MIMETYPE = {
     FieldText.Format.MARKDOWN: "text/markdown",
     FieldText.Format.JSON: "application/json",
     FieldText.Format.KEEP_MARKDOWN: "text/markdown",
+    FieldText.Format.JSONL: "application/x-ndjson",
+    FieldText.Format.PLAIN_BLANKLINE_SPLIT: "text/plain+blankline",
 }
 
 BASIC_IMMUTABLE_FIELDS = ("icon",)
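
Note: the two added entries extend PB_TEXT_FORMAT_TO_MIMETYPE, a plain dict from the protobuf text-format enum to a MIME type. A minimal sketch of the lookup, assuming module scope in resource.py (the helper name and the text/plain fallback are illustrative, not part of the package):

    from nucliadb_protos.resources_pb2 import FieldText

    # Hypothetical helper; PB_TEXT_FORMAT_TO_MIMETYPE is the dict shown in the
    # hunk above, and the fallback value is an assumption.
    def mimetype_for_text_format(fmt: "FieldText.Format.ValueType") -> str:
        return PB_TEXT_FORMAT_TO_MIMETYPE.get(fmt, "text/plain")

    # With the entries added in 6.x:
    #   mimetype_for_text_format(FieldText.Format.JSONL) -> "application/x-ndjson"
    #   mimetype_for_text_format(FieldText.Format.PLAIN_BLANKLINE_SPLIT) -> "text/plain+blankline"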
@@ -179,32 +154,11 @@ class Resource:
         new_key = KB_RESOURCE_SLUG.format(kbid=self.kb.kbid, slug=basic.slug)
         await self.txn.set(new_key, self.uuid.encode())
 
-    @staticmethod
-    def parse_basic(payload: bytes) -> PBBasic:
-        pb = PBBasic()
-        if payload is None:
-            return None
-
-        pb.ParseFromString(payload)
-        return pb
-
-    async def exists(self) -> bool:
-        exists = True
-        if self.basic is None:
-            payload = await get_basic(self.txn, self.kb.kbid, self.uuid)
-            if payload is not None:
-                pb = PBBasic()
-                pb.ParseFromString(payload)
-                self.basic = pb
-            else:
-                exists = False
-        return exists
-
     # Basic
     async def get_basic(self) -> Optional[PBBasic]:
         if self.basic is None:
-            payload = await get_basic(self.txn, self.kb.kbid, self.uuid)
-            self.basic = self.parse_basic(payload) if payload is not None else PBBasic()
+            basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            self.basic = basic if basic is not None else PBBasic()
         return self.basic
 
     def set_processing_status(self, current_basic: PBBasic, basic_in_payload: PBBasic):
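
Note: this hunk sets the pattern for the rest of the file: hand-rolled key formatting plus protobuf parsing moves out of Resource and behind keyword-only helpers in nucliadb.common.datamanagers. A minimal sketch of what such a helper can look like (the key layout and body are assumptions for illustration; only the keyword-only call shape is taken from the diff):

    from typing import Optional

    from nucliadb_protos.resources_pb2 import Basic as PBBasic

    KB_RESOURCE_BASIC = "/kbs/{kbid}/r/{rid}"  # assumed key layout, for illustration

    async def get_basic(txn, *, kbid: str, rid: str) -> Optional[PBBasic]:
        # Encapsulates key formatting and protobuf decoding, so callers such as
        # Resource.get_basic() never touch raw maindb keys or payload bytes.
        payload = await txn.get(KB_RESOURCE_BASIC.format(kbid=kbid, rid=rid))
        if payload is None:
            return None
        basic = PBBasic()
        basic.ParseFromString(payload)
        return basic

The same shape repeats below for origin, extra, security, relations and the all-fields index.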
@@ -249,9 +203,7 @@ class Resource:
             fields.append(field_id)
             positions[field_id] = i
 
-        updated = [
-            self.basic.fieldmetadata[positions[field]] for field in fields
-        ]
+        updated = [self.basic.fieldmetadata[positions[field]] for field in fields]
 
         del self.basic.fieldmetadata[:]
         self.basic.fieldmetadata.extend(updated)
@@ -272,11 +224,10 @@ class Resource:
             self.indexer.apply_field_metadata(
                 field_id,
                 field_metadata,
-                replace_field=[],
-                replace_splits={},
                 page_positions=page_positions,
                 extracted_text=await field_obj.get_extracted_text(),
                 basic_user_field_metadata=user_field_metadata,
+                replace_field=True,
             )
 
         # Some basic fields are computed off field metadata.
@@ -284,27 +235,21 @@ class Resource:
         if deleted_fields is not None and len(deleted_fields) > 0:
             remove_field_classifications(self.basic, deleted_fields=deleted_fields)
 
-        await set_basic(self.txn, self.kb.kbid, self.uuid, self.basic)
+        await datamanagers.resources.set_basic(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, basic=self.basic
+        )
         self.modified = True
 
     # Origin
     async def get_origin(self) -> Optional[PBOrigin]:
         if self.origin is None:
-            pb = PBOrigin()
-            payload = await self.txn.get(
-                KB_RESOURCE_ORIGIN.format(kbid=self.kb.kbid, uuid=self.uuid)
-            )
-            if payload is None:
-                return None
-
-            pb.ParseFromString(payload)
-            self.origin = pb
+            origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            self.origin = origin
         return self.origin
 
     async def set_origin(self, payload: PBOrigin):
-        await self.txn.set(
-            KB_RESOURCE_ORIGIN.format(kbid=self.kb.kbid, uuid=self.uuid),
-            payload.SerializeToString(),
+        await datamanagers.resources.set_origin(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, origin=payload
         )
         self.modified = True
         self.origin = payload
@@ -312,42 +257,27 @@ class Resource:
     # Extra
     async def get_extra(self) -> Optional[PBExtra]:
         if self.extra is None:
-            pb = PBExtra()
-            payload = await self.txn.get(
-                KB_RESOURCE_EXTRA.format(kbid=self.kb.kbid, uuid=self.uuid)
-            )
-            if payload is None:
-                return None
-            pb.ParseFromString(payload)
-            self.extra = pb
+            extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
+            self.extra = extra
         return self.extra
 
     async def set_extra(self, payload: PBExtra):
-        key = KB_RESOURCE_EXTRA.format(kbid=self.kb.kbid, uuid=self.uuid)
-        await self.txn.set(
-            key,
-            payload.SerializeToString(),
-        )
+        await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
        self.modified = True
         self.extra = payload
 
     # Security
     async def get_security(self) -> Optional[utils_pb2.Security]:
         if self.security is None:
-            pb = utils_pb2.Security()
-            key = KB_RESOURCE_SECURITY.format(kbid=self.kb.kbid, uuid=self.uuid)
-            payload = await self.txn.get(key)
-            if payload is None:
-                return None
-            pb.ParseFromString(payload)
-            self.security = pb
+            security = await datamanagers.resources.get_security(
+                self.txn, kbid=self.kb.kbid, rid=self.uuid
+            )
+            self.security = security
         return self.security
 
     async def set_security(self, payload: utils_pb2.Security) -> None:
-        key = KB_RESOURCE_SECURITY.format(kbid=self.kb.kbid, uuid=self.uuid)
-        await self.txn.set(
-            key,
-            payload.SerializeToString(),
+        await datamanagers.resources.set_security(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, security=payload
         )
         self.modified = True
         self.security = payload
@@ -355,29 +285,24 @@ class Resource:
     # Relations
     async def get_relations(self) -> Optional[PBRelations]:
         if self.relations is None:
-            pb = PBRelations()
-            payload = await self.txn.get(
-                KB_RESOURCE_RELATIONS.format(kbid=self.kb.kbid, uuid=self.uuid)
+            relations = await datamanagers.resources.get_relations(
+                self.txn, kbid=self.kb.kbid, rid=self.uuid
             )
-            if payload is None:
-                return None
-            pb.ParseFromString(payload)
-            self.relations = pb
+            self.relations = relations
         return self.relations
 
     async def set_relations(self, payload: list[PBRelation]):
         relations = PBRelations()
         for relation in payload:
             relations.relations.append(relation)
-        await self.txn.set(
-            KB_RESOURCE_RELATIONS.format(kbid=self.kb.kbid, uuid=self.uuid),
-            relations.SerializeToString(),
+        await datamanagers.resources.set_relations(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, relations=relations
         )
         self.modified = True
         self.relations = relations
 
     @processor_observer.wrap({"type": "generate_index_message"})
-    async def generate_index_message(self) -> ResourceBrain:
+    async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
         brain = ResourceBrain(rid=self.uuid)
         origin = await self.get_origin()
         basic = await self.get_basic()
@@ -387,7 +312,7 @@ class Resource:
         await self.compute_global_tags(brain)
         fields = await self.get_fields(force=True)
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             await self.compute_global_text_field(fieldid, brain)
 
             field_metadata = await field.get_field_metadata()
@@ -403,251 +328,66 @@ class Resource:
                 (
                     fm
                     for fm in basic.fieldmetadata
-                    if fm.field.field == field_id
-                    and fm.field.field_type == type_id
+                    if fm.field.field == field_id and fm.field.field_type == type_id
                 ),
                 None,
             )
             brain.apply_field_metadata(
                 field_key,
                 field_metadata,
-                replace_field=[],
-                replace_splits={},
                 page_positions=page_positions,
                 extracted_text=await field.get_extracted_text(),
                 basic_user_field_metadata=user_field_metadata,
+                replace_field=reindex,
             )
 
             if self.disable_vectors is False:
+                # XXX: while we don't remove the "default" vectorset concept, we
+                # need to do use None as the default one
                 vo = await field.get_vectors()
                 if vo is not None:
-                    brain.apply_field_vectors(field_key, vo, False, [])
+                    async with datamanagers.with_ro_transaction() as ro_txn:
+                        dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
+                            ro_txn, kbid=self.kb.kbid
+                        )
+                    brain.apply_field_vectors(
+                        field_key,
+                        vo,
+                        matryoshka_vector_dimension=dimension,
+                        replace_field=reindex,
+                    )
 
-                vu = await field.get_user_vectors()
-                if vu is not None:
-                    vectors_to_delete = {}  # type: ignore
-                    brain.apply_user_vectors(field_key, vu, vectors_to_delete)  # type: ignore
+                vectorset_configs = []
+                async with datamanagers.with_ro_transaction() as ro_txn:
+                    async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
+                        ro_txn, kbid=self.kb.kbid
+                    ):
+                        vectorset_configs.append(vectorset_config)
+                for vectorset_config in vectorset_configs:
+                    vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
+                    if vo is not None:
+                        dimension = vectorset_config.vectorset_index_config.vector_dimension
+                        brain.apply_field_vectors(
+                            field_key,
+                            vo,
+                            vectorset=vectorset_config.vectorset_id,
+                            matryoshka_vector_dimension=dimension,
+                            replace_field=reindex,
+                        )
         return brain
 
-    async def generate_field_vectors(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        vo = await field.get_vectors()
-        if vo is None:
-            return
-        evw = ExtractedVectorsWrapper()
-        evw.field.field = field_id
-        evw.field.field_type = type_id  # type: ignore
-        evw.vectors.CopyFrom(vo)
-        bm.field_vectors.append(evw)
-
-    async def generate_user_vectors(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        uv = await field.get_user_vectors()
-        if uv is None:
-            return
-        uvw = UserVectorsWrapper()
-        uvw.field.field = field_id
-        uvw.field.field_type = type_id  # type: ignore
-        uvw.vectors.CopyFrom(uv)
-        bm.user_vectors.append(uvw)
-
-    async def generate_field_large_computed_metadata(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        lcm = await field.get_large_field_metadata()
-        if lcm is None:
-            return
-        lcmw = LargeComputedMetadataWrapper()
-        lcmw.field.field = field_id
-        lcmw.field.field_type = type_id  # type: ignore
-        lcmw.real.CopyFrom(lcm)
-        bm.field_large_metadata.append(lcmw)
-
-    async def generate_field_computed_metadata(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        fcmw = FieldComputedMetadataWrapper()
-        fcmw.field.field = field_id
-        fcmw.field.field_type = type_id  # type: ignore
-
-        field_metadata = await field.get_field_metadata()
-        if field_metadata is not None:
-            fcmw.metadata.CopyFrom(field_metadata)
-            fcmw.field.field = field_id
-            fcmw.field.field_type = type_id  # type: ignore
-            bm.field_metadata.append(fcmw)
-            # Make sure cloud files are removed for exporting
-
-    async def generate_extracted_text(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        etw = ExtractedTextWrapper()
-        etw.field.field = field_id
-        etw.field.field_type = type_id  # type: ignore
-        extracted_text = await field.get_extracted_text()
-        if extracted_text is not None:
-            etw.body.CopyFrom(extracted_text)
-            bm.extracted_text.append(etw)
-
-    async def generate_field(
-        self,
-        bm: BrokerMessage,
-        type_id: FieldType.ValueType,
-        field_id: str,
-        field: Field,
-    ):
-        # Used for exporting a field
-        if type_id == FieldType.TEXT:
-            value = await field.get_value()
-            bm.texts[field_id].CopyFrom(value)
-        elif type_id == FieldType.LINK:
-            value = await field.get_value()
-            bm.links[field_id].CopyFrom(value)
-        elif type_id == FieldType.FILE:
-            value = await field.get_value()
-            bm.files[field_id].CopyFrom(value)
-        elif type_id == FieldType.CONVERSATION:
-            value = await self.get_full_conversation(field)  # type: ignore
-            bm.conversations[field_id].CopyFrom(value)
-        elif type_id == FieldType.KEYWORDSET:
-            value = await field.get_value()
-            bm.keywordsets[field_id].CopyFrom(value)
-        elif type_id == FieldType.DATETIME:
-            value = await field.get_value()
-            bm.datetimes[field_id].CopyFrom(value)
-        elif type_id == FieldType.LAYOUT:
-            value = await field.get_value()
-            bm.layouts[field_id].CopyFrom(value)
-
-    async def get_full_conversation(
-        self,
-        conversation_field: Conversation,
-    ) -> Optional[PBConversation]:
-        """
-        Messages of a conversations may be stored across several pages.
-        This method fetches them all and returns a single complete conversation.
-        """
-        full_conv = PBConversation()
-        n_page = 1
-        while True:
-            page = await conversation_field.get_value(page=n_page)
-            if page is None:
-                break
-            full_conv.messages.extend(page.messages)
-            n_page += 1
-        return full_conv
-
-    async def generate_broker_message(self) -> BrokerMessage:
-        # full means downloading all the pointers
-        # minuts the ones to external files that are not PB
-        # Go for all fields and recreate brain
-        bm = BrokerMessage()
-        bm.kbid = self.kb.kbid
-        bm.uuid = self.uuid
-        basic = await self.get_basic()
-        if basic is not None:
-            bm.basic.CopyFrom(basic)
-            bm.slug = bm.basic.slug
-        origin = await self.get_origin()
-        if origin is not None:
-            bm.origin.CopyFrom(origin)
-        relations = await self.get_relations()
-        if relations is not None:
-            for relation in relations.relations:
-                bm.relations.append(relation)
-
-        fields = await self.get_fields(force=True)
-        for (type_id, field_id), field in fields.items():
-            # Value
-            await self.generate_field(bm, type_id, field_id, field)
-
-            # Extracted text
-            await self.generate_extracted_text(bm, type_id, field_id, field)
-
-            # Field Computed Metadata
-            await self.generate_field_computed_metadata(bm, type_id, field_id, field)
-
-            if type_id == FieldType.FILE and isinstance(field, File):
-                field_extracted_data = await field.get_file_extracted_data()
-                if field_extracted_data is not None:
-                    bm.file_extracted_data.append(field_extracted_data)
-
-            elif type_id == FieldType.LINK and isinstance(field, Link):
-                link_extracted_data = await field.get_link_extracted_data()
-                if link_extracted_data is not None:
-                    bm.link_extracted_data.append(link_extracted_data)
-
-            # Field vectors
-            await self.generate_field_vectors(bm, type_id, field_id, field)
-
-            # User vectors
-            await self.generate_user_vectors(bm, type_id, field_id, field)
-
-            # Large metadata
-            await self.generate_field_large_computed_metadata(
-                bm, type_id, field_id, field
-            )
-
-        return bm
-
     # Fields
-    async def get_fields(
-        self, force: bool = False
-    ) -> dict[tuple[FieldType.ValueType, str], Field]:
+    async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
         # Get all fields
         for type, field in await self.get_fields_ids(force=force):
             if (type, field) not in self.fields:
                 self.fields[(type, field)] = await self.get_field(field, type)
         return self.fields
 
-    async def _deprecated_scan_fields_ids(
-        self,
-    ) -> AsyncIterator[tuple[FieldType.ValueType, str]]:
-        logger.warning("Scanning fields ids. This is not optimal.")
-        prefix = KB_RESOURCE_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
-        allfields = set()
-        async for key in self.txn.keys(prefix, count=-1):
-            # The [6:8] `slicing purpose is to match exactly the two
-            # splitted parts corresponding to type and field, and nothing else!
-            type, field = key.split("/")[6:8]
-            type_id = KB_REVERSE.get(type)
-            if type_id is None:
-                raise AttributeError("Invalid field type")
-            result = (type_id, field)
-            if result not in allfields:
-                # fields can have errors that are stored in a subkey:
-                # - field key -> kbs/kbid/r/ruuid/f/myfield
-                # - field error key -> kbs/kbid/r/ruuid/f/myfield/errors
-                # and that would return duplicates here.
-                yield result
-            allfields.add(result)
-
     async def _inner_get_fields_ids(self) -> list[tuple[FieldType.ValueType, str]]:
         # Use a set to make sure we don't have duplicate field ids
         result = set()
-        all_fields = await self.get_all_field_ids()
+        all_fields = await self.get_all_field_ids(for_update=False)
         if all_fields is not None:
             for f in all_fields.fields:
                 result.add((f.field_type, f.field))
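
Note: two behavioral changes land in the hunk above. generate_index_message gains a reindex flag that is forwarded as replace_field (collapsing the old replace_field=[] / replace_splits={} bookkeeping into one boolean), and extracted vectors are now resolved per vectorset, with a matryoshka dimension looked up from the KB or vectorset config. A hedged caller-side sketch of the new flag (the caller is an assumption; the signature comes from the diff):

    # `resource` is assumed to be a loaded nucliadb.ingest.orm.resource.Resource.
    async def build_reindex_message(resource):
        # With reindex=True, every field and its vectors are applied with
        # replace_field set, so previously indexed data for each field is
        # replaced wholesale instead of appended to.
        return await resource.generate_index_message(reindex=True)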
@@ -664,9 +404,7 @@ class Resource:
             result.add((FieldType.GENERIC, generic))
         return list(result)
 
-    async def get_fields_ids(
-        self, force: bool = False
-    ) -> list[tuple[FieldType.ValueType, str]]:
+    async def get_fields_ids(self, force: bool = False) -> list[tuple[FieldType.ValueType, str]]:
         """
         Get all ids of the fields of the resource and cache them.
         """
@@ -710,32 +448,26 @@ class Resource:
         if field in self.all_fields_keys:
             self.all_fields_keys.remove(field)
 
-        field_key = self.generate_field_id(FieldID(field_type=type, field=key))  # type: ignore
-        vo = await field_obj.get_vectors()
-        if vo is not None:
-            self.indexer.delete_vectors(field_key=field_key, vo=vo)
+        field_key = self.generate_field_id(FieldID(field_type=type, field=key))
 
         metadata = await field_obj.get_field_metadata()
         if metadata is not None:
-            self.indexer.delete_metadata(field_key=field_key, metadata=metadata)
+            self.indexer.delete_field(field_key=field_key)
 
         await field_obj.delete()
 
     def has_field(self, type: FieldType.ValueType, field: str) -> bool:
         return (type, field) in self.fields
 
-    async def get_all_field_ids(self) -> Optional[PBAllFieldIDs]:
-        key = KB_RESOURCE_ALL_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
-        payload = await self.txn.get(key)
-        if payload is None:
-            return None
-        all_fields = PBAllFieldIDs()
-        all_fields.ParseFromString(payload)
-        return all_fields
+    async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
+        return await datamanagers.resources.get_all_field_ids(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
+        )
 
     async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
-        key = KB_RESOURCE_ALL_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
-        await self.txn.set(key, all_fields.SerializeToString())
+        return await datamanagers.resources.set_all_field_ids(
+            self.txn, kbid=self.kb.kbid, rid=self.uuid, allfields=all_fields
+        )
 
     async def update_all_field_ids(
         self,
@@ -745,7 +477,7 @@ class Resource:
         errors: Optional[list[writer_pb2.Error]] = None,
     ):
         needs_update = False
-        all_fields = await self.get_all_field_ids()
+        all_fields = await self.get_all_field_ids(for_update=True)
         if all_fields is None:
             needs_update = True
             all_fields = PBAllFieldIDs()
@@ -772,26 +504,12 @@ class Resource:
     @processor_observer.wrap({"type": "apply_fields"})
     async def apply_fields(self, message: BrokerMessage):
         message_updated_fields = []
-        for field, layout in message.layouts.items():
-            fid = FieldID(field_type=FieldType.LAYOUT, field=field)
-            await self.set_field(fid.field_type, fid.field, layout)
-            message_updated_fields.append(fid)
 
         for field, text in message.texts.items():
             fid = FieldID(field_type=FieldType.TEXT, field=field)
             await self.set_field(fid.field_type, fid.field, text)
             message_updated_fields.append(fid)
 
-        for field, keywordset in message.keywordsets.items():
-            fid = FieldID(field_type=FieldType.KEYWORDSET, field=field)
-            await self.set_field(fid.field_type, fid.field, keywordset)
-            message_updated_fields.append(fid)
-
-        for field, datetimeobj in message.datetimes.items():
-            fid = FieldID(field_type=FieldType.DATETIME, field=field)
-            await self.set_field(fid.field_type, fid.field, datetimeobj)
-            message_updated_fields.append(fid)
-
         for field, link in message.links.items():
             fid = FieldID(field_type=FieldType.LINK, field=field)
             await self.set_field(fid.field_type, fid.field, link)
@@ -810,13 +528,11 @@ class Resource:
         for fieldid in message.delete_fields:
             await self.delete_field(fieldid.field_type, fieldid.field)
 
-        if (
-            len(message_updated_fields)
-            or len(message.delete_fields)
-            or len(message.errors)
-        ):
+        if len(message_updated_fields) or len(message.delete_fields) or len(message.errors):
             await self.update_all_field_ids(
-                updated=message_updated_fields, deleted=message.delete_fields, errors=message.errors  # type: ignore
+                updated=message_updated_fields,
+                deleted=message.delete_fields,  # type: ignore
+                errors=message.errors,  # type: ignore
             )
 
     @processor_observer.wrap({"type": "apply_extracted"})
@@ -852,13 +568,15 @@ class Resource:
 
         for link_extracted_data in message.link_extracted_data:
             await self._apply_link_extracted_data(link_extracted_data)
-            await self.maybe_update_title_metadata(link_extracted_data)
+            await self.maybe_update_resource_title_from_link(link_extracted_data)
             extracted_languages.append(link_extracted_data.language)
 
         for file_extracted_data in message.file_extracted_data:
             await self._apply_file_extracted_data(file_extracted_data)
             extracted_languages.append(file_extracted_data.language)
 
+        await self.maybe_update_resource_title_from_file_extracted_data(message)
+
         # Metadata should go first
         for field_metadata in message.field_metadata:
             await self._apply_field_computed_metadata(field_metadata)
@@ -869,10 +587,9 @@ class Resource:
         # Upload to binary storage
         # Vector indexing
         if self.disable_vectors is False:
+            await self.get_fields(force=True)
             for field_vectors in message.field_vectors:
                 await self._apply_extracted_vectors(field_vectors)
-            for user_vectors in message.user_vectors:
-                await self._apply_user_vectors(user_vectors)
 
         # Only uploading to binary storage
         for field_large_metadata in message.field_large_metadata:
@@ -896,9 +613,7 @@ class Resource:
                 extracted_text.field,
             )
 
-    async def _apply_question_answers(
-        self, question_answers: FieldQuestionAnswerWrapper
-    ):
+    async def _apply_question_answers(self, question_answers: FieldQuestionAnswerWrapper):
         field = question_answers.field
         field_obj = await self.get_field(field.field, field.field_type, load=False)
         await field_obj.set_question_answers(question_answers)
@@ -918,19 +633,27 @@ class Resource:
 
         maybe_update_basic_summary(self.basic, link_extracted_data.description)
 
-    async def maybe_update_title_metadata(self, link_extracted_data: LinkExtractedData):
+    async def maybe_update_resource_title_from_link(self, link_extracted_data: LinkExtractedData):
+        """
+        When parsing link extracted data, we want to replace the resource title for the first link
+        that gets processed and has a title, and only if the current title is a URL, which we take
+        as a hint that the title was not set by the user.
+        """
         assert self.basic is not None
         if not link_extracted_data.title:
             return
         if not (self.basic.title.startswith("http") or self.basic.title == ""):
             return
-
         title = link_extracted_data.title
-        self.basic.title = title
+        await self.update_resource_title(title)
+
+    async def update_resource_title(self, computed_title: str) -> None:
+        assert self.basic is not None
+        self.basic.title = computed_title
         # Extracted text
         field = await self.get_field("title", FieldType.GENERIC, load=False)
         etw = ExtractedTextWrapper()
-        etw.body.text = title
+        etw.body.text = computed_title
         await field.set_extracted_text(etw)
 
         # Field computed metadata
@@ -942,11 +665,8 @@ class Resource:
         fcm = await field.get_field_metadata(force=True)
         if fcm is not None:
             fcmw.metadata.CopyFrom(fcm)
-
         fcmw.metadata.metadata.ClearField("paragraphs")
-        paragraph = Paragraph(
-            start=0, end=len(title), kind=Paragraph.TypeParagraph.TITLE
-        )
+        paragraph = Paragraph(start=0, end=len(computed_title), kind=Paragraph.TypeParagraph.TITLE)
         fcmw.metadata.metadata.paragraphs.append(paragraph)
 
         await field.set_field_metadata(fcmw)
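
Note: the renamed maybe_update_resource_title_from_link keeps the old guard: only a title that looks unset, i.e. empty or URL-like, is eligible for replacement. A dependency-free restatement of that predicate (the function name is illustrative; the two conditions mirror the hunk):

    def link_title_replaceable(current_title: str) -> bool:
        # A URL or empty title is taken as a hint the user never set one.
        return current_title == "" or current_title.startswith("http")

    assert link_title_replaceable("")
    assert link_title_replaceable("https://example.com/article")
    assert not link_title_replaceable("A title chosen by the user")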
@@ -963,9 +683,54 @@ class Resource:
         maybe_update_basic_icon(self.basic, file_extracted_data.icon)
         maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
 
-    async def _apply_field_computed_metadata(
-        self, field_metadata: FieldComputedMetadataWrapper
-    ):
+    async def _should_update_resource_title_from_file_metadata(self) -> bool:
+        """
+        We only want to update resource title from file metadata if the title is empty,
+        equal to the resource uuid or equal to any of the file filenames in the resource.
+        """
+        basic = await self.get_basic()
+        if basic is None:
+            return True
+        current_title = basic.title
+        if current_title == "":
+            # If the title is empty, we should update it
+            return True
+        if current_title == self.uuid:
+            # If the title is the same as the resource uuid, we should update it
+            return True
+        fields = await self.get_fields(force=True)
+        filenames = set()
+        for (field_type, _), field_obj in fields.items():
+            if field_type == FieldType.FILE:
+                field_value: Optional[FieldFile] = await field_obj.get_value()
+                if field_value is not None:
+                    if field_value.file.filename not in ("", None):
+                        filenames.add(field_value.file.filename)
+        if current_title in filenames:
+            # If the title is equal to any of the file filenames, we should update it
+            return True
+        return False
+
+    async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
+        """
+        Update the resource title with the first file that has a title extracted.
+        """
+        if not await self._should_update_resource_title_from_file_metadata():
+            return
+        for fed in message.file_extracted_data:
+            if fed.title == "":
+                # Skip if the extracted title is empty
+                continue
+            fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
+            logger.info(
+                "Updating resource title from file extracted data",
+                extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
+            )
+            await self.update_resource_title(fed.title)
+            # Break after the first file with a title is found
+            break
+
+    async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
         assert self.basic is not None
         maybe_update_basic_summary(self.basic, field_metadata.metadata.metadata.summary)
 
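Review note: `_should_update_resource_title_from_file_metadata` treats three title states as replaceable: empty, equal to the resource uuid, or equal to one of the file fields' filenames. A self-contained sketch of that decision over plain values (the real method pulls `basic`, the uuid, and the file fields from storage):

```python
def should_update_title(current_title: str, resource_uuid: str, filenames: set[str]) -> bool:
    if current_title == "":
        return True  # no title was ever set
    if current_title == resource_uuid:
        return True  # default title assigned at creation time
    # A title matching an uploaded filename is considered auto-generated too
    return current_title in filenames

assert should_update_title("report.pdf", "a1b2c3", {"report.pdf"}) is True
assert should_update_title("Q3 financial report", "a1b2c3", {"report.pdf"}) is False
```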
@@ -974,17 +739,11 @@ class Resource:
             field_metadata.field.field_type,
             load=False,
         )
-        (
-            metadata,
-            replace_field,
-            replace_splits,
-        ) = await field_obj.set_field_metadata(field_metadata)
+        metadata = await field_obj.set_field_metadata(field_metadata)
         field_key = self.generate_field_id(field_metadata.field)
 
         page_positions: Optional[FilePagePositions] = None
-        if field_metadata.field.field_type == FieldType.FILE and isinstance(
-            field_obj, File
-        ):
+        if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
             page_positions = await get_file_page_positions(field_obj)
 
         user_field_metadata = next(
@@ -1002,29 +761,24 @@ class Resource:
             self.indexer.apply_field_metadata,
             field_key,
             metadata,
-            replace_field=replace_field,
-            replace_splits=replace_splits,
             page_positions=page_positions,
             extracted_text=extracted_text,
             basic_user_field_metadata=user_field_metadata,
+            replace_field=True,
         )
         loop = asyncio.get_running_loop()
         await loop.run_in_executor(_executor, apply_field_metadata)
 
-        maybe_update_basic_thumbnail(
-            self.basic, field_metadata.metadata.metadata.thumbnail
-        )
+        maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
 
         add_field_classifications(self.basic, field_metadata)
 
     async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
-        if not self.has_field(
-            field_vectors.field.field_type, field_vectors.field.field
-        ):
+        # Store vectors in the resource
+
+        if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
             # skipping because field does not exist
-            logger.warning(
-                f'Field "{field_vectors.field.field}" does not exist, skipping vectors'
-            )
+            logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
             return
 
         field_obj = await self.get_field(
@@ -1032,49 +786,44 @@ class Resource:
             field_vectors.field.field_type,
             load=False,
         )
-        (
-            vo,
-            replace_field_sentences,
-            replace_splits_sentences,
-        ) = await field_obj.set_vectors(field_vectors)
+        vo = await field_obj.set_vectors(field_vectors)
+
+        # Prepare vectors to be indexed
+
         field_key = self.generate_field_id(field_vectors.field)
         if vo is not None:
-            apply_field_vectors = partial(
+            vectorset_id = field_vectors.vectorset_id or None
+            if vectorset_id is None:
+                dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
+                    self.txn, kbid=self.kb.kbid
+                )
+            else:
+                config = await datamanagers.vectorsets.get(
+                    self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
+                )
+                if config is None:
+                    logger.warning(
+                        f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
+                    )
+                    return
+                dimension = config.vectorset_index_config.vector_dimension
+                if not dimension:
+                    raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
+
+            apply_field_vectors_partial = partial(
                 self.indexer.apply_field_vectors,
                 field_key,
                 vo,
-                replace_field_sentences,
-                replace_splits_sentences,
+                vectorset=vectorset_id,
+                replace_field=True,
+                matryoshka_vector_dimension=dimension,
            )
             loop = asyncio.get_running_loop()
-            await loop.run_in_executor(_executor, apply_field_vectors)
+            await loop.run_in_executor(_executor, apply_field_vectors_partial)
         else:
             raise AttributeError("VO not found on set")
 
-    async def _apply_user_vectors(self, user_vectors: UserVectorsWrapper):
-        field_obj = await self.get_field(
-            user_vectors.field.field,
-            user_vectors.field.field_type,
-            load=False,
-        )
-        uv, vectors_to_delete = await field_obj.set_user_vectors(user_vectors)
-        field_key = self.generate_field_id(user_vectors.field)
-        if uv is not None:
-            # We need to make sure that the vectors replaced are not on the new vectors
-            # So we extend the vectors to delete with the one replaced by the update
-            for vectorset, vectors in vectors_to_delete.items():
-                for vector in vectors.vectors:
-                    if vector not in user_vectors.vectors_to_delete[vectorset].vectors:
-                        user_vectors.vectors_to_delete[vectorset].vectors.append(vector)
-            self.indexer.apply_user_vectors(
-                field_key, uv, user_vectors.vectors_to_delete
-            )
-        else:
-            raise AttributeError("User Vectors not found on set")
-
-    async def _apply_field_large_metadata(
-        self, field_large_metadata: LargeComputedMetadataWrapper
-    ):
+    async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
         field_obj = await self.get_field(
             field_large_metadata.field.field,
             field_large_metadata.field.field_type,
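
Review note on the `_apply_extracted_vectors` hunk above: instead of the old replace-field/replace-splits tuple, the indexer now needs the vector dimension, resolved per vectorset. Roughly: no `vectorset_id` falls back to the KB-wide matryoshka dimension; a known vectorset contributes its configured dimension; an unknown one aborts indexing. A sketch of that branching, with a plain dict standing in for the `datamanagers.vectorsets` lookup:

```python
from typing import Optional

def resolve_vector_dimension(
    vectorset_id: Optional[str],
    kb_matryoshka_dimension: Optional[int],
    vectorset_dimensions: dict[str, int],  # vectorset_id -> configured dimension
) -> Optional[int]:
    if vectorset_id is None:
        # Legacy/default vectors: fall back to the KB-level matryoshka setting.
        return kb_matryoshka_dimension
    if vectorset_id not in vectorset_dimensions:
        return None  # caller logs a warning and skips these vectors
    dimension = vectorset_dimensions[vectorset_id]
    if not dimension:
        raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
    return dimension
```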
@@ -1083,7 +832,7 @@ class Resource:
         await field_obj.set_large_field_metadata(field_large_metadata)
 
     def generate_field_id(self, field: FieldID) -> str:
-        return f"{FIELD_TYPE_TO_ID[field.field_type]}/{field.field}"
+        return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
 
     async def compute_security(self, brain: ResourceBrain):
         security = await self.get_security()
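
Review note: only the lookup table changes here (`FIELD_TYPE_TO_ID` becomes `FIELD_TYPE_PB_TO_STR`); the produced key shape stays `<type-abbreviation>/<field-name>`. Illustrative only, with a hypothetical subset of the mapping:

```python
# Hypothetical subset of the mapping; the abbreviations shown are illustrative.
FIELD_TYPE_PB_TO_STR = {"FILE": "f", "LINK": "u", "TEXT": "t"}

def generate_field_id(field_type: str, field: str) -> str:
    return f"{FIELD_TYPE_PB_TO_STR[field_type]}/{field}"

assert generate_field_id("FILE", "upload") == "f/upload"
```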
@@ -1102,7 +851,7 @@ class Resource:
         brain.set_resource_metadata(basic=basic, origin=origin)
         for type, field in await self.get_fields_ids(force=True):
             fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)  # type: ignore
+            fieldid = FieldID(field_type=type, field=field)
             fieldkey = self.generate_field_id(fieldid)
             extracted_metadata = await fieldobj.get_field_metadata()
             valid_user_field_metadata = None
@@ -1113,16 +862,16 @@ class Resource:
                 ):
                     valid_user_field_metadata = user_field_metadata
                     break
+
+            generated_by = await fieldobj.generated_by()
             brain.apply_field_labels(
                 fieldkey,
                 extracted_metadata,
                 self.uuid,
+                generated_by,
                 basic.usermetadata,
                 valid_user_field_metadata,
             )
-            if type == FieldType.KEYWORDSET:
-                field_data = await fieldobj.db_get_value()
-                brain.process_keywordset_fields(fieldkey, field_data)
 
     @processor_observer.wrap({"type": "compute_global_text"})
     async def compute_global_text(self):
@@ -1159,12 +908,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[
-                    annotationparagraph.key
-                ] = annotationparagraph
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1179,9 +926,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1192,7 +937,7 @@ class Resource:
 
                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities.update(field_metadata.ner)
+                    _update_entities_dict(entities, field_metadata)
 
                 precomputed_vectors = {}
                 if vo is not None:
@@ -1203,9 +948,7 @@ class Resource:
                     vectors = vo.vectors
                     base_vector_key = f"{self.uuid}/{field_key}"
                     for index, vector in enumerate(vectors.vectors):
-                        vector_key = (
-                            f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                        )
+                        vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                         precomputed_vectors[vector_key] = vector.vector
 
                 if extracted_text is not None:
@@ -1216,11 +959,11 @@ class Resource:
 
                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
 
                     if enabled_metadata.labels:
                         metadata.labels.ClearField("field")
@@ -1234,7 +977,9 @@ class Resource:
                         if subfield is not None:
                             sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                         else:
-                            sentence_key = f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                            sentence_key = (
+                                f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                            )
 
                     if vo is not None:
                         metadata.ClearField("vector")
@@ -1273,12 +1018,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[
-                    annotationparagraph.key
-                ] = annotationparagraph
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1289,9 +1032,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1302,7 +1043,7 @@ class Resource:
 
                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities.update(field_metadata.ner)
+                    _update_entities_dict(entities, field_metadata)
 
                 if extracted_text is not None:
                     if subfield is not None:
@@ -1312,11 +1053,11 @@ class Resource:
 
                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
 
                     if enabled_metadata.labels:
                         metadata.labels.ClearField("paragraph")
@@ -1344,9 +1085,7 @@ class Resource:
 
             yield pb_paragraph
 
-    async def iterate_fields(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainField]:
+    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1356,7 +1095,7 @@ class Resource:
             metadata.labels.resource.extend(self.basic.usermetadata.classifications)
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             fm = await field.get_field_metadata()
             extracted_text = None
 
@@ -1366,9 +1105,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1385,7 +1122,7 @@ class Resource:
 
             if enabled_metadata.entities:
                 metadata.ClearField("entities")
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)
 
             pb_field = TrainField()
             pb_field.uuid = self.uuid
@@ -1393,9 +1130,7 @@ class Resource:
             pb_field.metadata.CopyFrom(metadata)
             yield pb_field
 
-    async def generate_train_resource(
-        self, enabled_metadata: EnabledMetadata
-    ) -> TrainResource:
+    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1422,9 +1157,7 @@ class Resource:
            if fm is None:
                continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1433,7 +1166,7 @@ class Resource:
                 metadata.labels.field.extend(splitted_metadata.classifications)
 
             if enabled_metadata.entities:
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)
 
         pb_resource = TrainResource()
         pb_resource.uuid = self.uuid
@@ -1462,33 +1195,35 @@ def remove_field_classifications(basic: PBBasic, deleted_fields: list[FieldID]):
     Clean classifications of fields that have been deleted
     """
     field_classifications = [
-        fc
-        for fc in basic.computedmetadata.field_classifications
-        if fc.field not in deleted_fields
+        fc for fc in basic.computedmetadata.field_classifications if fc.field not in deleted_fields
     ]
     basic.computedmetadata.ClearField("field_classifications")
     basic.computedmetadata.field_classifications.extend(field_classifications)
 
 
-def add_field_classifications(
-    basic: PBBasic, fcmw: FieldComputedMetadataWrapper
-) -> bool:
+def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper) -> bool:
     """
     Returns whether some new field classifications were added
     """
-    if len(fcmw.metadata.metadata.classifications) == 0:
+    if len(fcmw.metadata.metadata.classifications) == 0 and all(
+        len(split.classifications) == 0 for split in fcmw.metadata.split_metadata.values()
+    ):
         return False
+
     remove_field_classifications(basic, [fcmw.field])
     fcfs = FieldClassifications()
     fcfs.field.CopyFrom(fcmw.field)
     fcfs.classifications.extend(fcmw.metadata.metadata.classifications)
+
+    for split_id, split in fcmw.metadata.split_metadata.items():
+        if split_id not in fcmw.metadata.deleted_splits:
+            fcfs.classifications.extend(split.classifications)
+
     basic.computedmetadata.field_classifications.append(fcfs)
     return True
 
 
-def add_entities_to_metadata(
-    entities: dict[str, str], local_text: str, metadata: TrainMetadata
-) -> None:
+def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
     for entity_key, entity_value in entities.items():
         if entity_key not in local_text:
             # Add the entity only if found in text
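
Review note: `add_field_classifications` above now aggregates classifications from split metadata as well, skipping splits marked as deleted, and only bails out early when the main metadata and every split are empty. The same logic, sketched over plain lists:

```python
def collect_classifications(
    main: list[str], splits: dict[str, list[str]], deleted_splits: set[str]
) -> list[str]:
    # Early exit mirrors the new condition: nothing to add only if everything is empty.
    if not main and all(not cls for cls in splits.values()):
        return []
    collected = list(main)
    for split_id, cls in splits.items():
        if split_id not in deleted_splits:
            collected.extend(cls)
    return collected

assert collect_classifications([], {"m1": ["spam"], "m2": ["ham"]}, {"m2"}) == ["spam"]
assert collect_classifications([], {"m1": []}, set()) == []
```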
@@ -1502,9 +1237,7 @@ def add_entities_to_metadata(
         for _ in range(local_text.count(entity_key)):
             start = local_text.index(entity_key, last_occurrence_end)
             end = start + len(entity_key)
-            metadata.entity_positions[poskey].positions.append(
-                TrainPosition(start=start, end=end)
-            )
+            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
             last_occurrence_end = end
 
 
@@ -1519,15 +1252,22 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True
 
 
-def maybe_update_basic_thumbnail(
-    basic: PBBasic, thumbnail: Optional[CloudFile]
-) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile]) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
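
Review note: `maybe_update_basic_icon` now refuses to store a mimetype that does not validate, so malformed values no longer land in `basic.icon`. A sketch of the guard order, where `looks_like_mimetype` is a naive stand-in for nucliadb's `content_types.valid` and the constant value is assumed:

```python
GENERIC_MIME_TYPE = "application/generic"  # assumed value of the constant

def looks_like_mimetype(value: str) -> bool:
    # Naive stand-in for content_types.valid(): require a "<type>/<subtype>" shape.
    main, sep, sub = value.partition("/")
    return bool(main) and sep == "/" and bool(sub)

def maybe_update_icon(current_icon: str, mimetype: str) -> str:
    if current_icon not in ("", "application/octet-stream", GENERIC_MIME_TYPE):
        return current_icon  # already set or detected
    if not mimetype or not looks_like_mimetype(mimetype):
        return current_icon  # invalid candidates are skipped instead of stored
    return mimetype

assert maybe_update_icon("", "application/pdf") == "application/pdf"
assert maybe_update_icon("", "not-a-mimetype") == ""
```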
@@ -1569,3 +1309,23 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
+
+
+def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
+    """
+    Update the entities dict with the entities from the field metadata.
+    Method created to ease the transition from legacy ner field to new entities field.
+    """
+    # Data Augmentation + Processor entities
+    # This will overwrite entities detected from more than one data augmentation task
+    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
+    entity_map = {
+        entity.text: entity.label
+        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
+        for entity in entities_wrapper.entities
+    }
+    target_entites_dict.update(entity_map)
+
+    # Legacy processor entities
+    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    target_entites_dict.update(field_metadata.ner)
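
Review note: the former `*.ner` call sites above now all funnel through `_update_entities_dict`. Because the legacy `ner` map is applied after the data-augmentation entities, it wins on conflicting entity text. A sketch of the merge over plain dicts, with hypothetical sample data:

```python
from typing import MutableMapping

def update_entities(
    target: MutableMapping[str, str],
    da_entities: dict[str, list[tuple[str, str]]],  # task id -> [(text, label), ...]
    legacy_ner: dict[str, str],
) -> None:
    # Data-augmentation entities first; later tasks overwrite earlier ones for the same text.
    for task_entities in da_entities.values():
        for text, label in task_entities:
            target[text] = label
    # The legacy processor map is applied last, so it takes precedence on conflicts.
    target.update(legacy_ner)

merged: dict[str, str] = {}
update_entities(merged, {"task-1": [("Paris", "CITY")]}, {"Paris": "LOCATION"})
assert merged == {"Paris": "LOCATION"}
```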