nucliadb-4.0.0.post542-py3-none-any.whl → nucliadb-6.2.1.post2798-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those package versions exactly as they appear in the registry.
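As context for reading what follows: wheels are ordinary zip archives, so a per-file diff like the one shown below can be reproduced locally with nothing but the Python standard library. The sketch below is illustrative and not part of nucliadb; it assumes both wheel files have already been downloaded next to the script (for example with `pip download nucliadb==4.0.0.post542 --no-deps`), and it picks one of the changed files from the list that follows.

import difflib
import zipfile

OLD_WHL = "nucliadb-4.0.0.post542-py3-none-any.whl"
NEW_WHL = "nucliadb-6.2.1.post2798-py3-none-any.whl"

def read_member(wheel_path: str, member: str) -> list[str]:
    # A wheel is a plain zip archive, so any member can be read directly.
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

# One of the 418 changed files listed below.
member = "nucliadb/ingest/orm/resource.py"
diff = difflib.unified_diff(
    read_member(OLD_WHL, member),
    read_member(NEW_WHL, member),
    fromfile=f"{OLD_WHL}/{member}",
    tofile=f"{NEW_WHL}/{member}",
)
print("".join(diff), end="")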
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -23,19 +23,33 @@ import asyncio
23
23
  import logging
24
24
  from concurrent.futures import ThreadPoolExecutor
25
25
  from functools import partial
26
- from typing import TYPE_CHECKING, Any, AsyncIterator, Optional, Type
26
+ from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
27
27
 
28
+ from nucliadb.common import datamanagers
29
+ from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
30
+ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
31
+ from nucliadb.common.maindb.driver import Transaction
32
+ from nucliadb.ingest.fields.base import Field
33
+ from nucliadb.ingest.fields.conversation import Conversation
34
+ from nucliadb.ingest.fields.file import File
35
+ from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
36
+ from nucliadb.ingest.fields.link import Link
37
+ from nucliadb.ingest.fields.text import Text
38
+ from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
39
+ from nucliadb.ingest.orm.metrics import processor_observer
40
+ from nucliadb_models import content_types
41
+ from nucliadb_models.common import CloudLink
42
+ from nucliadb_models.content_types import GENERIC_MIME_TYPE
43
+ from nucliadb_protos import utils_pb2, writer_pb2
28
44
  from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
29
- from nucliadb_protos.resources_pb2 import Basic
30
- from nucliadb_protos.resources_pb2 import Basic as PBBasic
31
- from nucliadb_protos.resources_pb2 import CloudFile
32
- from nucliadb_protos.resources_pb2 import Conversation as PBConversation
33
- from nucliadb_protos.resources_pb2 import Extra as PBExtra
34
45
  from nucliadb_protos.resources_pb2 import (
46
+ Basic,
47
+ CloudFile,
35
48
  ExtractedTextWrapper,
36
49
  ExtractedVectorsWrapper,
37
50
  FieldClassifications,
38
51
  FieldComputedMetadataWrapper,
52
+ FieldFile,
39
53
  FieldID,
40
54
  FieldMetadata,
41
55
  FieldQuestionAnswerWrapper,
@@ -44,40 +58,27 @@ from nucliadb_protos.resources_pb2 import (
44
58
  FileExtractedData,
45
59
  LargeComputedMetadataWrapper,
46
60
  LinkExtractedData,
61
+ Metadata,
62
+ Paragraph,
63
+ ParagraphAnnotation,
47
64
  )
48
- from nucliadb_protos.resources_pb2 import Metadata
65
+ from nucliadb_protos.resources_pb2 import Basic as PBBasic
66
+ from nucliadb_protos.resources_pb2 import Conversation as PBConversation
67
+ from nucliadb_protos.resources_pb2 import Extra as PBExtra
49
68
  from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
50
69
  from nucliadb_protos.resources_pb2 import Origin as PBOrigin
51
- from nucliadb_protos.resources_pb2 import Paragraph, ParagraphAnnotation
52
70
  from nucliadb_protos.resources_pb2 import Relations as PBRelations
53
- from nucliadb_protos.train_pb2 import EnabledMetadata
54
- from nucliadb_protos.train_pb2 import Position as TrainPosition
55
71
  from nucliadb_protos.train_pb2 import (
72
+ EnabledMetadata,
56
73
  TrainField,
57
74
  TrainMetadata,
58
75
  TrainParagraph,
59
76
  TrainResource,
60
77
  TrainSentence,
61
78
  )
79
+ from nucliadb_protos.train_pb2 import Position as TrainPosition
62
80
  from nucliadb_protos.utils_pb2 import Relation as PBRelation
63
81
  from nucliadb_protos.writer_pb2 import BrokerMessage
64
-
65
- from nucliadb.common import datamanagers
66
- from nucliadb.common.maindb.driver import Transaction
67
- from nucliadb.ingest.fields.base import Field
68
- from nucliadb.ingest.fields.conversation import Conversation
69
- from nucliadb.ingest.fields.date import Datetime
70
- from nucliadb.ingest.fields.file import File
71
- from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
72
- from nucliadb.ingest.fields.keywordset import Keywordset
73
- from nucliadb.ingest.fields.layout import Layout
74
- from nucliadb.ingest.fields.link import Link
75
- from nucliadb.ingest.fields.text import Text
76
- from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
77
- from nucliadb.ingest.orm.metrics import processor_observer
78
- from nucliadb_models.common import CloudLink
79
- from nucliadb_models.writer import GENERIC_MIME_TYPE
80
- from nucliadb_protos import utils_pb2, writer_pb2
81
82
  from nucliadb_utils.storages.storage import Storage
82
83
 
83
84
  if TYPE_CHECKING: # pragma: no cover
@@ -85,33 +86,14 @@ if TYPE_CHECKING: # pragma: no cover
85
86
 
86
87
  logger = logging.getLogger(__name__)
87
88
 
88
- KB_RESOURCE_FIELDS = "/kbs/{kbid}/r/{uuid}/f/"
89
- KB_RESOURCE_SLUG_BASE = "/kbs/{kbid}/s/"
90
- KB_RESOURCE_SLUG = f"{KB_RESOURCE_SLUG_BASE}{{slug}}"
91
89
  KB_FIELDS: dict[int, Type] = {
92
- FieldType.LAYOUT: Layout,
93
90
  FieldType.TEXT: Text,
94
91
  FieldType.FILE: File,
95
92
  FieldType.LINK: Link,
96
- FieldType.DATETIME: Datetime,
97
- FieldType.KEYWORDSET: Keywordset,
98
93
  FieldType.GENERIC: Generic,
99
94
  FieldType.CONVERSATION: Conversation,
100
95
  }
101
96
 
102
- KB_REVERSE: dict[str, FieldType.ValueType] = {
103
- "l": FieldType.LAYOUT,
104
- "t": FieldType.TEXT,
105
- "f": FieldType.FILE,
106
- "u": FieldType.LINK,
107
- "d": FieldType.DATETIME,
108
- "k": FieldType.KEYWORDSET,
109
- "a": FieldType.GENERIC,
110
- "c": FieldType.CONVERSATION,
111
- }
112
-
113
- FIELD_TYPE_TO_ID = {v: k for k, v in KB_REVERSE.items()}
114
-
115
97
  _executor = ThreadPoolExecutor(10)
116
98
 
117
99
 
@@ -122,6 +104,8 @@ PB_TEXT_FORMAT_TO_MIMETYPE = {
122
104
  FieldText.Format.MARKDOWN: "text/markdown",
123
105
  FieldText.Format.JSON: "application/json",
124
106
  FieldText.Format.KEEP_MARKDOWN: "text/markdown",
107
+ FieldText.Format.JSONL: "application/x-ndjson",
108
+ FieldText.Format.PLAIN_BLANKLINE_SPLIT: "text/plain+blankline",
125
109
  }
126
110
 
127
111
  BASIC_IMMUTABLE_FIELDS = ("icon",)
@@ -173,9 +157,7 @@ class Resource:
173
157
  # Basic
174
158
  async def get_basic(self) -> Optional[PBBasic]:
175
159
  if self.basic is None:
176
- basic = await datamanagers.resources.get_basic(
177
- self.txn, kbid=self.kb.kbid, rid=self.uuid
178
- )
160
+ basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
179
161
  self.basic = basic if basic is not None else PBBasic()
180
162
  return self.basic
181
163
 
@@ -221,9 +203,7 @@ class Resource:
221
203
  fields.append(field_id)
222
204
  positions[field_id] = i
223
205
 
224
- updated = [
225
- self.basic.fieldmetadata[positions[field]] for field in fields
226
- ]
206
+ updated = [self.basic.fieldmetadata[positions[field]] for field in fields]
227
207
 
228
208
  del self.basic.fieldmetadata[:]
229
209
  self.basic.fieldmetadata.extend(updated)
@@ -244,11 +224,10 @@ class Resource:
244
224
  self.indexer.apply_field_metadata(
245
225
  field_id,
246
226
  field_metadata,
247
- replace_field=[],
248
- replace_splits={},
249
227
  page_positions=page_positions,
250
228
  extracted_text=await field_obj.get_extracted_text(),
251
229
  basic_user_field_metadata=user_field_metadata,
230
+ replace_field=True,
252
231
  )
253
232
 
254
233
  # Some basic fields are computed off field metadata.
@@ -264,9 +243,7 @@ class Resource:
264
243
  # Origin
265
244
  async def get_origin(self) -> Optional[PBOrigin]:
266
245
  if self.origin is None:
267
- origin = await datamanagers.resources.get_origin(
268
- self.txn, kbid=self.kb.kbid, rid=self.uuid
269
- )
246
+ origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
270
247
  self.origin = origin
271
248
  return self.origin
272
249
 
@@ -280,16 +257,12 @@ class Resource:
280
257
  # Extra
281
258
  async def get_extra(self) -> Optional[PBExtra]:
282
259
  if self.extra is None:
283
- extra = await datamanagers.resources.get_extra(
284
- self.txn, kbid=self.kb.kbid, rid=self.uuid
285
- )
260
+ extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
286
261
  self.extra = extra
287
262
  return self.extra
288
263
 
289
264
  async def set_extra(self, payload: PBExtra):
290
- await datamanagers.resources.set_extra(
291
- self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload
292
- )
265
+ await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
293
266
  self.modified = True
294
267
  self.extra = payload
295
268
 
@@ -329,7 +302,7 @@ class Resource:
329
302
  self.relations = relations
330
303
 
331
304
  @processor_observer.wrap({"type": "generate_index_message"})
332
- async def generate_index_message(self) -> ResourceBrain:
305
+ async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
333
306
  brain = ResourceBrain(rid=self.uuid)
334
307
  origin = await self.get_origin()
335
308
  basic = await self.get_basic()
@@ -339,7 +312,7 @@ class Resource:
339
312
  await self.compute_global_tags(brain)
340
313
  fields = await self.get_fields(force=True)
341
314
  for (type_id, field_id), field in fields.items():
342
- fieldid = FieldID(field_type=type_id, field=field_id) # type: ignore
315
+ fieldid = FieldID(field_type=type_id, field=field_id)
343
316
  await self.compute_global_text_field(fieldid, brain)
344
317
 
345
318
  field_metadata = await field.get_field_metadata()
@@ -355,234 +328,66 @@ class Resource:
355
328
  (
356
329
  fm
357
330
  for fm in basic.fieldmetadata
358
- if fm.field.field == field_id
359
- and fm.field.field_type == type_id
331
+ if fm.field.field == field_id and fm.field.field_type == type_id
360
332
  ),
361
333
  None,
362
334
  )
363
335
  brain.apply_field_metadata(
364
336
  field_key,
365
337
  field_metadata,
366
- replace_field=[],
367
- replace_splits={},
368
338
  page_positions=page_positions,
369
339
  extracted_text=await field.get_extracted_text(),
370
340
  basic_user_field_metadata=user_field_metadata,
341
+ replace_field=reindex,
371
342
  )
372
343
 
373
344
  if self.disable_vectors is False:
345
+ # XXX: while we don't remove the "default" vectorset concept, we
346
+ # need to do use None as the default one
374
347
  vo = await field.get_vectors()
375
348
  if vo is not None:
376
- dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
377
- self.txn, kbid=self.kb.kbid
378
- )
349
+ async with datamanagers.with_ro_transaction() as ro_txn:
350
+ dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
351
+ ro_txn, kbid=self.kb.kbid
352
+ )
379
353
  brain.apply_field_vectors(
380
354
  field_key,
381
355
  vo,
382
356
  matryoshka_vector_dimension=dimension,
357
+ replace_field=reindex,
383
358
  )
384
- return brain
385
-
386
- async def generate_field_vectors(
387
- self,
388
- bm: BrokerMessage,
389
- type_id: FieldType.ValueType,
390
- field_id: str,
391
- field: Field,
392
- ):
393
- vo = await field.get_vectors()
394
- if vo is None:
395
- return
396
- evw = ExtractedVectorsWrapper()
397
- evw.field.field = field_id
398
- evw.field.field_type = type_id # type: ignore
399
- evw.vectors.CopyFrom(vo)
400
- bm.field_vectors.append(evw)
401
-
402
- async def generate_field_large_computed_metadata(
403
- self,
404
- bm: BrokerMessage,
405
- type_id: FieldType.ValueType,
406
- field_id: str,
407
- field: Field,
408
- ):
409
- lcm = await field.get_large_field_metadata()
410
- if lcm is None:
411
- return
412
- lcmw = LargeComputedMetadataWrapper()
413
- lcmw.field.field = field_id
414
- lcmw.field.field_type = type_id # type: ignore
415
- lcmw.real.CopyFrom(lcm)
416
- bm.field_large_metadata.append(lcmw)
417
-
418
- async def generate_field_computed_metadata(
419
- self,
420
- bm: BrokerMessage,
421
- type_id: FieldType.ValueType,
422
- field_id: str,
423
- field: Field,
424
- ):
425
- fcmw = FieldComputedMetadataWrapper()
426
- fcmw.field.field = field_id
427
- fcmw.field.field_type = type_id # type: ignore
428
-
429
- field_metadata = await field.get_field_metadata()
430
- if field_metadata is not None:
431
- fcmw.metadata.CopyFrom(field_metadata)
432
- fcmw.field.field = field_id
433
- fcmw.field.field_type = type_id # type: ignore
434
- bm.field_metadata.append(fcmw)
435
- # Make sure cloud files are removed for exporting
436
-
437
- async def generate_extracted_text(
438
- self,
439
- bm: BrokerMessage,
440
- type_id: FieldType.ValueType,
441
- field_id: str,
442
- field: Field,
443
- ):
444
- etw = ExtractedTextWrapper()
445
- etw.field.field = field_id
446
- etw.field.field_type = type_id # type: ignore
447
- extracted_text = await field.get_extracted_text()
448
- if extracted_text is not None:
449
- etw.body.CopyFrom(extracted_text)
450
- bm.extracted_text.append(etw)
451
-
452
- async def generate_field(
453
- self,
454
- bm: BrokerMessage,
455
- type_id: FieldType.ValueType,
456
- field_id: str,
457
- field: Field,
458
- ):
459
- # Used for exporting a field
460
- if type_id == FieldType.TEXT:
461
- value = await field.get_value()
462
- bm.texts[field_id].CopyFrom(value)
463
- elif type_id == FieldType.LINK:
464
- value = await field.get_value()
465
- bm.links[field_id].CopyFrom(value)
466
- elif type_id == FieldType.FILE:
467
- value = await field.get_value()
468
- bm.files[field_id].CopyFrom(value)
469
- elif type_id == FieldType.CONVERSATION:
470
- value = await self.get_full_conversation(field) # type: ignore
471
- bm.conversations[field_id].CopyFrom(value)
472
- elif type_id == FieldType.KEYWORDSET:
473
- value = await field.get_value()
474
- bm.keywordsets[field_id].CopyFrom(value)
475
- elif type_id == FieldType.DATETIME:
476
- value = await field.get_value()
477
- bm.datetimes[field_id].CopyFrom(value)
478
- elif type_id == FieldType.LAYOUT:
479
- value = await field.get_value()
480
- bm.layouts[field_id].CopyFrom(value)
481
-
482
- async def get_full_conversation(
483
- self,
484
- conversation_field: Conversation,
485
- ) -> Optional[PBConversation]:
486
- """
487
- Messages of a conversations may be stored across several pages.
488
- This method fetches them all and returns a single complete conversation.
489
- """
490
- full_conv = PBConversation()
491
- n_page = 1
492
- while True:
493
- page = await conversation_field.get_value(page=n_page)
494
- if page is None:
495
- break
496
- full_conv.messages.extend(page.messages)
497
- n_page += 1
498
- return full_conv
499
-
500
- async def generate_broker_message(self) -> BrokerMessage:
501
- # full means downloading all the pointers
502
- # minuts the ones to external files that are not PB
503
- # Go for all fields and recreate brain
504
- bm = BrokerMessage()
505
- bm.kbid = self.kb.kbid
506
- bm.uuid = self.uuid
507
- basic = await self.get_basic()
508
- if basic is not None:
509
- bm.basic.CopyFrom(basic)
510
- bm.slug = bm.basic.slug
511
- origin = await self.get_origin()
512
- if origin is not None:
513
- bm.origin.CopyFrom(origin)
514
- relations = await self.get_relations()
515
- if relations is not None:
516
- for relation in relations.relations:
517
- bm.relations.append(relation)
518
-
519
- fields = await self.get_fields(force=True)
520
- for (type_id, field_id), field in fields.items():
521
- # Value
522
- await self.generate_field(bm, type_id, field_id, field)
523
-
524
- # Extracted text
525
- await self.generate_extracted_text(bm, type_id, field_id, field)
526
-
527
- # Field Computed Metadata
528
- await self.generate_field_computed_metadata(bm, type_id, field_id, field)
529
-
530
- if type_id == FieldType.FILE and isinstance(field, File):
531
- field_extracted_data = await field.get_file_extracted_data()
532
- if field_extracted_data is not None:
533
- bm.file_extracted_data.append(field_extracted_data)
534
-
535
- elif type_id == FieldType.LINK and isinstance(field, Link):
536
- link_extracted_data = await field.get_link_extracted_data()
537
- if link_extracted_data is not None:
538
- bm.link_extracted_data.append(link_extracted_data)
539
359
 
540
- # Field vectors
541
- await self.generate_field_vectors(bm, type_id, field_id, field)
542
-
543
- # Large metadata
544
- await self.generate_field_large_computed_metadata(
545
- bm, type_id, field_id, field
546
- )
547
-
548
- return bm
360
+ vectorset_configs = []
361
+ async with datamanagers.with_ro_transaction() as ro_txn:
362
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
363
+ ro_txn, kbid=self.kb.kbid
364
+ ):
365
+ vectorset_configs.append(vectorset_config)
366
+ for vectorset_config in vectorset_configs:
367
+ vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
368
+ if vo is not None:
369
+ dimension = vectorset_config.vectorset_index_config.vector_dimension
370
+ brain.apply_field_vectors(
371
+ field_key,
372
+ vo,
373
+ vectorset=vectorset_config.vectorset_id,
374
+ matryoshka_vector_dimension=dimension,
375
+ replace_field=reindex,
376
+ )
377
+ return brain
549
378
 
550
379
  # Fields
551
- async def get_fields(
552
- self, force: bool = False
553
- ) -> dict[tuple[FieldType.ValueType, str], Field]:
380
+ async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
554
381
  # Get all fields
555
382
  for type, field in await self.get_fields_ids(force=force):
556
383
  if (type, field) not in self.fields:
557
384
  self.fields[(type, field)] = await self.get_field(field, type)
558
385
  return self.fields
559
386
 
560
- async def _deprecated_scan_fields_ids(
561
- self,
562
- ) -> AsyncIterator[tuple[FieldType.ValueType, str]]:
563
- logger.warning("Scanning fields ids. This is not optimal.")
564
- prefix = KB_RESOURCE_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
565
- allfields = set()
566
- async for key in self.txn.keys(prefix, count=-1):
567
- # The [6:8] `slicing purpose is to match exactly the two
568
- # splitted parts corresponding to type and field, and nothing else!
569
- type, field = key.split("/")[6:8]
570
- type_id = KB_REVERSE.get(type)
571
- if type_id is None:
572
- raise AttributeError("Invalid field type")
573
- result = (type_id, field)
574
- if result not in allfields:
575
- # fields can have errors that are stored in a subkey:
576
- # - field key -> kbs/kbid/r/ruuid/f/myfield
577
- # - field error key -> kbs/kbid/r/ruuid/f/myfield/errors
578
- # and that would return duplicates here.
579
- yield result
580
- allfields.add(result)
581
-
582
387
  async def _inner_get_fields_ids(self) -> list[tuple[FieldType.ValueType, str]]:
583
388
  # Use a set to make sure we don't have duplicate field ids
584
389
  result = set()
585
- all_fields = await self.get_all_field_ids()
390
+ all_fields = await self.get_all_field_ids(for_update=False)
586
391
  if all_fields is not None:
587
392
  for f in all_fields.fields:
588
393
  result.add((f.field_type, f.field))
@@ -599,9 +404,7 @@ class Resource:
599
404
  result.add((FieldType.GENERIC, generic))
600
405
  return list(result)
601
406
 
602
- async def get_fields_ids(
603
- self, force: bool = False
604
- ) -> list[tuple[FieldType.ValueType, str]]:
407
+ async def get_fields_ids(self, force: bool = False) -> list[tuple[FieldType.ValueType, str]]:
605
408
  """
606
409
  Get all ids of the fields of the resource and cache them.
607
410
  """
@@ -645,23 +448,20 @@ class Resource:
645
448
  if field in self.all_fields_keys:
646
449
  self.all_fields_keys.remove(field)
647
450
 
648
- field_key = self.generate_field_id(FieldID(field_type=type, field=key)) # type: ignore
649
- vo = await field_obj.get_vectors()
650
- if vo is not None:
651
- self.indexer.delete_vectors(field_key=field_key, vo=vo)
451
+ field_key = self.generate_field_id(FieldID(field_type=type, field=key))
652
452
 
653
453
  metadata = await field_obj.get_field_metadata()
654
454
  if metadata is not None:
655
- self.indexer.delete_metadata(field_key=field_key, metadata=metadata)
455
+ self.indexer.delete_field(field_key=field_key)
656
456
 
657
457
  await field_obj.delete()
658
458
 
659
459
  def has_field(self, type: FieldType.ValueType, field: str) -> bool:
660
460
  return (type, field) in self.fields
661
461
 
662
- async def get_all_field_ids(self) -> Optional[PBAllFieldIDs]:
462
+ async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
663
463
  return await datamanagers.resources.get_all_field_ids(
664
- self.txn, kbid=self.kb.kbid, rid=self.uuid
464
+ self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
665
465
  )
666
466
 
667
467
  async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
@@ -677,7 +477,7 @@ class Resource:
677
477
  errors: Optional[list[writer_pb2.Error]] = None,
678
478
  ):
679
479
  needs_update = False
680
- all_fields = await self.get_all_field_ids()
480
+ all_fields = await self.get_all_field_ids(for_update=True)
681
481
  if all_fields is None:
682
482
  needs_update = True
683
483
  all_fields = PBAllFieldIDs()
@@ -704,26 +504,11 @@ class Resource:
704
504
  @processor_observer.wrap({"type": "apply_fields"})
705
505
  async def apply_fields(self, message: BrokerMessage):
706
506
  message_updated_fields = []
707
- for field, layout in message.layouts.items():
708
- fid = FieldID(field_type=FieldType.LAYOUT, field=field)
709
- await self.set_field(fid.field_type, fid.field, layout)
710
- message_updated_fields.append(fid)
711
-
712
507
  for field, text in message.texts.items():
713
508
  fid = FieldID(field_type=FieldType.TEXT, field=field)
714
509
  await self.set_field(fid.field_type, fid.field, text)
715
510
  message_updated_fields.append(fid)
716
511
 
717
- for field, keywordset in message.keywordsets.items():
718
- fid = FieldID(field_type=FieldType.KEYWORDSET, field=field)
719
- await self.set_field(fid.field_type, fid.field, keywordset)
720
- message_updated_fields.append(fid)
721
-
722
- for field, datetimeobj in message.datetimes.items():
723
- fid = FieldID(field_type=FieldType.DATETIME, field=field)
724
- await self.set_field(fid.field_type, fid.field, datetimeobj)
725
- message_updated_fields.append(fid)
726
-
727
512
  for field, link in message.links.items():
728
513
  fid = FieldID(field_type=FieldType.LINK, field=field)
729
514
  await self.set_field(fid.field_type, fid.field, link)
@@ -742,13 +527,11 @@ class Resource:
742
527
  for fieldid in message.delete_fields:
743
528
  await self.delete_field(fieldid.field_type, fieldid.field)
744
529
 
745
- if (
746
- len(message_updated_fields)
747
- or len(message.delete_fields)
748
- or len(message.errors)
749
- ):
530
+ if len(message_updated_fields) or len(message.delete_fields) or len(message.errors):
750
531
  await self.update_all_field_ids(
751
- updated=message_updated_fields, deleted=message.delete_fields, errors=message.errors # type: ignore
532
+ updated=message_updated_fields,
533
+ deleted=message.delete_fields, # type: ignore
534
+ errors=message.errors, # type: ignore
752
535
  )
753
536
 
754
537
  @processor_observer.wrap({"type": "apply_extracted"})
@@ -784,13 +567,15 @@ class Resource:
784
567
 
785
568
  for link_extracted_data in message.link_extracted_data:
786
569
  await self._apply_link_extracted_data(link_extracted_data)
787
- await self.maybe_update_title_metadata(link_extracted_data)
570
+ await self.maybe_update_resource_title_from_link(link_extracted_data)
788
571
  extracted_languages.append(link_extracted_data.language)
789
572
 
790
573
  for file_extracted_data in message.file_extracted_data:
791
574
  await self._apply_file_extracted_data(file_extracted_data)
792
575
  extracted_languages.append(file_extracted_data.language)
793
576
 
577
+ await self.maybe_update_resource_title_from_file_extracted_data(message)
578
+
794
579
  # Metadata should go first
795
580
  for field_metadata in message.field_metadata:
796
581
  await self._apply_field_computed_metadata(field_metadata)
@@ -801,6 +586,7 @@ class Resource:
801
586
  # Upload to binary storage
802
587
  # Vector indexing
803
588
  if self.disable_vectors is False:
589
+ await self.get_fields(force=True)
804
590
  for field_vectors in message.field_vectors:
805
591
  await self._apply_extracted_vectors(field_vectors)
806
592
 
@@ -826,9 +612,7 @@ class Resource:
826
612
  extracted_text.field,
827
613
  )
828
614
 
829
- async def _apply_question_answers(
830
- self, question_answers: FieldQuestionAnswerWrapper
831
- ):
615
+ async def _apply_question_answers(self, question_answers: FieldQuestionAnswerWrapper):
832
616
  field = question_answers.field
833
617
  field_obj = await self.get_field(field.field, field.field_type, load=False)
834
618
  await field_obj.set_question_answers(question_answers)
@@ -848,19 +632,27 @@ class Resource:
848
632
 
849
633
  maybe_update_basic_summary(self.basic, link_extracted_data.description)
850
634
 
851
- async def maybe_update_title_metadata(self, link_extracted_data: LinkExtractedData):
635
+ async def maybe_update_resource_title_from_link(self, link_extracted_data: LinkExtractedData):
636
+ """
637
+ When parsing link extracted data, we want to replace the resource title for the first link
638
+ that gets processed and has a title, and only if the current title is a URL, which we take
639
+ as a hint that the title was not set by the user.
640
+ """
852
641
  assert self.basic is not None
853
642
  if not link_extracted_data.title:
854
643
  return
855
644
  if not (self.basic.title.startswith("http") or self.basic.title == ""):
856
645
  return
857
-
858
646
  title = link_extracted_data.title
859
- self.basic.title = title
647
+ await self.update_resource_title(title)
648
+
649
+ async def update_resource_title(self, computed_title: str) -> None:
650
+ assert self.basic is not None
651
+ self.basic.title = computed_title
860
652
  # Extracted text
861
653
  field = await self.get_field("title", FieldType.GENERIC, load=False)
862
654
  etw = ExtractedTextWrapper()
863
- etw.body.text = title
655
+ etw.body.text = computed_title
864
656
  await field.set_extracted_text(etw)
865
657
 
866
658
  # Field computed metadata
@@ -872,11 +664,8 @@ class Resource:
872
664
  fcm = await field.get_field_metadata(force=True)
873
665
  if fcm is not None:
874
666
  fcmw.metadata.CopyFrom(fcm)
875
-
876
667
  fcmw.metadata.metadata.ClearField("paragraphs")
877
- paragraph = Paragraph(
878
- start=0, end=len(title), kind=Paragraph.TypeParagraph.TITLE
879
- )
668
+ paragraph = Paragraph(start=0, end=len(computed_title), kind=Paragraph.TypeParagraph.TITLE)
880
669
  fcmw.metadata.metadata.paragraphs.append(paragraph)
881
670
 
882
671
  await field.set_field_metadata(fcmw)
@@ -893,9 +682,54 @@ class Resource:
893
682
  maybe_update_basic_icon(self.basic, file_extracted_data.icon)
894
683
  maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
895
684
 
896
- async def _apply_field_computed_metadata(
897
- self, field_metadata: FieldComputedMetadataWrapper
898
- ):
685
+ async def _should_update_resource_title_from_file_metadata(self) -> bool:
686
+ """
687
+ We only want to update resource title from file metadata if the title is empty,
688
+ equal to the resource uuid or equal to any of the file filenames in the resource.
689
+ """
690
+ basic = await self.get_basic()
691
+ if basic is None:
692
+ return True
693
+ current_title = basic.title
694
+ if current_title == "":
695
+ # If the title is empty, we should update it
696
+ return True
697
+ if current_title == self.uuid:
698
+ # If the title is the same as the resource uuid, we should update it
699
+ return True
700
+ fields = await self.get_fields(force=True)
701
+ filenames = set()
702
+ for (field_type, _), field_obj in fields.items():
703
+ if field_type == FieldType.FILE:
704
+ field_value: Optional[FieldFile] = await field_obj.get_value()
705
+ if field_value is not None:
706
+ if field_value.file.filename not in ("", None):
707
+ filenames.add(field_value.file.filename)
708
+ if current_title in filenames:
709
+ # If the title is equal to any of the file filenames, we should update it
710
+ return True
711
+ return False
712
+
713
+ async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
714
+ """
715
+ Update the resource title with the first file that has a title extracted.
716
+ """
717
+ if not await self._should_update_resource_title_from_file_metadata():
718
+ return
719
+ for fed in message.file_extracted_data:
720
+ if fed.title == "":
721
+ # Skip if the extracted title is empty
722
+ continue
723
+ fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
724
+ logger.info(
725
+ "Updating resource title from file extracted data",
726
+ extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
727
+ )
728
+ await self.update_resource_title(fed.title)
729
+ # Break after the first file with a title is found
730
+ break
731
+
732
+ async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
899
733
  assert self.basic is not None
900
734
  maybe_update_basic_summary(self.basic, field_metadata.metadata.metadata.summary)
901
735
 
@@ -904,17 +738,11 @@ class Resource:
904
738
  field_metadata.field.field_type,
905
739
  load=False,
906
740
  )
907
- (
908
- metadata,
909
- replace_field,
910
- replace_splits,
911
- ) = await field_obj.set_field_metadata(field_metadata)
741
+ metadata = await field_obj.set_field_metadata(field_metadata)
912
742
  field_key = self.generate_field_id(field_metadata.field)
913
743
 
914
744
  page_positions: Optional[FilePagePositions] = None
915
- if field_metadata.field.field_type == FieldType.FILE and isinstance(
916
- field_obj, File
917
- ):
745
+ if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
918
746
  page_positions = await get_file_page_positions(field_obj)
919
747
 
920
748
  user_field_metadata = next(
@@ -932,29 +760,24 @@ class Resource:
932
760
  self.indexer.apply_field_metadata,
933
761
  field_key,
934
762
  metadata,
935
- replace_field=replace_field,
936
- replace_splits=replace_splits,
937
763
  page_positions=page_positions,
938
764
  extracted_text=extracted_text,
939
765
  basic_user_field_metadata=user_field_metadata,
766
+ replace_field=True,
940
767
  )
941
768
  loop = asyncio.get_running_loop()
942
769
  await loop.run_in_executor(_executor, apply_field_metadata)
943
770
 
944
- maybe_update_basic_thumbnail(
945
- self.basic, field_metadata.metadata.metadata.thumbnail
946
- )
771
+ maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
947
772
 
948
773
  add_field_classifications(self.basic, field_metadata)
949
774
 
950
775
  async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
951
- if not self.has_field(
952
- field_vectors.field.field_type, field_vectors.field.field
953
- ):
776
+ # Store vectors in the resource
777
+
778
+ if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
954
779
  # skipping because field does not exist
955
- logger.warning(
956
- f'Field "{field_vectors.field.field}" does not exist, skipping vectors'
957
- )
780
+ logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
958
781
  return
959
782
 
960
783
  field_obj = await self.get_field(
@@ -962,22 +785,36 @@ class Resource:
962
785
  field_vectors.field.field_type,
963
786
  load=False,
964
787
  )
965
- (
966
- vo,
967
- replace_field_sentences,
968
- replace_splits_sentences,
969
- ) = await field_obj.set_vectors(field_vectors)
788
+ vo = await field_obj.set_vectors(field_vectors)
789
+
790
+ # Prepare vectors to be indexed
791
+
970
792
  field_key = self.generate_field_id(field_vectors.field)
971
793
  if vo is not None:
972
- dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
973
- self.txn, kbid=self.kb.kbid
974
- )
794
+ vectorset_id = field_vectors.vectorset_id or None
795
+ if vectorset_id is None:
796
+ dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
797
+ self.txn, kbid=self.kb.kbid
798
+ )
799
+ else:
800
+ config = await datamanagers.vectorsets.get(
801
+ self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
802
+ )
803
+ if config is None:
804
+ logger.warning(
805
+ f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
806
+ )
807
+ return
808
+ dimension = config.vectorset_index_config.vector_dimension
809
+ if not dimension:
810
+ raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
811
+
975
812
  apply_field_vectors_partial = partial(
976
813
  self.indexer.apply_field_vectors,
977
814
  field_key,
978
815
  vo,
979
- replace_field=replace_field_sentences,
980
- replace_splits=replace_splits_sentences,
816
+ vectorset=vectorset_id,
817
+ replace_field=True,
981
818
  matryoshka_vector_dimension=dimension,
982
819
  )
983
820
  loop = asyncio.get_running_loop()
@@ -985,9 +822,7 @@ class Resource:
985
822
  else:
986
823
  raise AttributeError("VO not found on set")
987
824
 
988
- async def _apply_field_large_metadata(
989
- self, field_large_metadata: LargeComputedMetadataWrapper
990
- ):
825
+ async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
991
826
  field_obj = await self.get_field(
992
827
  field_large_metadata.field.field,
993
828
  field_large_metadata.field.field_type,
@@ -996,7 +831,7 @@ class Resource:
996
831
  await field_obj.set_large_field_metadata(field_large_metadata)
997
832
 
998
833
  def generate_field_id(self, field: FieldID) -> str:
999
- return f"{FIELD_TYPE_TO_ID[field.field_type]}/{field.field}"
834
+ return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
1000
835
 
1001
836
  async def compute_security(self, brain: ResourceBrain):
1002
837
  security = await self.get_security()
@@ -1015,7 +850,7 @@ class Resource:
         brain.set_resource_metadata(basic=basic, origin=origin)
         for type, field in await self.get_fields_ids(force=True):
             fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)  # type: ignore
+            fieldid = FieldID(field_type=type, field=field)
             fieldkey = self.generate_field_id(fieldid)
             extracted_metadata = await fieldobj.get_field_metadata()
             valid_user_field_metadata = None
@@ -1026,16 +861,16 @@ class Resource:
                 ):
                     valid_user_field_metadata = user_field_metadata
                     break
+
+            generated_by = await fieldobj.generated_by()
             brain.apply_field_labels(
                 fieldkey,
                 extracted_metadata,
                 self.uuid,
+                generated_by,
                 basic.usermetadata,
                 valid_user_field_metadata,
             )
-            if type == FieldType.KEYWORDSET:
-                field_data = await fieldobj.db_get_value()
-                brain.process_keywordset_fields(fieldkey, field_data)
 
     @processor_observer.wrap({"type": "compute_global_text"})
     async def compute_global_text(self):
@@ -1072,12 +907,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1092,9 +925,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1105,7 +936,7 @@ class Resource:
 
                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities.update(field_metadata.ner)
+                    _update_entities_dict(entities, field_metadata)
 
                 precomputed_vectors = {}
                 if vo is not None:
@@ -1116,9 +947,7 @@ class Resource:
                     vectors = vo.vectors
                     base_vector_key = f"{self.uuid}/{field_key}"
                     for index, vector in enumerate(vectors.vectors):
-                        vector_key = (
-                            f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                        )
+                        vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                         precomputed_vectors[vector_key] = vector.vector
 
                 if extracted_text is not None:
@@ -1129,11 +958,11 @@ class Resource:
 
                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
 
                     if enabled_metadata.labels:
                         metadata.labels.ClearField("field")
@@ -1147,7 +976,9 @@ class Resource:
                         if subfield is not None:
                             sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                         else:
-                            sentence_key = f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                            sentence_key = (
+                                f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                            )
 
                         if vo is not None:
                             metadata.ClearField("vector")
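The paragraph, sentence and vector keys built in these hunks all extend one addressing scheme rooted at `<resource-uuid>/<field-key>`. A small illustrative sketch of the key shapes (helper names are ours, not nucliadb's):

```python
from typing import Optional


def paragraph_key(rid: str, field_key: str, start: int, end: int, subfield: Optional[str] = None) -> str:
    # <rid>/<field-key>[/<subfield>]/<start>-<end>
    middle = f"/{subfield}" if subfield is not None else ""
    return f"{rid}/{field_key}{middle}/{start}-{end}"


def sentence_key(rid: str, field_key: str, index: int, start: int, end: int, subfield: Optional[str] = None) -> str:
    # Sentence keys also carry the enumeration index before the offsets; the
    # vector keys above follow the same <rid>/<field-key>/<index>/<start>-<end> shape.
    middle = f"/{subfield}" if subfield is not None else ""
    return f"{rid}/{field_key}{middle}/{index}/{start}-{end}"


assert paragraph_key("rid1", "t/summary", 0, 20) == "rid1/t/summary/0-20"
assert sentence_key("rid1", "t/summary", 3, 0, 20, subfield="chunk") == "rid1/t/summary/chunk/3/0-20"
```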
@@ -1186,12 +1017,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1202,9 +1031,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1215,7 +1042,7 @@ class Resource:
 
                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities.update(field_metadata.ner)
+                    _update_entities_dict(entities, field_metadata)
 
                 if extracted_text is not None:
                     if subfield is not None:
@@ -1225,11 +1052,11 @@ class Resource:
 
                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
 
                     if enabled_metadata.labels:
                         metadata.labels.ClearField("paragraph")
@@ -1257,9 +1084,7 @@ class Resource:
 
                     yield pb_paragraph
 
-    async def iterate_fields(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainField]:
+    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1269,7 +1094,7 @@ class Resource:
             metadata.labels.resource.extend(self.basic.usermetadata.classifications)
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             fm = await field.get_field_metadata()
             extracted_text = None
 
@@ -1279,9 +1104,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1298,7 +1121,7 @@ class Resource:
 
                 if enabled_metadata.entities:
                     metadata.ClearField("entities")
-                    metadata.entities.update(splitted_metadata.ner)
+                    _update_entities_dict(metadata.entities, splitted_metadata)
 
                 pb_field = TrainField()
                 pb_field.uuid = self.uuid
@@ -1306,9 +1129,7 @@ class Resource:
                 pb_field.metadata.CopyFrom(metadata)
                 yield pb_field
 
-    async def generate_train_resource(
-        self, enabled_metadata: EnabledMetadata
-    ) -> TrainResource:
+    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1335,9 +1156,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1346,7 +1165,7 @@ class Resource:
                 metadata.labels.field.extend(splitted_metadata.classifications)
 
             if enabled_metadata.entities:
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)
 
         pb_resource = TrainResource()
         pb_resource.uuid = self.uuid
@@ -1375,33 +1194,35 @@ def remove_field_classifications(basic: PBBasic, deleted_fields: list[FieldID]):
     Clean classifications of fields that have been deleted
     """
     field_classifications = [
-        fc
-        for fc in basic.computedmetadata.field_classifications
-        if fc.field not in deleted_fields
+        fc for fc in basic.computedmetadata.field_classifications if fc.field not in deleted_fields
     ]
     basic.computedmetadata.ClearField("field_classifications")
     basic.computedmetadata.field_classifications.extend(field_classifications)
 
 
-def add_field_classifications(
-    basic: PBBasic, fcmw: FieldComputedMetadataWrapper
-) -> bool:
+def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper) -> bool:
     """
     Returns whether some new field classifications were added
     """
-    if len(fcmw.metadata.metadata.classifications) == 0:
+    if len(fcmw.metadata.metadata.classifications) == 0 and all(
+        len(split.classifications) == 0 for split in fcmw.metadata.split_metadata.values()
+    ):
         return False
+
     remove_field_classifications(basic, [fcmw.field])
     fcfs = FieldClassifications()
     fcfs.field.CopyFrom(fcmw.field)
     fcfs.classifications.extend(fcmw.metadata.metadata.classifications)
+
+    for split_id, split in fcmw.metadata.split_metadata.items():
+        if split_id not in fcmw.metadata.deleted_splits:
+            fcfs.classifications.extend(split.classifications)
+
     basic.computedmetadata.field_classifications.append(fcfs)
     return True
 
 
-def add_entities_to_metadata(
-    entities: dict[str, str], local_text: str, metadata: TrainMetadata
-) -> None:
+def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
     for entity_key, entity_value in entities.items():
         if entity_key not in local_text:
             # Add the entity only if found in text
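`add_field_classifications` above now treats split metadata as a first-class source: the early return only fires when the main metadata and every split carry zero classifications, and the classifications of non-deleted splits are merged into the same `FieldClassifications` entry. A plain-dict rendition of that aggregation (names are illustrative, not nucliadb's API):

```python
def collect_field_classifications(
    main: list[str],
    splits: dict[str, list[str]],
    deleted_splits: set[str],
) -> list[str]:
    # Mirrors the merge above: main classifications first, then the
    # classifications of every split that has not been deleted.
    collected = list(main)
    for split_id, classifications in splits.items():
        if split_id not in deleted_splits:
            collected.extend(classifications)
    return collected


assert collect_field_classifications(
    [], {"s1": ["topic/sports"], "s2": ["topic/news"]}, deleted_splits={"s2"}
) == ["topic/sports"]
```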
@@ -1415,9 +1236,7 @@ def add_entities_to_metadata(
         for _ in range(local_text.count(entity_key)):
             start = local_text.index(entity_key, last_occurrence_end)
             end = start + len(entity_key)
-            metadata.entity_positions[poskey].positions.append(
-                TrainPosition(start=start, end=end)
-            )
+            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
             last_occurrence_end = end
 
 
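The reflowed loop above locates every occurrence of an entity by pairing `str.count` with `str.index` and a moving cursor, so a repeated entity gets one `TrainPosition` per match. The same scan, self-contained:

```python
def find_occurrences(text: str, needle: str) -> list[tuple[int, int]]:
    positions: list[tuple[int, int]] = []
    cursor = 0
    # str.count gives the number of non-overlapping matches; str.index with a
    # start offset walks the cursor past each one.
    for _ in range(text.count(needle)):
        start = text.index(needle, cursor)
        end = start + len(needle)
        positions.append((start, end))
        cursor = end
    return positions


assert find_occurrences("to be or not to be", "to") == [(0, 2), (13, 15)]
```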
@@ -1432,15 +1251,22 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True
 
 
-def maybe_update_basic_thumbnail(
-    basic: PBBasic, thumbnail: Optional[CloudFile]
-) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile]) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
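`maybe_update_basic_icon` now also rejects mimetypes that fail `content_types.valid` before adopting them, logging the offending value with the resource id and slug. Distilled, both `maybe_update_*` helpers follow the same guarded-update pattern; a simplified sketch with an illustrative validity predicate (the real icon check additionally treats generic mimetypes as unset):

```python
from typing import Callable, Optional


def maybe_set(current: str, candidate: Optional[str], is_valid: Callable[[str], bool]) -> str:
    # Keep an existing value; reject missing or invalid candidates;
    # otherwise adopt the candidate.
    if current:
        return current
    if not candidate or not is_valid(candidate):
        return current
    return candidate


assert maybe_set("", "application/pdf", lambda m: "/" in m) == "application/pdf"
assert maybe_set("", "not-a-mimetype", lambda m: "/" in m) == ""
assert maybe_set("text/html", "application/pdf", lambda m: True) == "text/html"
```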
@@ -1482,3 +1308,23 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
+
+
+def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
+    """
+    Update the entities dict with the entities from the field metadata.
+    Method created to ease the transition from legacy ner field to new entities field.
+    """
+    # Data Augmentation + Processor entities
+    # This will overwrite entities detected from more than one data augmentation task
+    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
+    entity_map = {
+        entity.text: entity.label
+        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
+        for entity in entities_wrapper.entities
+    }
+    target_entites_dict.update(entity_map)
+
+    # Legacy processor entities
+    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    target_entites_dict.update(field_metadata.ner)
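The new `_update_entities_dict` helper merges two generations of entity metadata into one `{text: label}` map: data-augmentation/processor entities first, then the legacy `ner` map, which therefore wins on conflicting keys. A small self-contained rendition with plain dicts standing in for the protobuf messages (the `Entity` dataclass and function signature here are stand-ins):

```python
from dataclasses import dataclass


@dataclass
class Entity:
    text: str
    label: str


def update_entities_dict(
    target: dict[str, str],
    da_entities: dict[str, list[Entity]],  # keyed by data-augmentation task id
    legacy_ner: dict[str, str],
) -> None:
    # New-style entities first; a later task silently overwrites an earlier
    # one for the same text, as in the original helper.
    target.update({e.text: e.label for entities in da_entities.values() for e in entities})
    # Legacy processor entities last, so they win on conflicting keys.
    target.update(legacy_ner)


merged: dict[str, str] = {}
update_entities_dict(merged, {"task-1": [Entity("Paris", "CITY")]}, {"Paris": "LOCATION"})
assert merged == {"Paris": "LOCATION"}
```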