nucliadb-4.0.0.post542-py3-none-any.whl → nucliadb-6.2.1.post2777-py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -23,19 +23,33 @@ import asyncio
23
23
  import logging
24
24
  from concurrent.futures import ThreadPoolExecutor
25
25
  from functools import partial
26
- from typing import TYPE_CHECKING, Any, AsyncIterator, Optional, Type
26
+ from typing import TYPE_CHECKING, Any, AsyncIterator, MutableMapping, Optional, Type
27
27
 
28
+ from nucliadb.common import datamanagers
29
+ from nucliadb.common.datamanagers.resources import KB_RESOURCE_SLUG
30
+ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR, FieldId
31
+ from nucliadb.common.maindb.driver import Transaction
32
+ from nucliadb.ingest.fields.base import Field
33
+ from nucliadb.ingest.fields.conversation import Conversation
34
+ from nucliadb.ingest.fields.file import File
35
+ from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
36
+ from nucliadb.ingest.fields.link import Link
37
+ from nucliadb.ingest.fields.text import Text
38
+ from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
39
+ from nucliadb.ingest.orm.metrics import processor_observer
40
+ from nucliadb_models import content_types
41
+ from nucliadb_models.common import CloudLink
42
+ from nucliadb_models.content_types import GENERIC_MIME_TYPE
43
+ from nucliadb_protos import utils_pb2, writer_pb2
28
44
  from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
29
- from nucliadb_protos.resources_pb2 import Basic
30
- from nucliadb_protos.resources_pb2 import Basic as PBBasic
31
- from nucliadb_protos.resources_pb2 import CloudFile
32
- from nucliadb_protos.resources_pb2 import Conversation as PBConversation
33
- from nucliadb_protos.resources_pb2 import Extra as PBExtra
34
45
  from nucliadb_protos.resources_pb2 import (
46
+ Basic,
47
+ CloudFile,
35
48
  ExtractedTextWrapper,
36
49
  ExtractedVectorsWrapper,
37
50
  FieldClassifications,
38
51
  FieldComputedMetadataWrapper,
52
+ FieldFile,
39
53
  FieldID,
40
54
  FieldMetadata,
41
55
  FieldQuestionAnswerWrapper,
@@ -44,40 +58,27 @@ from nucliadb_protos.resources_pb2 import (
44
58
  FileExtractedData,
45
59
  LargeComputedMetadataWrapper,
46
60
  LinkExtractedData,
61
+ Metadata,
62
+ Paragraph,
63
+ ParagraphAnnotation,
47
64
  )
48
- from nucliadb_protos.resources_pb2 import Metadata
65
+ from nucliadb_protos.resources_pb2 import Basic as PBBasic
66
+ from nucliadb_protos.resources_pb2 import Conversation as PBConversation
67
+ from nucliadb_protos.resources_pb2 import Extra as PBExtra
49
68
  from nucliadb_protos.resources_pb2 import Metadata as PBMetadata
50
69
  from nucliadb_protos.resources_pb2 import Origin as PBOrigin
51
- from nucliadb_protos.resources_pb2 import Paragraph, ParagraphAnnotation
52
70
  from nucliadb_protos.resources_pb2 import Relations as PBRelations
53
- from nucliadb_protos.train_pb2 import EnabledMetadata
54
- from nucliadb_protos.train_pb2 import Position as TrainPosition
55
71
  from nucliadb_protos.train_pb2 import (
72
+ EnabledMetadata,
56
73
  TrainField,
57
74
  TrainMetadata,
58
75
  TrainParagraph,
59
76
  TrainResource,
60
77
  TrainSentence,
61
78
  )
79
+ from nucliadb_protos.train_pb2 import Position as TrainPosition
62
80
  from nucliadb_protos.utils_pb2 import Relation as PBRelation
63
81
  from nucliadb_protos.writer_pb2 import BrokerMessage
64
-
65
- from nucliadb.common import datamanagers
66
- from nucliadb.common.maindb.driver import Transaction
67
- from nucliadb.ingest.fields.base import Field
68
- from nucliadb.ingest.fields.conversation import Conversation
69
- from nucliadb.ingest.fields.date import Datetime
70
- from nucliadb.ingest.fields.file import File
71
- from nucliadb.ingest.fields.generic import VALID_GENERIC_FIELDS, Generic
72
- from nucliadb.ingest.fields.keywordset import Keywordset
73
- from nucliadb.ingest.fields.layout import Layout
74
- from nucliadb.ingest.fields.link import Link
75
- from nucliadb.ingest.fields.text import Text
76
- from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
77
- from nucliadb.ingest.orm.metrics import processor_observer
78
- from nucliadb_models.common import CloudLink
79
- from nucliadb_models.writer import GENERIC_MIME_TYPE
80
- from nucliadb_protos import utils_pb2, writer_pb2
81
82
  from nucliadb_utils.storages.storage import Storage
82
83
 
83
84
  if TYPE_CHECKING: # pragma: no cover
@@ -85,33 +86,14 @@ if TYPE_CHECKING: # pragma: no cover
85
86
 
86
87
  logger = logging.getLogger(__name__)
87
88
 
88
- KB_RESOURCE_FIELDS = "/kbs/{kbid}/r/{uuid}/f/"
89
- KB_RESOURCE_SLUG_BASE = "/kbs/{kbid}/s/"
90
- KB_RESOURCE_SLUG = f"{KB_RESOURCE_SLUG_BASE}{{slug}}"
91
89
  KB_FIELDS: dict[int, Type] = {
92
- FieldType.LAYOUT: Layout,
93
90
  FieldType.TEXT: Text,
94
91
  FieldType.FILE: File,
95
92
  FieldType.LINK: Link,
96
- FieldType.DATETIME: Datetime,
97
- FieldType.KEYWORDSET: Keywordset,
98
93
  FieldType.GENERIC: Generic,
99
94
  FieldType.CONVERSATION: Conversation,
100
95
  }
101
96
 
102
- KB_REVERSE: dict[str, FieldType.ValueType] = {
103
- "l": FieldType.LAYOUT,
104
- "t": FieldType.TEXT,
105
- "f": FieldType.FILE,
106
- "u": FieldType.LINK,
107
- "d": FieldType.DATETIME,
108
- "k": FieldType.KEYWORDSET,
109
- "a": FieldType.GENERIC,
110
- "c": FieldType.CONVERSATION,
111
- }
112
-
113
- FIELD_TYPE_TO_ID = {v: k for k, v in KB_REVERSE.items()}
114
-
115
97
  _executor = ThreadPoolExecutor(10)
116
98
 
117
99
 
@@ -122,6 +104,8 @@ PB_TEXT_FORMAT_TO_MIMETYPE = {
122
104
  FieldText.Format.MARKDOWN: "text/markdown",
123
105
  FieldText.Format.JSON: "application/json",
124
106
  FieldText.Format.KEEP_MARKDOWN: "text/markdown",
107
+ FieldText.Format.JSONL: "application/x-ndjson",
108
+ FieldText.Format.PLAIN_BLANKLINE_SPLIT: "text/plain+blankline",
125
109
  }
126
110
 
127
111
  BASIC_IMMUTABLE_FIELDS = ("icon",)
@@ -173,9 +157,7 @@ class Resource:
173
157
  # Basic
174
158
  async def get_basic(self) -> Optional[PBBasic]:
175
159
  if self.basic is None:
176
- basic = await datamanagers.resources.get_basic(
177
- self.txn, kbid=self.kb.kbid, rid=self.uuid
178
- )
160
+ basic = await datamanagers.resources.get_basic(self.txn, kbid=self.kb.kbid, rid=self.uuid)
179
161
  self.basic = basic if basic is not None else PBBasic()
180
162
  return self.basic
181
163
 
@@ -221,9 +203,7 @@ class Resource:
221
203
  fields.append(field_id)
222
204
  positions[field_id] = i
223
205
 
224
- updated = [
225
- self.basic.fieldmetadata[positions[field]] for field in fields
226
- ]
206
+ updated = [self.basic.fieldmetadata[positions[field]] for field in fields]
227
207
 
228
208
  del self.basic.fieldmetadata[:]
229
209
  self.basic.fieldmetadata.extend(updated)
@@ -244,11 +224,10 @@ class Resource:
244
224
  self.indexer.apply_field_metadata(
245
225
  field_id,
246
226
  field_metadata,
247
- replace_field=[],
248
- replace_splits={},
249
227
  page_positions=page_positions,
250
228
  extracted_text=await field_obj.get_extracted_text(),
251
229
  basic_user_field_metadata=user_field_metadata,
230
+ replace_field=True,
252
231
  )
253
232
 
254
233
  # Some basic fields are computed off field metadata.
@@ -264,9 +243,7 @@ class Resource:
264
243
  # Origin
265
244
  async def get_origin(self) -> Optional[PBOrigin]:
266
245
  if self.origin is None:
267
- origin = await datamanagers.resources.get_origin(
268
- self.txn, kbid=self.kb.kbid, rid=self.uuid
269
- )
246
+ origin = await datamanagers.resources.get_origin(self.txn, kbid=self.kb.kbid, rid=self.uuid)
270
247
  self.origin = origin
271
248
  return self.origin
272
249
 
@@ -280,16 +257,12 @@ class Resource:
280
257
  # Extra
281
258
  async def get_extra(self) -> Optional[PBExtra]:
282
259
  if self.extra is None:
283
- extra = await datamanagers.resources.get_extra(
284
- self.txn, kbid=self.kb.kbid, rid=self.uuid
285
- )
260
+ extra = await datamanagers.resources.get_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid)
286
261
  self.extra = extra
287
262
  return self.extra
288
263
 
289
264
  async def set_extra(self, payload: PBExtra):
290
- await datamanagers.resources.set_extra(
291
- self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload
292
- )
265
+ await datamanagers.resources.set_extra(self.txn, kbid=self.kb.kbid, rid=self.uuid, extra=payload)
293
266
  self.modified = True
294
267
  self.extra = payload
295
268
 
@@ -329,7 +302,7 @@ class Resource:
329
302
  self.relations = relations
330
303
 
331
304
  @processor_observer.wrap({"type": "generate_index_message"})
332
- async def generate_index_message(self) -> ResourceBrain:
305
+ async def generate_index_message(self, reindex: bool = False) -> ResourceBrain:
333
306
  brain = ResourceBrain(rid=self.uuid)
334
307
  origin = await self.get_origin()
335
308
  basic = await self.get_basic()
@@ -339,7 +312,7 @@ class Resource:
339
312
  await self.compute_global_tags(brain)
340
313
  fields = await self.get_fields(force=True)
341
314
  for (type_id, field_id), field in fields.items():
342
- fieldid = FieldID(field_type=type_id, field=field_id) # type: ignore
315
+ fieldid = FieldID(field_type=type_id, field=field_id)
343
316
  await self.compute_global_text_field(fieldid, brain)
344
317
 
345
318
  field_metadata = await field.get_field_metadata()
@@ -355,234 +328,66 @@ class Resource:
355
328
  (
356
329
  fm
357
330
  for fm in basic.fieldmetadata
358
- if fm.field.field == field_id
359
- and fm.field.field_type == type_id
331
+ if fm.field.field == field_id and fm.field.field_type == type_id
360
332
  ),
361
333
  None,
362
334
  )
363
335
  brain.apply_field_metadata(
364
336
  field_key,
365
337
  field_metadata,
366
- replace_field=[],
367
- replace_splits={},
368
338
  page_positions=page_positions,
369
339
  extracted_text=await field.get_extracted_text(),
370
340
  basic_user_field_metadata=user_field_metadata,
341
+ replace_field=reindex,
371
342
  )
372
343
 
373
344
  if self.disable_vectors is False:
345
+ # XXX: while we don't remove the "default" vectorset concept, we
346
+ # need to do use None as the default one
374
347
  vo = await field.get_vectors()
375
348
  if vo is not None:
376
- dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
377
- self.txn, kbid=self.kb.kbid
378
- )
349
+ async with datamanagers.with_ro_transaction() as ro_txn:
350
+ dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
351
+ ro_txn, kbid=self.kb.kbid
352
+ )
379
353
  brain.apply_field_vectors(
380
354
  field_key,
381
355
  vo,
382
356
  matryoshka_vector_dimension=dimension,
357
+ replace_field=reindex,
383
358
  )
384
- return brain
385
-
386
- async def generate_field_vectors(
387
- self,
388
- bm: BrokerMessage,
389
- type_id: FieldType.ValueType,
390
- field_id: str,
391
- field: Field,
392
- ):
393
- vo = await field.get_vectors()
394
- if vo is None:
395
- return
396
- evw = ExtractedVectorsWrapper()
397
- evw.field.field = field_id
398
- evw.field.field_type = type_id # type: ignore
399
- evw.vectors.CopyFrom(vo)
400
- bm.field_vectors.append(evw)
401
-
402
- async def generate_field_large_computed_metadata(
403
- self,
404
- bm: BrokerMessage,
405
- type_id: FieldType.ValueType,
406
- field_id: str,
407
- field: Field,
408
- ):
409
- lcm = await field.get_large_field_metadata()
410
- if lcm is None:
411
- return
412
- lcmw = LargeComputedMetadataWrapper()
413
- lcmw.field.field = field_id
414
- lcmw.field.field_type = type_id # type: ignore
415
- lcmw.real.CopyFrom(lcm)
416
- bm.field_large_metadata.append(lcmw)
417
-
418
- async def generate_field_computed_metadata(
419
- self,
420
- bm: BrokerMessage,
421
- type_id: FieldType.ValueType,
422
- field_id: str,
423
- field: Field,
424
- ):
425
- fcmw = FieldComputedMetadataWrapper()
426
- fcmw.field.field = field_id
427
- fcmw.field.field_type = type_id # type: ignore
428
-
429
- field_metadata = await field.get_field_metadata()
430
- if field_metadata is not None:
431
- fcmw.metadata.CopyFrom(field_metadata)
432
- fcmw.field.field = field_id
433
- fcmw.field.field_type = type_id # type: ignore
434
- bm.field_metadata.append(fcmw)
435
- # Make sure cloud files are removed for exporting
436
-
437
- async def generate_extracted_text(
438
- self,
439
- bm: BrokerMessage,
440
- type_id: FieldType.ValueType,
441
- field_id: str,
442
- field: Field,
443
- ):
444
- etw = ExtractedTextWrapper()
445
- etw.field.field = field_id
446
- etw.field.field_type = type_id # type: ignore
447
- extracted_text = await field.get_extracted_text()
448
- if extracted_text is not None:
449
- etw.body.CopyFrom(extracted_text)
450
- bm.extracted_text.append(etw)
451
-
452
- async def generate_field(
453
- self,
454
- bm: BrokerMessage,
455
- type_id: FieldType.ValueType,
456
- field_id: str,
457
- field: Field,
458
- ):
459
- # Used for exporting a field
460
- if type_id == FieldType.TEXT:
461
- value = await field.get_value()
462
- bm.texts[field_id].CopyFrom(value)
463
- elif type_id == FieldType.LINK:
464
- value = await field.get_value()
465
- bm.links[field_id].CopyFrom(value)
466
- elif type_id == FieldType.FILE:
467
- value = await field.get_value()
468
- bm.files[field_id].CopyFrom(value)
469
- elif type_id == FieldType.CONVERSATION:
470
- value = await self.get_full_conversation(field) # type: ignore
471
- bm.conversations[field_id].CopyFrom(value)
472
- elif type_id == FieldType.KEYWORDSET:
473
- value = await field.get_value()
474
- bm.keywordsets[field_id].CopyFrom(value)
475
- elif type_id == FieldType.DATETIME:
476
- value = await field.get_value()
477
- bm.datetimes[field_id].CopyFrom(value)
478
- elif type_id == FieldType.LAYOUT:
479
- value = await field.get_value()
480
- bm.layouts[field_id].CopyFrom(value)
481
-
482
- async def get_full_conversation(
483
- self,
484
- conversation_field: Conversation,
485
- ) -> Optional[PBConversation]:
486
- """
487
- Messages of a conversations may be stored across several pages.
488
- This method fetches them all and returns a single complete conversation.
489
- """
490
- full_conv = PBConversation()
491
- n_page = 1
492
- while True:
493
- page = await conversation_field.get_value(page=n_page)
494
- if page is None:
495
- break
496
- full_conv.messages.extend(page.messages)
497
- n_page += 1
498
- return full_conv
499
-
500
- async def generate_broker_message(self) -> BrokerMessage:
501
- # full means downloading all the pointers
502
- # minuts the ones to external files that are not PB
503
- # Go for all fields and recreate brain
504
- bm = BrokerMessage()
505
- bm.kbid = self.kb.kbid
506
- bm.uuid = self.uuid
507
- basic = await self.get_basic()
508
- if basic is not None:
509
- bm.basic.CopyFrom(basic)
510
- bm.slug = bm.basic.slug
511
- origin = await self.get_origin()
512
- if origin is not None:
513
- bm.origin.CopyFrom(origin)
514
- relations = await self.get_relations()
515
- if relations is not None:
516
- for relation in relations.relations:
517
- bm.relations.append(relation)
518
-
519
- fields = await self.get_fields(force=True)
520
- for (type_id, field_id), field in fields.items():
521
- # Value
522
- await self.generate_field(bm, type_id, field_id, field)
523
-
524
- # Extracted text
525
- await self.generate_extracted_text(bm, type_id, field_id, field)
526
-
527
- # Field Computed Metadata
528
- await self.generate_field_computed_metadata(bm, type_id, field_id, field)
529
-
530
- if type_id == FieldType.FILE and isinstance(field, File):
531
- field_extracted_data = await field.get_file_extracted_data()
532
- if field_extracted_data is not None:
533
- bm.file_extracted_data.append(field_extracted_data)
534
359
 
535
- elif type_id == FieldType.LINK and isinstance(field, Link):
536
- link_extracted_data = await field.get_link_extracted_data()
537
- if link_extracted_data is not None:
538
- bm.link_extracted_data.append(link_extracted_data)
539
-
540
- # Field vectors
541
- await self.generate_field_vectors(bm, type_id, field_id, field)
542
-
543
- # Large metadata
544
- await self.generate_field_large_computed_metadata(
545
- bm, type_id, field_id, field
546
- )
547
-
548
- return bm
360
+ vectorset_configs = []
361
+ async with datamanagers.with_ro_transaction() as ro_txn:
362
+ async for vectorset_id, vectorset_config in datamanagers.vectorsets.iter(
363
+ ro_txn, kbid=self.kb.kbid
364
+ ):
365
+ vectorset_configs.append(vectorset_config)
366
+ for vectorset_config in vectorset_configs:
367
+ vo = await field.get_vectors(vectorset=vectorset_config.vectorset_id)
368
+ if vo is not None:
369
+ dimension = vectorset_config.vectorset_index_config.vector_dimension
370
+ brain.apply_field_vectors(
371
+ field_key,
372
+ vo,
373
+ vectorset=vectorset_config.vectorset_id,
374
+ matryoshka_vector_dimension=dimension,
375
+ replace_field=reindex,
376
+ )
377
+ return brain
549
378
 
550
379
  # Fields
551
- async def get_fields(
552
- self, force: bool = False
553
- ) -> dict[tuple[FieldType.ValueType, str], Field]:
380
+ async def get_fields(self, force: bool = False) -> dict[tuple[FieldType.ValueType, str], Field]:
554
381
  # Get all fields
555
382
  for type, field in await self.get_fields_ids(force=force):
556
383
  if (type, field) not in self.fields:
557
384
  self.fields[(type, field)] = await self.get_field(field, type)
558
385
  return self.fields
559
386
 
560
- async def _deprecated_scan_fields_ids(
561
- self,
562
- ) -> AsyncIterator[tuple[FieldType.ValueType, str]]:
563
- logger.warning("Scanning fields ids. This is not optimal.")
564
- prefix = KB_RESOURCE_FIELDS.format(kbid=self.kb.kbid, uuid=self.uuid)
565
- allfields = set()
566
- async for key in self.txn.keys(prefix, count=-1):
567
- # The [6:8] `slicing purpose is to match exactly the two
568
- # splitted parts corresponding to type and field, and nothing else!
569
- type, field = key.split("/")[6:8]
570
- type_id = KB_REVERSE.get(type)
571
- if type_id is None:
572
- raise AttributeError("Invalid field type")
573
- result = (type_id, field)
574
- if result not in allfields:
575
- # fields can have errors that are stored in a subkey:
576
- # - field key -> kbs/kbid/r/ruuid/f/myfield
577
- # - field error key -> kbs/kbid/r/ruuid/f/myfield/errors
578
- # and that would return duplicates here.
579
- yield result
580
- allfields.add(result)
581
-
582
387
  async def _inner_get_fields_ids(self) -> list[tuple[FieldType.ValueType, str]]:
583
388
  # Use a set to make sure we don't have duplicate field ids
584
389
  result = set()
585
- all_fields = await self.get_all_field_ids()
390
+ all_fields = await self.get_all_field_ids(for_update=False)
586
391
  if all_fields is not None:
587
392
  for f in all_fields.fields:
588
393
  result.add((f.field_type, f.field))
@@ -599,9 +404,7 @@ class Resource:
599
404
  result.add((FieldType.GENERIC, generic))
600
405
  return list(result)
601
406
 
602
- async def get_fields_ids(
603
- self, force: bool = False
604
- ) -> list[tuple[FieldType.ValueType, str]]:
407
+ async def get_fields_ids(self, force: bool = False) -> list[tuple[FieldType.ValueType, str]]:
605
408
  """
606
409
  Get all ids of the fields of the resource and cache them.
607
410
  """
@@ -645,23 +448,20 @@ class Resource:
645
448
  if field in self.all_fields_keys:
646
449
  self.all_fields_keys.remove(field)
647
450
 
648
- field_key = self.generate_field_id(FieldID(field_type=type, field=key)) # type: ignore
649
- vo = await field_obj.get_vectors()
650
- if vo is not None:
651
- self.indexer.delete_vectors(field_key=field_key, vo=vo)
451
+ field_key = self.generate_field_id(FieldID(field_type=type, field=key))
652
452
 
653
453
  metadata = await field_obj.get_field_metadata()
654
454
  if metadata is not None:
655
- self.indexer.delete_metadata(field_key=field_key, metadata=metadata)
455
+ self.indexer.delete_field(field_key=field_key)
656
456
 
657
457
  await field_obj.delete()
658
458
 
659
459
  def has_field(self, type: FieldType.ValueType, field: str) -> bool:
660
460
  return (type, field) in self.fields
661
461
 
662
- async def get_all_field_ids(self) -> Optional[PBAllFieldIDs]:
462
+ async def get_all_field_ids(self, *, for_update: bool) -> Optional[PBAllFieldIDs]:
663
463
  return await datamanagers.resources.get_all_field_ids(
664
- self.txn, kbid=self.kb.kbid, rid=self.uuid
464
+ self.txn, kbid=self.kb.kbid, rid=self.uuid, for_update=for_update
665
465
  )
666
466
 
667
467
  async def set_all_field_ids(self, all_fields: PBAllFieldIDs):
@@ -677,7 +477,7 @@ class Resource:
677
477
  errors: Optional[list[writer_pb2.Error]] = None,
678
478
  ):
679
479
  needs_update = False
680
- all_fields = await self.get_all_field_ids()
480
+ all_fields = await self.get_all_field_ids(for_update=True)
681
481
  if all_fields is None:
682
482
  needs_update = True
683
483
  all_fields = PBAllFieldIDs()
@@ -704,26 +504,12 @@ class Resource:
704
504
  @processor_observer.wrap({"type": "apply_fields"})
705
505
  async def apply_fields(self, message: BrokerMessage):
706
506
  message_updated_fields = []
707
- for field, layout in message.layouts.items():
708
- fid = FieldID(field_type=FieldType.LAYOUT, field=field)
709
- await self.set_field(fid.field_type, fid.field, layout)
710
- message_updated_fields.append(fid)
711
507
 
712
508
  for field, text in message.texts.items():
713
509
  fid = FieldID(field_type=FieldType.TEXT, field=field)
714
510
  await self.set_field(fid.field_type, fid.field, text)
715
511
  message_updated_fields.append(fid)
716
512
 
717
- for field, keywordset in message.keywordsets.items():
718
- fid = FieldID(field_type=FieldType.KEYWORDSET, field=field)
719
- await self.set_field(fid.field_type, fid.field, keywordset)
720
- message_updated_fields.append(fid)
721
-
722
- for field, datetimeobj in message.datetimes.items():
723
- fid = FieldID(field_type=FieldType.DATETIME, field=field)
724
- await self.set_field(fid.field_type, fid.field, datetimeobj)
725
- message_updated_fields.append(fid)
726
-
727
513
  for field, link in message.links.items():
728
514
  fid = FieldID(field_type=FieldType.LINK, field=field)
729
515
  await self.set_field(fid.field_type, fid.field, link)
@@ -742,13 +528,11 @@ class Resource:
742
528
  for fieldid in message.delete_fields:
743
529
  await self.delete_field(fieldid.field_type, fieldid.field)
744
530
 
745
- if (
746
- len(message_updated_fields)
747
- or len(message.delete_fields)
748
- or len(message.errors)
749
- ):
531
+ if len(message_updated_fields) or len(message.delete_fields) or len(message.errors):
750
532
  await self.update_all_field_ids(
751
- updated=message_updated_fields, deleted=message.delete_fields, errors=message.errors # type: ignore
533
+ updated=message_updated_fields,
534
+ deleted=message.delete_fields, # type: ignore
535
+ errors=message.errors, # type: ignore
752
536
  )
753
537
 
754
538
  @processor_observer.wrap({"type": "apply_extracted"})
@@ -784,13 +568,15 @@ class Resource:
784
568
 
785
569
  for link_extracted_data in message.link_extracted_data:
786
570
  await self._apply_link_extracted_data(link_extracted_data)
787
- await self.maybe_update_title_metadata(link_extracted_data)
571
+ await self.maybe_update_resource_title_from_link(link_extracted_data)
788
572
  extracted_languages.append(link_extracted_data.language)
789
573
 
790
574
  for file_extracted_data in message.file_extracted_data:
791
575
  await self._apply_file_extracted_data(file_extracted_data)
792
576
  extracted_languages.append(file_extracted_data.language)
793
577
 
578
+ await self.maybe_update_resource_title_from_file_extracted_data(message)
579
+
794
580
  # Metadata should go first
795
581
  for field_metadata in message.field_metadata:
796
582
  await self._apply_field_computed_metadata(field_metadata)
@@ -801,6 +587,7 @@ class Resource:
801
587
  # Upload to binary storage
802
588
  # Vector indexing
803
589
  if self.disable_vectors is False:
590
+ await self.get_fields(force=True)
804
591
  for field_vectors in message.field_vectors:
805
592
  await self._apply_extracted_vectors(field_vectors)
806
593
 
@@ -826,9 +613,7 @@ class Resource:
826
613
  extracted_text.field,
827
614
  )
828
615
 
829
- async def _apply_question_answers(
830
- self, question_answers: FieldQuestionAnswerWrapper
831
- ):
616
+ async def _apply_question_answers(self, question_answers: FieldQuestionAnswerWrapper):
832
617
  field = question_answers.field
833
618
  field_obj = await self.get_field(field.field, field.field_type, load=False)
834
619
  await field_obj.set_question_answers(question_answers)
@@ -848,19 +633,27 @@ class Resource:
848
633
 
849
634
  maybe_update_basic_summary(self.basic, link_extracted_data.description)
850
635
 
851
- async def maybe_update_title_metadata(self, link_extracted_data: LinkExtractedData):
636
+ async def maybe_update_resource_title_from_link(self, link_extracted_data: LinkExtractedData):
637
+ """
638
+ When parsing link extracted data, we want to replace the resource title for the first link
639
+ that gets processed and has a title, and only if the current title is a URL, which we take
640
+ as a hint that the title was not set by the user.
641
+ """
852
642
  assert self.basic is not None
853
643
  if not link_extracted_data.title:
854
644
  return
855
645
  if not (self.basic.title.startswith("http") or self.basic.title == ""):
856
646
  return
857
-
858
647
  title = link_extracted_data.title
859
- self.basic.title = title
648
+ await self.update_resource_title(title)
649
+
650
+ async def update_resource_title(self, computed_title: str) -> None:
651
+ assert self.basic is not None
652
+ self.basic.title = computed_title
860
653
  # Extracted text
861
654
  field = await self.get_field("title", FieldType.GENERIC, load=False)
862
655
  etw = ExtractedTextWrapper()
863
- etw.body.text = title
656
+ etw.body.text = computed_title
864
657
  await field.set_extracted_text(etw)
865
658
 
866
659
  # Field computed metadata
@@ -872,11 +665,8 @@ class Resource:
872
665
  fcm = await field.get_field_metadata(force=True)
873
666
  if fcm is not None:
874
667
  fcmw.metadata.CopyFrom(fcm)
875
-
876
668
  fcmw.metadata.metadata.ClearField("paragraphs")
877
- paragraph = Paragraph(
878
- start=0, end=len(title), kind=Paragraph.TypeParagraph.TITLE
879
- )
669
+ paragraph = Paragraph(start=0, end=len(computed_title), kind=Paragraph.TypeParagraph.TITLE)
880
670
  fcmw.metadata.metadata.paragraphs.append(paragraph)
881
671
 
882
672
  await field.set_field_metadata(fcmw)
@@ -893,9 +683,54 @@ class Resource:
893
683
  maybe_update_basic_icon(self.basic, file_extracted_data.icon)
894
684
  maybe_update_basic_thumbnail(self.basic, file_extracted_data.file_thumbnail)
895
685
 
896
- async def _apply_field_computed_metadata(
897
- self, field_metadata: FieldComputedMetadataWrapper
898
- ):
686
+ async def _should_update_resource_title_from_file_metadata(self) -> bool:
687
+ """
688
+ We only want to update resource title from file metadata if the title is empty,
689
+ equal to the resource uuid or equal to any of the file filenames in the resource.
690
+ """
691
+ basic = await self.get_basic()
692
+ if basic is None:
693
+ return True
694
+ current_title = basic.title
695
+ if current_title == "":
696
+ # If the title is empty, we should update it
697
+ return True
698
+ if current_title == self.uuid:
699
+ # If the title is the same as the resource uuid, we should update it
700
+ return True
701
+ fields = await self.get_fields(force=True)
702
+ filenames = set()
703
+ for (field_type, _), field_obj in fields.items():
704
+ if field_type == FieldType.FILE:
705
+ field_value: Optional[FieldFile] = await field_obj.get_value()
706
+ if field_value is not None:
707
+ if field_value.file.filename not in ("", None):
708
+ filenames.add(field_value.file.filename)
709
+ if current_title in filenames:
710
+ # If the title is equal to any of the file filenames, we should update it
711
+ return True
712
+ return False
713
+
714
+ async def maybe_update_resource_title_from_file_extracted_data(self, message: BrokerMessage):
715
+ """
716
+ Update the resource title with the first file that has a title extracted.
717
+ """
718
+ if not await self._should_update_resource_title_from_file_metadata():
719
+ return
720
+ for fed in message.file_extracted_data:
721
+ if fed.title == "":
722
+ # Skip if the extracted title is empty
723
+ continue
724
+ fid = FieldId.from_pb(rid=self.uuid, field_type=FieldType.FILE, key=fed.field)
725
+ logger.info(
726
+ "Updating resource title from file extracted data",
727
+ extra={"kbid": self.kb.kbid, "field": fid.full(), "new_title": fed.title},
728
+ )
729
+ await self.update_resource_title(fed.title)
730
+ # Break after the first file with a title is found
731
+ break
732
+
733
+ async def _apply_field_computed_metadata(self, field_metadata: FieldComputedMetadataWrapper):
899
734
  assert self.basic is not None
900
735
  maybe_update_basic_summary(self.basic, field_metadata.metadata.metadata.summary)
901
736
 
@@ -904,17 +739,11 @@ class Resource:
904
739
  field_metadata.field.field_type,
905
740
  load=False,
906
741
  )
907
- (
908
- metadata,
909
- replace_field,
910
- replace_splits,
911
- ) = await field_obj.set_field_metadata(field_metadata)
742
+ metadata = await field_obj.set_field_metadata(field_metadata)
912
743
  field_key = self.generate_field_id(field_metadata.field)
913
744
 
914
745
  page_positions: Optional[FilePagePositions] = None
915
- if field_metadata.field.field_type == FieldType.FILE and isinstance(
916
- field_obj, File
917
- ):
746
+ if field_metadata.field.field_type == FieldType.FILE and isinstance(field_obj, File):
918
747
  page_positions = await get_file_page_positions(field_obj)
919
748
 
920
749
  user_field_metadata = next(
@@ -932,29 +761,24 @@ class Resource:
932
761
  self.indexer.apply_field_metadata,
933
762
  field_key,
934
763
  metadata,
935
- replace_field=replace_field,
936
- replace_splits=replace_splits,
937
764
  page_positions=page_positions,
938
765
  extracted_text=extracted_text,
939
766
  basic_user_field_metadata=user_field_metadata,
767
+ replace_field=True,
940
768
  )
941
769
  loop = asyncio.get_running_loop()
942
770
  await loop.run_in_executor(_executor, apply_field_metadata)
943
771
 
944
- maybe_update_basic_thumbnail(
945
- self.basic, field_metadata.metadata.metadata.thumbnail
946
- )
772
+ maybe_update_basic_thumbnail(self.basic, field_metadata.metadata.metadata.thumbnail)
947
773
 
948
774
  add_field_classifications(self.basic, field_metadata)
949
775
 
950
776
  async def _apply_extracted_vectors(self, field_vectors: ExtractedVectorsWrapper):
951
- if not self.has_field(
952
- field_vectors.field.field_type, field_vectors.field.field
953
- ):
777
+ # Store vectors in the resource
778
+
779
+ if not self.has_field(field_vectors.field.field_type, field_vectors.field.field):
954
780
  # skipping because field does not exist
955
- logger.warning(
956
- f'Field "{field_vectors.field.field}" does not exist, skipping vectors'
957
- )
781
+ logger.warning(f'Field "{field_vectors.field.field}" does not exist, skipping vectors')
958
782
  return
959
783
 
960
784
  field_obj = await self.get_field(
@@ -962,22 +786,36 @@ class Resource:
962
786
  field_vectors.field.field_type,
963
787
  load=False,
964
788
  )
965
- (
966
- vo,
967
- replace_field_sentences,
968
- replace_splits_sentences,
969
- ) = await field_obj.set_vectors(field_vectors)
789
+ vo = await field_obj.set_vectors(field_vectors)
790
+
791
+ # Prepare vectors to be indexed
792
+
970
793
  field_key = self.generate_field_id(field_vectors.field)
971
794
  if vo is not None:
972
- dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
973
- self.txn, kbid=self.kb.kbid
974
- )
795
+ vectorset_id = field_vectors.vectorset_id or None
796
+ if vectorset_id is None:
797
+ dimension = await datamanagers.kb.get_matryoshka_vector_dimension(
798
+ self.txn, kbid=self.kb.kbid
799
+ )
800
+ else:
801
+ config = await datamanagers.vectorsets.get(
802
+ self.txn, kbid=self.kb.kbid, vectorset_id=vectorset_id
803
+ )
804
+ if config is None:
805
+ logger.warning(
806
+ f"Trying to apply a resource on vectorset '{vectorset_id}' that doesn't exist."
807
+ )
808
+ return
809
+ dimension = config.vectorset_index_config.vector_dimension
810
+ if not dimension:
811
+ raise ValueError(f"Vector dimension not set for vectorset '{vectorset_id}'")
812
+
975
813
  apply_field_vectors_partial = partial(
976
814
  self.indexer.apply_field_vectors,
977
815
  field_key,
978
816
  vo,
979
- replace_field=replace_field_sentences,
980
- replace_splits=replace_splits_sentences,
817
+ vectorset=vectorset_id,
818
+ replace_field=True,
981
819
  matryoshka_vector_dimension=dimension,
982
820
  )
983
821
  loop = asyncio.get_running_loop()
@@ -985,9 +823,7 @@ class Resource:
985
823
  else:
986
824
  raise AttributeError("VO not found on set")
987
825
 
988
- async def _apply_field_large_metadata(
989
- self, field_large_metadata: LargeComputedMetadataWrapper
990
- ):
826
+ async def _apply_field_large_metadata(self, field_large_metadata: LargeComputedMetadataWrapper):
991
827
  field_obj = await self.get_field(
992
828
  field_large_metadata.field.field,
993
829
  field_large_metadata.field.field_type,
@@ -996,7 +832,7 @@ class Resource:
996
832
  await field_obj.set_large_field_metadata(field_large_metadata)
997
833
 
998
834
  def generate_field_id(self, field: FieldID) -> str:
999
- return f"{FIELD_TYPE_TO_ID[field.field_type]}/{field.field}"
835
+ return f"{FIELD_TYPE_PB_TO_STR[field.field_type]}/{field.field}"
1000
836
 
1001
837
  async def compute_security(self, brain: ResourceBrain):
1002
838
  security = await self.get_security()
@@ -1015,7 +851,7 @@ class Resource:
         brain.set_resource_metadata(basic=basic, origin=origin)
         for type, field in await self.get_fields_ids(force=True):
             fieldobj = await self.get_field(field, type, load=False)
-            fieldid = FieldID(field_type=type, field=field)  # type: ignore
+            fieldid = FieldID(field_type=type, field=field)
             fieldkey = self.generate_field_id(fieldid)
             extracted_metadata = await fieldobj.get_field_metadata()
             valid_user_field_metadata = None
@@ -1026,16 +862,16 @@ class Resource:
                 ):
                     valid_user_field_metadata = user_field_metadata
                     break
+
+            generated_by = await fieldobj.generated_by()
             brain.apply_field_labels(
                 fieldkey,
                 extracted_metadata,
                 self.uuid,
+                generated_by,
                 basic.usermetadata,
                 valid_user_field_metadata,
             )
-            if type == FieldType.KEYWORDSET:
-                field_data = await fieldobj.db_get_value()
-                brain.process_keywordset_fields(fieldkey, field_data)
 
     @processor_observer.wrap({"type": "compute_global_text"})
     async def compute_global_text(self):
@@ -1072,12 +908,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1092,9 +926,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1105,7 +937,7 @@ class Resource:
 
                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities.update(field_metadata.ner)
+                    _update_entities_dict(entities, field_metadata)
 
                 precomputed_vectors = {}
                 if vo is not None:
@@ -1116,9 +948,7 @@ class Resource:
                     vectors = vo.vectors
                     base_vector_key = f"{self.uuid}/{field_key}"
                     for index, vector in enumerate(vectors.vectors):
-                        vector_key = (
-                            f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
-                        )
+                        vector_key = f"{base_vector_key}/{index}/{vector.start}-{vector.end}"
                         precomputed_vectors[vector_key] = vector.vector
 
                 if extracted_text is not None:
@@ -1129,11 +959,11 @@ class Resource:
 
                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
 
                     if enabled_metadata.labels:
                         metadata.labels.ClearField("field")
@@ -1147,7 +977,9 @@ class Resource:
                         if subfield is not None:
                             sentence_key = f"{self.uuid}/{field_key}/{subfield}/{index}/{sentence.start}-{sentence.end}"
                         else:
-                            sentence_key = f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                            sentence_key = (
+                                f"{self.uuid}/{field_key}/{index}/{sentence.start}-{sentence.end}"
+                            )
 
                         if vo is not None:
                             metadata.ClearField("vector")
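Note: the reflowed keys in the hunks above all share one layout, differing only in the optional subfield segment and the sentence/vector index. A sketch with hypothetical values (formats copied from the hunks):

    # uuid="rid-123", field_key="t/body", subfield="split0", index=0, span 10-42
    "rid-123/t/body/0/10-42"         # vector key: {rid}/{field}/{index}/{start}-{end}
    "rid-123/t/body/10-42"           # paragraph key, no subfield
    "rid-123/t/body/split0/10-42"    # paragraph key with subfield
    "rid-123/t/body/0/10-42"         # sentence key, no subfield (same shape as a vector key)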
@@ -1186,12 +1018,10 @@ class Resource:
         for fieldmetadata in self.basic.fieldmetadata:
             field_id = self.generate_field_id(fieldmetadata.field)
             for annotationparagraph in fieldmetadata.paragraphs:
-                userdefinedparagraphclass[annotationparagraph.key] = (
-                    annotationparagraph
-                )
+                userdefinedparagraphclass[annotationparagraph.key] = annotationparagraph
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             field_key = self.generate_field_id(fieldid)
             fm = await field.get_field_metadata()
             extracted_text = None
@@ -1202,9 +1032,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1215,7 +1043,7 @@ class Resource:
 
                 entities: dict[str, str] = {}
                 if enabled_metadata.entities:
-                    entities.update(field_metadata.ner)
+                    _update_entities_dict(entities, field_metadata)
 
                 if extracted_text is not None:
                     if subfield is not None:
@@ -1225,11 +1053,11 @@ class Resource:
 
                 for paragraph in field_metadata.paragraphs:
                     if subfield is not None:
-                        paragraph_key = f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                    else:
                         paragraph_key = (
-                            f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
+                            f"{self.uuid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
                         )
+                    else:
+                        paragraph_key = f"{self.uuid}/{field_key}/{paragraph.start}-{paragraph.end}"
 
                     if enabled_metadata.labels:
                         metadata.labels.ClearField("paragraph")
@@ -1257,9 +1085,7 @@ class Resource:
 
                     yield pb_paragraph
 
-    async def iterate_fields(
-        self, enabled_metadata: EnabledMetadata
-    ) -> AsyncIterator[TrainField]:
+    async def iterate_fields(self, enabled_metadata: EnabledMetadata) -> AsyncIterator[TrainField]:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1269,7 +1095,7 @@ class Resource:
             metadata.labels.resource.extend(self.basic.usermetadata.classifications)
 
         for (type_id, field_id), field in fields.items():
-            fieldid = FieldID(field_type=type_id, field=field_id)  # type: ignore
+            fieldid = FieldID(field_type=type_id, field=field_id)
             fm = await field.get_field_metadata()
             extracted_text = None
 
@@ -1279,9 +1105,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1298,7 +1122,7 @@ class Resource:
 
             if enabled_metadata.entities:
                 metadata.ClearField("entities")
-                metadata.entities.update(splitted_metadata.ner)
+                _update_entities_dict(metadata.entities, splitted_metadata)
 
             pb_field = TrainField()
             pb_field.uuid = self.uuid
@@ -1306,9 +1130,7 @@ class Resource:
             pb_field.metadata.CopyFrom(metadata)
             yield pb_field
 
-    async def generate_train_resource(
-        self, enabled_metadata: EnabledMetadata
-    ) -> TrainResource:
+    async def generate_train_resource(self, enabled_metadata: EnabledMetadata) -> TrainResource:
         fields = await self.get_fields(force=True)
         metadata = TrainMetadata()
         if enabled_metadata.labels:
@@ -1335,9 +1157,7 @@ class Resource:
             if fm is None:
                 continue
 
-            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [
-                (None, fm.metadata)
-            ]
+            field_metadatas: list[tuple[Optional[str], FieldMetadata]] = [(None, fm.metadata)]
             for subfield_metadata, splitted_metadata in fm.split_metadata.items():
                 field_metadatas.append((subfield_metadata, splitted_metadata))
 
@@ -1346,7 +1166,7 @@ class Resource:
                     metadata.labels.field.extend(splitted_metadata.classifications)
 
                 if enabled_metadata.entities:
-                    metadata.entities.update(splitted_metadata.ner)
+                    _update_entities_dict(metadata.entities, splitted_metadata)
 
             pb_resource = TrainResource()
             pb_resource.uuid = self.uuid
@@ -1375,33 +1195,35 @@ def remove_field_classifications(basic: PBBasic, deleted_fields: list[FieldID]):
     Clean classifications of fields that have been deleted
     """
     field_classifications = [
-        fc
-        for fc in basic.computedmetadata.field_classifications
-        if fc.field not in deleted_fields
+        fc for fc in basic.computedmetadata.field_classifications if fc.field not in deleted_fields
     ]
     basic.computedmetadata.ClearField("field_classifications")
     basic.computedmetadata.field_classifications.extend(field_classifications)
 
 
-def add_field_classifications(
-    basic: PBBasic, fcmw: FieldComputedMetadataWrapper
-) -> bool:
+def add_field_classifications(basic: PBBasic, fcmw: FieldComputedMetadataWrapper) -> bool:
     """
     Returns whether some new field classifications were added
     """
-    if len(fcmw.metadata.metadata.classifications) == 0:
+    if len(fcmw.metadata.metadata.classifications) == 0 and all(
+        len(split.classifications) == 0 for split in fcmw.metadata.split_metadata.values()
+    ):
         return False
+
     remove_field_classifications(basic, [fcmw.field])
     fcfs = FieldClassifications()
     fcfs.field.CopyFrom(fcmw.field)
     fcfs.classifications.extend(fcmw.metadata.metadata.classifications)
+
+    for split_id, split in fcmw.metadata.split_metadata.items():
+        if split_id not in fcmw.metadata.deleted_splits:
+            fcfs.classifications.extend(split.classifications)
+
     basic.computedmetadata.field_classifications.append(fcfs)
     return True
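Note: add_field_classifications now treats split (subfield) classifications as first-class: a field whose classifications live only in split_metadata is no longer reported as having none, and non-deleted splits contribute to the stored FieldClassifications. The widened guard can be read as (a restatement of the hunk above, not new code):

    has_field_level = len(fcmw.metadata.metadata.classifications) > 0
    has_split_level = any(
        len(split.classifications) > 0
        for split in fcmw.metadata.split_metadata.values()
    )
    if not (has_field_level or has_split_level):
        return False  # nothing to add at either level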
 
 
-def add_entities_to_metadata(
-    entities: dict[str, str], local_text: str, metadata: TrainMetadata
-) -> None:
+def add_entities_to_metadata(entities: dict[str, str], local_text: str, metadata: TrainMetadata) -> None:
     for entity_key, entity_value in entities.items():
         if entity_key not in local_text:
             # Add the entity only if found in text
@@ -1415,9 +1237,7 @@ def add_entities_to_metadata(
         for _ in range(local_text.count(entity_key)):
             start = local_text.index(entity_key, last_occurrence_end)
             end = start + len(entity_key)
-            metadata.entity_positions[poskey].positions.append(
-                TrainPosition(start=start, end=end)
-            )
+            metadata.entity_positions[poskey].positions.append(TrainPosition(start=start, end=end))
             last_occurrence_end = end
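Note: the position loop above enumerates non-overlapping occurrences by moving the lower bound of str.index past each match; str.count also counts non-overlapping matches, so the two stay in step. A self-contained sketch of the same scan (helper name invented):

    def find_occurrences(text: str, needle: str) -> list[tuple[int, int]]:
        # Advance past each match so repeated occurrences get distinct positions
        positions = []
        last_end = 0
        for _ in range(text.count(needle)):
            start = text.index(needle, last_end)
            end = start + len(needle)
            positions.append((start, end))
            last_end = end
        return positions

    find_occurrences("cat and cat", "cat")  # -> [(0, 3), (8, 11)]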
 
 
@@ -1432,15 +1252,22 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True
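Note: maybe_update_basic_icon now refuses to store a mimetype that fails content_types.valid, logging a warning instead of silently accepting bad input. Expected outcomes, assuming valid() is a syntactic MIME-type check (inputs are hypothetical):

    maybe_update_basic_icon(basic, None)               # False: no mimetype provided
    maybe_update_basic_icon(basic, "not a mimetype")   # False: rejected by content_types.valid, warning logged
    maybe_update_basic_icon(basic, "application/pdf")  # True: icon updated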
 
 
-def maybe_update_basic_thumbnail(
-    basic: PBBasic, thumbnail: Optional[CloudFile]
-) -> bool:
+def maybe_update_basic_thumbnail(basic: PBBasic, thumbnail: Optional[CloudFile]) -> bool:
     if basic.thumbnail or thumbnail is None:
         return False
     basic.thumbnail = CloudLink.format_reader_download_uri(thumbnail.uri)
@@ -1482,3 +1309,23 @@ def extract_field_metadata_languages(
     for _, splitted_metadata in field_metadata.metadata.split_metadata.items():
         languages.add(splitted_metadata.language)
     return list(languages)
+
+
+def _update_entities_dict(target_entites_dict: MutableMapping[str, str], field_metadata: FieldMetadata):
+    """
+    Update the entities dict with the entities from the field metadata.
+    Method created to ease the transition from legacy ner field to new entities field.
+    """
+    # Data Augmentation + Processor entities
+    # This will overwrite entities detected from more than one data augmentation task
+    # TODO: Change TrainMetadata proto to accept multiple entities with the same text
+    entity_map = {
+        entity.text: entity.label
+        for data_augmentation_task_id, entities_wrapper in field_metadata.entities.items()
+        for entity in entities_wrapper.entities
+    }
+    target_entites_dict.update(entity_map)
+
+    # Legacy processor entities
+    # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+    target_entites_dict.update(field_metadata.ner)
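Note: _update_entities_dict flattens both entity sources into one text-to-label mapping: data-augmentation entities are applied first and the legacy ner map last, so on a collision the legacy processor value wins. A minimal usage sketch mirroring the call sites switched over in this diff (the field_metadata value is hypothetical):

    entities: dict[str, str] = {}
    _update_entities_dict(entities, field_metadata)
    # entities now maps entity text to label, e.g. {"Barcelona": "CITY"}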