nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/brain.py
@@ -22,17 +22,25 @@ from copy import deepcopy
 from dataclasses import dataclass
 from typing import Optional
 
+from nucliadb.common import ids
+from nucliadb.ingest import logger
+from nucliadb.ingest.orm.utils import compute_paragraph_key
+from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
+from nucliadb_models.metadata import ResourceProcessingStatus
+from nucliadb_protos import utils_pb2
 from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
-from nucliadb_protos.noderesources_pb2 import ParagraphMetadata
+from nucliadb_protos.noderesources_pb2 import (
+    ParagraphMetadata,
+    Representation,
+    ResourceID,
+)
 from nucliadb_protos.noderesources_pb2 import Position as TextPosition
-from nucliadb_protos.noderesources_pb2 import Representation
 from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
-from nucliadb_protos.noderesources_pb2 import ResourceID
 from nucliadb_protos.resources_pb2 import (
     Basic,
     ExtractedText,
+    FieldAuthor,
     FieldComputedMetadata,
-    FieldKeywordset,
     FieldMetadata,
     Metadata,
     Origin,
@@ -40,25 +48,10 @@ from nucliadb_protos.resources_pb2 import (
     UserFieldMetadata,
     UserMetadata,
 )
-from nucliadb_protos.utils_pb2 import Relation, RelationNode, VectorObject
-
-from nucliadb.ingest import logger
-from nucliadb.ingest.orm.utils import compute_paragraph_key
-from nucliadb_models.labels import BASE_LABELS, flatten_resource_labels
-from nucliadb_models.metadata import ResourceProcessingStatus
-from nucliadb_protos import utils_pb2
+from nucliadb_protos.utils_pb2 import Relation, RelationNode
 
 FilePagePositions = dict[int, tuple[int, int]]
 
-FIELD_PARAGRAPH_ID = "{rid}/{field_id}/{paragraph_start}-{paragraph_end}"
-SPLIT_FIELD_PARAGRAPH_ID = (
-    "{rid}/{field_id}/{subfield_id}/{paragraph_start}-{paragraph_end}"
-)
-FIELD_VECTOR_ID = "{rid}/{field_id}/{index}/{vector_start}-{vector_end}"
-SPLIT_FIELD_VECTOR_ID = (
-    "{rid}/{field_id}/{subfield_id}/{index}/{vector_start}-{vector_end}"
-)
-
 METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
     Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
     Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
@@ -79,7 +72,7 @@ class ResourceBrain:
         self.rid = rid
         ridobj = ResourceID(uuid=rid)
         self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
-        self.labels: dict[str, list[str]] = deepcopy(BASE_LABELS)
+        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
 
     def apply_field_text(self, field_key: str, text: str):
         self.brain.texts[field_key].text = text
@@ -104,28 +97,30 @@ class ResourceBrain:
         self,
         field_key: str,
         metadata: FieldComputedMetadata,
-        replace_field: list[str],
-        replace_splits: dict[str, list[str]],
         page_positions: Optional[FilePagePositions],
         extracted_text: Optional[ExtractedText],
         basic_user_field_metadata: Optional[UserFieldMetadata] = None,
+        *,
+        replace_field: bool = False,
     ):
         # To check for duplicate paragraphs
         unique_paragraphs: set[str] = set()
 
         # Expose also user classifications
-        paragraph_classifications = self._get_paragraph_user_classifications(
+        user_paragraph_classifications = self._get_paragraph_user_classifications(
            basic_user_field_metadata
        )
 
         # We should set paragraphs and labels
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         for subfield, metadata_split in metadata.split_metadata.items():
+            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+
             # For each split of this field
             for index, paragraph in enumerate(metadata_split.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
 
-                denied_classifications = paragraph_classifications.denied.get(key, [])
+                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
                 position = TextPosition(
                     index=index,
                     start=paragraph.start,
@@ -157,9 +152,8 @@ class ResourceBrain:
                     index=index,
                     repeated_in_field=is_paragraph_repeated_in_field(
                         paragraph,
-                        extracted_text,
+                        extracted_text_str,
                         unique_paragraphs,
-                        split=subfield,
                     ),
                     metadata=ParagraphMetadata(
                         position=position,
@@ -167,22 +161,22 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
-                p.labels.append(
-                    f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_labels = {paragraph_kind_label}
+                paragraph_labels.update(
+                    f"/l/{classification.labelset}/{classification.label}"
+                    for classification in paragraph.classifications
                 )
-                for classification in paragraph.classifications:
-                    label = f"/l/{classification.labelset}/{classification.label}"
-                    if label not in denied_classifications:
-                        p.labels.append(label)
-
-                # Add user annotated labels to paragraphs
-                extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+                paragraph_labels.difference_update(denied_classifications)
+                p.labels.extend(list(paragraph_labels))
 
                 self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        extracted_text_str = extracted_text.text if extracted_text else None
         for index, paragraph in enumerate(metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            denied_classifications = paragraph_classifications.denied.get(key, [])
+            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
             position = TextPosition(
                 index=index,
                 start=paragraph.start,
@@ -212,7 +206,7 @@ class ResourceBrain:
                 field=field_key,
                 index=index,
                 repeated_in_field=is_paragraph_repeated_in_field(
-                    paragraph, extracted_text, unique_paragraphs
+                    paragraph, extracted_text_str, unique_paragraphs
                 ),
                 metadata=ParagraphMetadata(
                     position=position,
@@ -220,72 +214,59 @@ class ResourceBrain:
                     representation=representation,
                 ),
             )
-            p.labels.append(
-                f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_labels = {paragraph_kind_label}
+            paragraph_labels.update(
+                f"/l/{classification.labelset}/{classification.label}"
+                for classification in paragraph.classifications
             )
-
-            for classification in paragraph.classifications:
-                label = f"/l/{classification.labelset}/{classification.label}"
-                if label not in denied_classifications:
-                    p.labels.append(label)
-
-            # Add user annotated labels to paragraphs
-            extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+            paragraph_labels.difference_update(denied_classifications)
+            p.labels.extend(list(paragraph_labels))
 
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
         for relations in metadata.metadata.relations:
             for relation in relations.relations:
                 self.brain.relations.append(relation)
 
-        for split, sentences in replace_splits.items():
-            for sentence in sentences:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{split}/{sentence}"
-                )
-
-        for sentence_to_delete in replace_field:
-            self.brain.paragraphs_to_delete.append(
-                f"{self.rid}/{field_key}/{sentence_to_delete}"
-            )
-
-    def delete_metadata(self, field_key: str, metadata: FieldComputedMetadata):
-        for subfield, metadata_split in metadata.split_metadata.items():
-            for paragraph in metadata_split.paragraphs:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                )
-
-        for paragraph in metadata.metadata.paragraphs:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            )
+    def delete_field(self, field_key: str):
+        ftype, fkey = field_key.split("/")
+        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
+        self.brain.paragraphs_to_delete.append(full_field_id)
+        self.brain.sentences_to_delete.append(full_field_id)
 
     def apply_field_vectors(
         self,
         field_id: str,
         vo: utils_pb2.VectorObject,
         *,
+        vectorset: Optional[str] = None,
         replace_field: bool = False,
-        replace_splits: Optional[list[str]] = None,
         matryoshka_vector_dimension: Optional[int] = None,
     ):
-        replace_splits = replace_splits or []
-
+        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            _field_id = ids.FieldId(
+                rid=fid.rid,
+                type=fid.type,
+                key=fid.key,
+                subfield_id=subfield,
+            )
             # For each split of this field
             for index, vector in enumerate(vectors.vectors):
-                paragraph_key = SPLIT_FIELD_PARAGRAPH_ID.format(
-                    rid=self.rid,
-                    field_id=field_id,
-                    subfield_id=subfield,
+                paragraph_key = ids.ParagraphId(
+                    field_id=_field_id,
                     paragraph_start=vector.start_paragraph,
                     paragraph_end=vector.end_paragraph,
                 )
-                sentence_key = SPLIT_FIELD_VECTOR_ID.format(
-                    rid=self.rid,
-                    field_id=field_id,
-                    subfield_id=subfield,
+                sentence_key = ids.VectorId(
+                    field_id=_field_id,
                     index=index,
                     vector_start=vector.start,
                     vector_end=vector.end,
@@ -295,19 +276,23 @@ class ResourceBrain:
                     paragraph_key,
                     sentence_key,
                     vector,
+                    vectorset=vectorset,
                     matryoshka_vector_dimension=matryoshka_vector_dimension,
                 )
 
+        _field_id = ids.FieldId(
+            rid=fid.rid,
+            type=fid.type,
+            key=fid.key,
+        )
         for index, vector in enumerate(vo.vectors.vectors):
-            paragraph_key = FIELD_PARAGRAPH_ID.format(
-                rid=self.rid,
-                field_id=field_id,
+            paragraph_key = ids.ParagraphId(
+                field_id=_field_id,
                 paragraph_start=vector.start_paragraph,
                 paragraph_end=vector.end_paragraph,
             )
-            sentence_key = FIELD_VECTOR_ID.format(
-                rid=self.rid,
-                field_id=field_id,
+            sentence_key = ids.VectorId(
+                field_id=_field_id,
                 index=index,
                 vector_start=vector.start,
                 vector_end=vector.end,
@@ -317,26 +302,33 @@ class ResourceBrain:
                 paragraph_key,
                 sentence_key,
                 vector,
+                vectorset=vectorset,
                 matryoshka_vector_dimension=matryoshka_vector_dimension,
             )
 
-        for split in replace_splits:
-            self.brain.sentences_to_delete.append(f"{self.rid}/{field_id}/{split}")
-
         if replace_field:
-            self.brain.sentences_to_delete.append(f"{self.rid}/{field_id}")
+            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
+            if vectorset is None:
+                # DEPRECATED
+                self.brain.sentences_to_delete.append(full_field_id)
+            else:
+                self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
 
     def _apply_field_vector(
         self,
        field_id: str,
-        paragraph_key: str,
-        sentence_key: str,
+        paragraph_key: ids.ParagraphId,
+        sentence_key: ids.VectorId,
         vector: utils_pb2.Vector,
         *,
+        vectorset: Optional[str],
         matryoshka_vector_dimension: Optional[int] = None,
     ):
-        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key]
-        sentence_pb = paragraph_pb.sentences[sentence_key]
+        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
+        if vectorset:
+            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
+        else:
+            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
 
         sentence_pb.ClearField("vector")  # clear first to prevent duplicates
 
@@ -352,39 +344,18 @@ class ResourceBrain:
         sentence_pb.metadata.position.end = vector.end_paragraph
 
         # does it make sense to copy forward paragraph values here?
-        sentence_pb.metadata.position.page_number = (
-            paragraph_pb.metadata.position.page_number
-        )
+        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
         sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
 
         sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
 
-        sentence_pb.metadata.representation.file = (
-            paragraph_pb.metadata.representation.file
-        )
+        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
 
-        sentence_pb.metadata.representation.is_a_table = (
-            paragraph_pb.metadata.representation.is_a_table
-        )
+        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
 
         sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
 
-    def delete_vectors(self, field_key: str, vo: VectorObject):
-        # TODO: no need to iterate over all vectors, just delete the whole field
-        for subfield, vectors in vo.split_vectors.items():
-            for vector in vectors.vectors:
-                self.brain.sentences_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{vector.start}-{vector.end}"
-                )
-
-        for vector in vo.vectors.vectors:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{vector.start}-{vector.end}"
-            )
-
-    def set_processing_status(
-        self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]
-    ):
+    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -441,15 +412,11 @@ class ResourceBrain:
         self.brain.metadata.modified.CopyFrom(origin.modified)
 
     def _set_resource_relations(self, basic: Basic, origin: Optional[Origin]):
-        relationnodedocument = RelationNode(
-            value=self.rid, ntype=RelationNode.NodeType.RESOURCE
-        )
+        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
         if origin is not None:
             # origin contributors
             for contrib in origin.colaborators:
-                relationnodeuser = RelationNode(
-                    value=contrib, ntype=RelationNode.NodeType.USER
-                )
+                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
                 self.brain.relations.append(
                     Relation(
                         relation=Relation.COLAB,
@@ -478,115 +445,147 @@ class ResourceBrain:
     def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
         if origin is not None:
             if origin.source_id:
-                self.labels["o"] = [origin.source_id]
+                self.labels["o"] = {origin.source_id}
             # origin tags
             for tag in origin.tags:
-                self.labels["t"].append(tag)
+                self.labels["t"].add(tag)
             # origin source
             if origin.source_id != "":
-                self.labels["u"].append(f"s/{origin.source_id}")
+                self.labels["u"].add(f"s/{origin.source_id}")
 
             if origin.path:
-                self.labels["p"].append(origin.path.lstrip("/"))
+                self.labels["p"].add(origin.path.lstrip("/"))
 
             # origin contributors
             for contrib in origin.colaborators:
-                self.labels["u"].append(f"o/{contrib}")
+                self.labels["u"].add(f"o/{contrib}")
 
             for key, value in origin.metadata.items():
-                self.labels["m"].append(f"{key[:255]}/{value[:255]}")
+                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
 
         # icon
-        self.labels["n"].append(f"i/{basic.icon}")
+        self.labels["n"].add(f"i/{basic.icon}")
 
         # processing status
         status_tag = self.get_processing_status_tag(basic.metadata)
-        self.labels["n"].append(f"s/{status_tag}")
+        self.labels["n"].add(f"s/{status_tag}")
 
         # main language
         if basic.metadata.language:
-            self.labels["s"].append(f"p/{basic.metadata.language}")
+            self.labels["s"].add(f"p/{basic.metadata.language}")
 
         # all language
         for lang in basic.metadata.languages:
-            self.labels["s"].append(f"s/{lang}")
+            self.labels["s"].add(f"s/{lang}")
 
         # labels
         for classification in basic.usermetadata.classifications:
-            self.labels["l"].append(f"{classification.labelset}/{classification.label}")
+            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
 
-        self.compute_labels()
+        # hidden
+        if basic.hidden:
+            _, p1, p2 = LABEL_HIDDEN.split("/")
+            self.labels[p1].add(p2)
+
+        self.brain.ClearField("labels")
+        self.brain.labels.extend(flatten_resource_labels(self.labels))
 
     def process_field_metadata(
         self,
         field_key: str,
         metadata: FieldMetadata,
-        labels: dict[str, list[str]],
+        labels: dict[str, set[str]],
         relation_node_document: RelationNode,
-        user_canceled_labels: list[str],
+        user_canceled_labels: set[str],
     ):
+        if metadata.mime_type != "":
+            labels["mt"].add(metadata.mime_type)
+
+        base_classification_relation = Relation(
+            relation=Relation.ABOUT,
+            source=relation_node_document,
+            to=RelationNode(
+                ntype=RelationNode.NodeType.LABEL,
+            ),
+        )
        for classification in metadata.classifications:
             label = f"{classification.labelset}/{classification.label}"
             if label not in user_canceled_labels:
-                labels["l"].append(label)
-                relation_node_label = RelationNode(
-                    value=label,
-                    ntype=RelationNode.NodeType.LABEL,
-                )
-                self.brain.relations.append(
-                    Relation(
-                        relation=Relation.ABOUT,
-                        source=relation_node_document,
-                        to=relation_node_label,
-                    )
-                )
+                labels["l"].add(label)
+                relation = Relation()
+                relation.CopyFrom(base_classification_relation)
+                relation.to.value = label
+                self.brain.relations.append(relation)
+
+        # Data Augmentation + Processor entities
+        base_entity_relation = Relation(
+            relation=Relation.ENTITY,
+            source=relation_node_document,
+            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+        )
+        use_legacy_entities = True
+        for data_augmentation_task_id, entities in metadata.entities.items():
+            # If we recieved the entities from the processor here, we don't want to use the legacy entities
+            # TODO: Remove this when processor doesn't use this anymore
+            if data_augmentation_task_id == "processor":
+                use_legacy_entities = False
+
+            for ent in entities.entities:
+                entity_text = ent.text
+                entity_label = ent.label
+                # Seems like we don't care about where the entity is in the text
+                # entity_positions = entity.positions
+                labels["e"].add(
+                    f"{entity_label}/{entity_text}"
+                )  # Add data_augmentation_task_id as a prefix?
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity_text
+                relation.to.subtype = entity_label
+                self.brain.relations.append(relation)
 
-        for klass_entity, _ in metadata.positions.items():
-            labels["e"].append(klass_entity)
-            entity_array = klass_entity.split("/")
-            if len(entity_array) == 1:
+        # Legacy processor entities
+        # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+        def _parse_entity(klass_entity: str) -> tuple[str, str]:
+            try:
+                klass, entity = klass_entity.split("/", 1)
+                return klass, entity
+            except ValueError:
                 raise AttributeError(f"Entity should be with type {klass_entity}")
-            elif len(entity_array) > 1:
-                klass = entity_array[0]
-                entity = "/".join(entity_array[1:])
-                relation_node_entity = RelationNode(
-                    value=entity, ntype=RelationNode.NodeType.ENTITY, subtype=klass
-                )
-                rel = Relation(
-                    relation=Relation.ENTITY,
-                    source=relation_node_document,
-                    to=relation_node_entity,
-                )
-                self.brain.relations.append(rel)
 
-    def process_keywordset_fields(self, field_key: str, field: FieldKeywordset):
-        # all field keywords
-        if field:
-            for keyword in field.keywords:
-                self.labels["f"].append(f"{field_key}/{keyword.value}")
-                self.labels["fg"].append(keyword.value)
+        if use_legacy_entities:
+            for klass_entity in metadata.positions.keys():
+                labels["e"].add(klass_entity)
+                klass, entity = _parse_entity(klass_entity)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity
+                relation.to.subtype = klass
+                self.brain.relations.append(relation)
 
     def apply_field_labels(
         self,
         field_key: str,
         metadata: Optional[FieldComputedMetadata],
         uuid: str,
+        generated_by: FieldAuthor,
         basic_user_metadata: Optional[UserMetadata] = None,
         basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
     ):
+        user_canceled_labels: set[str] = set()
         if basic_user_metadata is not None:
-            user_canceled_labels = [
-                f"/l/{classification.labelset}/{classification.label}"
+            user_canceled_labels.update(
+                f"{classification.labelset}/{classification.label}"
                 for classification in basic_user_metadata.classifications
                 if classification.cancelled_by_user
-            ]
-        else:
-            user_canceled_labels = []
-
-        relation_node_resource = RelationNode(
-            value=uuid, ntype=RelationNode.NodeType.RESOURCE
-        )
-        labels: dict[str, list[str]] = {"l": [], "e": []}
+            )
+        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
+        labels: dict[str, set[str]] = {
+            "l": set(),  # classification labels
+            "e": set(),  # entities
+            "mt": set(),  # mime type
+            "g/da": set(),  # generated by
+        }
         if metadata is not None:
             for meta in metadata.split_metadata.values():
                 self.process_field_metadata(
@@ -607,7 +606,7 @@ class ResourceBrain:
         if basic_user_fieldmetadata is not None:
             for token in basic_user_fieldmetadata.token:
                 if token.cancelled_by_user is False:
-                    labels["e"].append(f"{token.klass}/{token.token}")
+                    labels["e"].add(f"{token.klass}/{token.token}")
                     relation_node_entity = RelationNode(
                         value=token.token,
                         ntype=RelationNode.NodeType.ENTITY,
@@ -635,36 +634,33 @@ class ResourceBrain:
                     self.brain.paragraphs[field_key].paragraphs[
                         paragraph_annotation.key
                     ].labels.append(label)
-        extend_unique(
-            self.brain.texts[field_key].labels, flatten_resource_labels(labels)  # type: ignore
-        )
-
-    def compute_labels(self):
-        extend_unique(self.brain.labels, flatten_resource_labels(self.labels))
 
+        if generated_by.WhichOneof("author") == "data_augmentation":
+            field_type, field_id = field_key.split("/")
+            da_task_id = ids.extract_data_augmentation_id(field_id)
+            if da_task_id is None:  # pragma: nocover
+                logger.warning(
+                    "Data augmentation field id has an unexpected format! Skipping label",
+                    extra={
+                        "rid": uuid,
+                        "field_id": field_id,
+                    },
+                )
+            else:
+                labels["g/da"].add(da_task_id)
 
-def get_paragraph_text(
-    extracted_text: ExtractedText, start: int, end: int, split: Optional[str] = None
-) -> str:
-    if split is not None:
-        text = extracted_text.split_text[split]
-    else:
-        text = extracted_text.text
-    return text[start:end]
+        self.brain.texts[field_key].labels.extend(flatten_resource_labels(labels))
 
 
 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text: Optional[ExtractedText],
+    extracted_text: Optional[str],
     unique_paragraphs: set[str],
-    split: Optional[str] = None,
 ) -> bool:
     if extracted_text is None:
         return False
 
-    paragraph_text = get_paragraph_text(
-        extracted_text, start=paragraph.start, end=paragraph.end, split=split
-    )
+    paragraph_text = extracted_text[paragraph.start : paragraph.end]
    if len(paragraph_text) == 0:
         return False
 
@@ -701,12 +697,3 @@ class ParagraphPages:
         if len(self._materialized) > 0:
             return self._materialized[-1]
         return 0
-
-
-def extend_unique(a: list, b: list):
-    """
-    Prevents extending with duplicate elements
-    """
-    for item in b:
-        if item not in a:
-            a.append(item)
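
The dominant refactor in this file is the replacement of the ad-hoc key format strings deleted above (FIELD_PARAGRAPH_ID, SPLIT_FIELD_PARAGRAPH_ID, FIELD_VECTOR_ID, SPLIT_FIELD_VECTOR_ID) with typed ID classes from the new nucliadb/common/ids.py module (entry 63 in the file list). Below is a minimal sketch of how those keys compose, assuming only the constructor keywords and .full() calls visible in the hunks above; the class bodies are illustrative guesses, not the module's actual implementation.

# Sketch of the typed index keys introduced in nucliadb/common/ids.py.
# Constructor keywords follow the call sites in the brain.py hunks above;
# the dataclass bodies and .full() formats are assumptions modeled on the
# removed format-string constants.
from dataclasses import dataclass
from typing import Optional


@dataclass
class FieldId:
    rid: str
    type: str
    key: str
    subfield_id: Optional[str] = None

    def full(self) -> str:
        # "rid/type/key", plus "/subfield" for split (e.g. conversation) fields
        base = f"{self.rid}/{self.type}/{self.key}"
        return f"{base}/{self.subfield_id}" if self.subfield_id else base


@dataclass
class ParagraphId:
    field_id: FieldId
    paragraph_start: int
    paragraph_end: int

    def full(self) -> str:
        return f"{self.field_id.full()}/{self.paragraph_start}-{self.paragraph_end}"


@dataclass
class VectorId:
    field_id: FieldId
    index: int
    vector_start: int
    vector_end: int

    def full(self) -> str:
        return f"{self.field_id.full()}/{self.index}/{self.vector_start}-{self.vector_end}"


# The serialized shape mirrors the removed constant
# SPLIT_FIELD_VECTOR_ID = "{rid}/{field_id}/{subfield_id}/{index}/{vector_start}-{vector_end}":
fid = FieldId(rid="rid", type="t", key="text1", subfield_id="msg1")
assert VectorId(field_id=fid, index=0, vector_start=0, vector_end=128).full() == "rid/t/text1/msg1/0/0-128"

Building keys through one class hierarchy instead of four parallel format strings means a single parser (ids.FieldId.from_string) and a single serializer (.full()) define the on-index key layout, which is what lets apply_field_vectors and _apply_field_vector pass structured IDs around and only flatten them at the protobuf boundary.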