nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/ingest/orm/brain.py
@@ -20,20 +20,27 @@
 import logging
 from copy import deepcopy
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional, Union
+from typing import Optional
 
-from google.protobuf.internal.containers import MessageMap
+from nucliadb.common import ids
+from nucliadb.ingest import logger
+from nucliadb.ingest.orm.utils import compute_paragraph_key
+from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
+from nucliadb_models.metadata import ResourceProcessingStatus
+from nucliadb_protos import utils_pb2
 from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
-from nucliadb_protos.noderesources_pb2 import ParagraphMetadata
+from nucliadb_protos.noderesources_pb2 import (
+    ParagraphMetadata,
+    Representation,
+    ResourceID,
+)
 from nucliadb_protos.noderesources_pb2 import Position as TextPosition
-from nucliadb_protos.noderesources_pb2 import Representation
 from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
-from nucliadb_protos.noderesources_pb2 import ResourceID
 from nucliadb_protos.resources_pb2 import (
     Basic,
     ExtractedText,
+    FieldAuthor,
     FieldComputedMetadata,
-    FieldKeywordset,
     FieldMetadata,
     Metadata,
     Origin,
@@ -41,28 +48,10 @@ from nucliadb_protos.resources_pb2 import (
     UserFieldMetadata,
     UserMetadata,
 )
-from nucliadb_protos.utils_pb2 import (
-    Relation,
-    RelationNode,
-    UserVectorSet,
-    UserVectorsList,
-    VectorObject,
-)
-
-from nucliadb.ingest import logger
-from nucliadb.ingest.orm.utils import compute_paragraph_key
-from nucliadb_models.labels import BASE_LABELS, flatten_resource_labels
-from nucliadb_models.metadata import ResourceProcessingStatus
-from nucliadb_protos import utils_pb2
-
-if TYPE_CHECKING:  # pragma: no cover
-    StatusValue = Union[Metadata.Status.V, int]
-else:
-    StatusValue = int
+from nucliadb_protos.utils_pb2 import Relation, RelationNode
 
 FilePagePositions = dict[int, tuple[int, int]]
 
-
 METADATA_STATUS_PB_TYPE_TO_NAME_MAP = {
     Metadata.Status.ERROR: ResourceProcessingStatus.ERROR.name,
     Metadata.Status.PROCESSED: ResourceProcessingStatus.PROCESSED.name,
@@ -83,7 +72,7 @@ class ResourceBrain:
         self.rid = rid
         ridobj = ResourceID(uuid=rid)
         self.brain: PBBrainResource = PBBrainResource(resource=ridobj)
-        self.labels: dict[str, list[str]] = deepcopy(BASE_LABELS)
+        self.labels: dict[str, set[str]] = deepcopy(BASE_LABELS)
 
     def apply_field_text(self, field_key: str, text: str):
         self.brain.texts[field_key].text = text
@@ -108,28 +97,30 @@ class ResourceBrain:
         self,
         field_key: str,
         metadata: FieldComputedMetadata,
-        replace_field: list[str],
-        replace_splits: dict[str, list[str]],
        page_positions: Optional[FilePagePositions],
         extracted_text: Optional[ExtractedText],
         basic_user_field_metadata: Optional[UserFieldMetadata] = None,
+        *,
+        replace_field: bool = False,
     ):
         # To check for duplicate paragraphs
         unique_paragraphs: set[str] = set()
 
         # Expose also user classifications
-        paragraph_classifications = self._get_paragraph_user_classifications(
+        user_paragraph_classifications = self._get_paragraph_user_classifications(
             basic_user_field_metadata
         )
 
         # We should set paragraphs and labels
         paragraph_pages = ParagraphPages(page_positions) if page_positions else None
         for subfield, metadata_split in metadata.split_metadata.items():
+            extracted_text_str = extracted_text.split_text[subfield] if extracted_text else None
+
             # For each split of this field
             for index, paragraph in enumerate(metadata_split.paragraphs):
                 key = f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
 
-                denied_classifications = paragraph_classifications.denied.get(key, [])
+                denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
                 position = TextPosition(
                     index=index,
                     start=paragraph.start,
@@ -161,9 +152,8 @@ class ResourceBrain:
                     index=index,
                     repeated_in_field=is_paragraph_repeated_in_field(
                         paragraph,
-                        extracted_text,
+                        extracted_text_str,
                         unique_paragraphs,
-                        split=subfield,
                     ),
                     metadata=ParagraphMetadata(
                         position=position,
@@ -171,22 +161,22 @@ class ResourceBrain:
                         representation=representation,
                     ),
                 )
-                p.labels.append(
-                    f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+                paragraph_labels = {paragraph_kind_label}
+                paragraph_labels.update(
+                    f"/l/{classification.labelset}/{classification.label}"
+                    for classification in paragraph.classifications
                 )
-                for classification in paragraph.classifications:
-                    label = f"/l/{classification.labelset}/{classification.label}"
-                    if label not in denied_classifications:
-                        p.labels.append(label)
-
-                # Add user annotated labels to paragraphs
-                extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+                paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+                paragraph_labels.difference_update(denied_classifications)
+                p.labels.extend(list(paragraph_labels))
 
                 self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        extracted_text_str = extracted_text.text if extracted_text else None
         for index, paragraph in enumerate(metadata.metadata.paragraphs):
             key = f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            denied_classifications = paragraph_classifications.denied.get(key, [])
+            denied_classifications = set(user_paragraph_classifications.denied.get(key, []))
             position = TextPosition(
                 index=index,
                 start=paragraph.start,
@@ -216,7 +206,7 @@ class ResourceBrain:
                 field=field_key,
                 index=index,
                 repeated_in_field=is_paragraph_repeated_in_field(
-                    paragraph, extracted_text, unique_paragraphs
+                    paragraph, extracted_text_str, unique_paragraphs
                 ),
                 metadata=ParagraphMetadata(
                     position=position,
@@ -224,161 +214,148 @@ class ResourceBrain:
                     representation=representation,
                 ),
             )
-            p.labels.append(
-                f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_kind_label = f"/k/{Paragraph.TypeParagraph.Name(paragraph.kind).lower()}"
+            paragraph_labels = {paragraph_kind_label}
+            paragraph_labels.update(
+                f"/l/{classification.labelset}/{classification.label}"
+                for classification in paragraph.classifications
             )
-
-            for classification in paragraph.classifications:
-                label = f"/l/{classification.labelset}/{classification.label}"
-                if label not in denied_classifications:
-                    p.labels.append(label)
-
-            # Add user annotated labels to paragraphs
-            extend_unique(p.labels, paragraph_classifications.valid.get(key, []))  # type: ignore
+            paragraph_labels.update(set(user_paragraph_classifications.valid.get(key, [])))
+            paragraph_labels.difference_update(denied_classifications)
+            p.labels.extend(list(paragraph_labels))
 
             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)
 
+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
         for relations in metadata.metadata.relations:
             for relation in relations.relations:
                 self.brain.relations.append(relation)
 
-        for split, sentences in replace_splits.items():
-            for sentence in sentences:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{split}/{sentence}"
-                )
-
-        for sentence_to_delete in replace_field:
-            self.brain.paragraphs_to_delete.append(
-                f"{self.rid}/{field_key}/{sentence_to_delete}"
-            )
-
-    def delete_metadata(self, field_key: str, metadata: FieldComputedMetadata):
-        for subfield, metadata_split in metadata.split_metadata.items():
-            for paragraph in metadata_split.paragraphs:
-                self.brain.paragraphs_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{paragraph.start}-{paragraph.end}"
-                )
-
-        for paragraph in metadata.metadata.paragraphs:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{paragraph.start}-{paragraph.end}"
-            )
-
-    def apply_user_vectors(
-        self,
-        field_key: str,
-        user_vectors: UserVectorSet,
-        vectors_to_delete: MessageMap[str, UserVectorsList],
-    ):
-        for vectorset, vectors in user_vectors.vectors.items():
-            for vector_id, user_vector in vectors.vectors.items():
-                self.brain.vectors[vectorset].vectors[
-                    f"{self.rid}/{field_key}/{vector_id}/{user_vector.start}-{user_vector.end}"
-                ].CopyFrom(user_vector)
-
-        for vectorset, vectorslist in vectors_to_delete.items():
-            for vector in vectorslist.vectors:
-                self.brain.vectors_to_delete[vectorset].vectors.append(
-                    f"{self.rid}/{field_key}/{vector}"
-                )
+    def delete_field(self, field_key: str):
+        ftype, fkey = field_key.split("/")
+        full_field_id = ids.FieldId(rid=self.rid, type=ftype, key=fkey).full()
+        self.brain.paragraphs_to_delete.append(full_field_id)
+        self.brain.sentences_to_delete.append(full_field_id)
 
     def apply_field_vectors(
         self,
-        field_key: str,
-        vo: VectorObject,
-        replace_field: bool,
-        replace_splits: list[str],
+        field_id: str,
+        vo: utils_pb2.VectorObject,
+        *,
+        vectorset: Optional[str] = None,
+        replace_field: bool = False,
+        matryoshka_vector_dimension: Optional[int] = None,
     ):
+        fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
+            _field_id = ids.FieldId(
+                rid=fid.rid,
+                type=fid.type,
+                key=fid.key,
+                subfield_id=subfield,
+            )
             # For each split of this field
-
             for index, vector in enumerate(vectors.vectors):
-                sparagraph = self.brain.paragraphs[field_key].paragraphs[
-                    f"{self.rid}/{field_key}/{subfield}/{vector.start_paragraph}-{vector.end_paragraph}"
-                ]
-                ssentence = sparagraph.sentences[
-                    f"{self.rid}/{field_key}/{subfield}/{index}/{vector.start}-{vector.end}"
-                ]
-
-                ssentence.ClearField("vector")  # clear first to prevent duplicates
-                ssentence.vector.extend(vector.vector)
-
-                # we only care about start/stop position of the paragraph for a given sentence here
-                # the key has the sentence position
-                ssentence.metadata.position.start = vector.start_paragraph
-                ssentence.metadata.position.end = vector.end_paragraph
-
-                ssentence.metadata.position.page_number = (
-                    sparagraph.metadata.position.page_number
-                )
-                ssentence.metadata.position.in_page = (
-                    sparagraph.metadata.position.in_page
-                )
-                ssentence.metadata.page_with_visual = (
-                    sparagraph.metadata.page_with_visual
+                paragraph_key = ids.ParagraphId(
+                    field_id=_field_id,
+                    paragraph_start=vector.start_paragraph,
+                    paragraph_end=vector.end_paragraph,
                 )
-
-                ssentence.metadata.representation.file = (
-                    sparagraph.metadata.representation.file
+                sentence_key = ids.VectorId(
+                    field_id=_field_id,
+                    index=index,
+                    vector_start=vector.start,
+                    vector_end=vector.end,
                 )
-                ssentence.metadata.representation.is_a_table = (
-                    sparagraph.metadata.representation.is_a_table
+                self._apply_field_vector(
+                    field_id,
+                    paragraph_key,
+                    sentence_key,
+                    vector,
+                    vectorset=vectorset,
+                    matryoshka_vector_dimension=matryoshka_vector_dimension,
                 )
-                ssentence.metadata.position.index = sparagraph.metadata.position.index
 
+        _field_id = ids.FieldId(
+            rid=fid.rid,
+            type=fid.type,
+            key=fid.key,
+        )
         for index, vector in enumerate(vo.vectors.vectors):
-            para_key = f"{self.rid}/{field_key}/{vector.start_paragraph}-{vector.end_paragraph}"
-            paragraph = self.brain.paragraphs[field_key].paragraphs[para_key]
-            sent_key = f"{self.rid}/{field_key}/{index}/{vector.start}-{vector.end}"
-            sentence = paragraph.sentences[sent_key]
-
-            sentence.ClearField("vector")  # clear first to prevent duplicates
-            sentence.vector.extend(vector.vector)
-
-            # we only care about start/stop position of the paragraph for a given sentence here
-            # the key has the sentence position
-            sentence.metadata.position.start = vector.start_paragraph
-            sentence.metadata.position.end = vector.end_paragraph
-
-            # does it make sense to copy forward paragraph values here?
-            sentence.metadata.position.page_number = (
-                paragraph.metadata.position.page_number
+            paragraph_key = ids.ParagraphId(
+                field_id=_field_id,
+                paragraph_start=vector.start_paragraph,
+                paragraph_end=vector.end_paragraph,
            )
-            sentence.metadata.position.in_page = paragraph.metadata.position.in_page
-
-            sentence.metadata.page_with_visual = paragraph.metadata.page_with_visual
-
-            sentence.metadata.representation.file = (
-                paragraph.metadata.representation.file
+            sentence_key = ids.VectorId(
+                field_id=_field_id,
+                index=index,
+                vector_start=vector.start,
+                vector_end=vector.end,
             )
-            sentence.metadata.representation.is_a_table = (
-                paragraph.metadata.representation.is_a_table
+            self._apply_field_vector(
+                field_id,
+                paragraph_key,
+                sentence_key,
+                vector,
+                vectorset=vectorset,
+                matryoshka_vector_dimension=matryoshka_vector_dimension,
             )
 
-            sentence.metadata.position.index = paragraph.metadata.position.index
+        if replace_field:
+            full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
+            if vectorset is None:
+                # DEPRECATED
+                self.brain.sentences_to_delete.append(full_field_id)
+            else:
+                self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)
+
+    def _apply_field_vector(
+        self,
+        field_id: str,
+        paragraph_key: ids.ParagraphId,
+        sentence_key: ids.VectorId,
+        vector: utils_pb2.Vector,
+        *,
+        vectorset: Optional[str],
+        matryoshka_vector_dimension: Optional[int] = None,
+    ):
+        paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
+        if vectorset:
+            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
+        else:
+            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
 
-        for split in replace_splits:
-            self.brain.sentences_to_delete.append(f"{self.rid}/{field_key}/{split}")
+        sentence_pb.ClearField("vector")  # clear first to prevent duplicates
 
-        if replace_field:
-            self.brain.sentences_to_delete.append(f"{self.rid}/{field_key}")
+        # cut vectors if a specific dimension is specified
+        if matryoshka_vector_dimension is not None:
+            sentence_pb.vector.extend(vector.vector[:matryoshka_vector_dimension])
+        else:
+            sentence_pb.vector.extend(vector.vector)
 
-    def delete_vectors(self, field_key: str, vo: VectorObject):
-        for subfield, vectors in vo.split_vectors.items():
-            for vector in vectors.vectors:
-                self.brain.sentences_to_delete.append(
-                    f"{self.rid}/{field_key}/{subfield}/{vector.start}-{vector.end}"
-                )
+        # we only care about start/stop position of the paragraph for a given sentence here
+        # the key has the sentence position
+        sentence_pb.metadata.position.start = vector.start_paragraph
+        sentence_pb.metadata.position.end = vector.end_paragraph
 
-        for vector in vo.vectors.vectors:
-            self.brain.sentences_to_delete.append(
-                f"{self.rid}/{field_key}/{vector.start}-{vector.end}"
-            )
+        # does it make sense to copy forward paragraph values here?
+        sentence_pb.metadata.position.page_number = paragraph_pb.metadata.position.page_number
+        sentence_pb.metadata.position.in_page = paragraph_pb.metadata.position.in_page
 
-    def set_processing_status(
-        self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]
-    ):
+        sentence_pb.metadata.page_with_visual = paragraph_pb.metadata.page_with_visual
+
+        sentence_pb.metadata.representation.file = paragraph_pb.metadata.representation.file
+
+        sentence_pb.metadata.representation.is_a_table = paragraph_pb.metadata.representation.is_a_table
+
+        sentence_pb.metadata.position.index = paragraph_pb.metadata.position.index
+
+    def set_processing_status(self, basic: Basic, previous_status: Optional[Metadata.Status.ValueType]):
         """
         We purposefully overwrite what we index as a status and DO NOT reflect
         actual status with what we index.
@@ -435,15 +412,11 @@ class ResourceBrain:
         self.brain.metadata.modified.CopyFrom(origin.modified)
 
     def _set_resource_relations(self, basic: Basic, origin: Optional[Origin]):
-        relationnodedocument = RelationNode(
-            value=self.rid, ntype=RelationNode.NodeType.RESOURCE
-        )
+        relationnodedocument = RelationNode(value=self.rid, ntype=RelationNode.NodeType.RESOURCE)
         if origin is not None:
             # origin contributors
             for contrib in origin.colaborators:
-                relationnodeuser = RelationNode(
-                    value=contrib, ntype=RelationNode.NodeType.USER
-                )
+                relationnodeuser = RelationNode(value=contrib, ntype=RelationNode.NodeType.USER)
                 self.brain.relations.append(
                     Relation(
                         relation=Relation.COLAB,
@@ -472,115 +445,147 @@ class ResourceBrain:
     def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):
         if origin is not None:
             if origin.source_id:
-                self.labels["o"] = [origin.source_id]
+                self.labels["o"] = {origin.source_id}
             # origin tags
             for tag in origin.tags:
-                self.labels["t"].append(tag)
+                self.labels["t"].add(tag)
             # origin source
             if origin.source_id != "":
-                self.labels["u"].append(f"s/{origin.source_id}")
+                self.labels["u"].add(f"s/{origin.source_id}")
 
             if origin.path:
-                self.labels["p"].append(origin.path.lstrip("/"))
+                self.labels["p"].add(origin.path.lstrip("/"))
 
             # origin contributors
             for contrib in origin.colaborators:
-                self.labels["u"].append(f"o/{contrib}")
+                self.labels["u"].add(f"o/{contrib}")
 
             for key, value in origin.metadata.items():
-                self.labels["m"].append(f"{key[:255]}/{value[:255]}")
+                self.labels["m"].add(f"{key[:255]}/{value[:255]}")
 
         # icon
-        self.labels["n"].append(f"i/{basic.icon}")
+        self.labels["n"].add(f"i/{basic.icon}")
 
         # processing status
         status_tag = self.get_processing_status_tag(basic.metadata)
-        self.labels["n"].append(f"s/{status_tag}")
+        self.labels["n"].add(f"s/{status_tag}")
 
         # main language
         if basic.metadata.language:
-            self.labels["s"].append(f"p/{basic.metadata.language}")
+            self.labels["s"].add(f"p/{basic.metadata.language}")
 
         # all language
         for lang in basic.metadata.languages:
-            self.labels["s"].append(f"s/{lang}")
+            self.labels["s"].add(f"s/{lang}")
 
         # labels
         for classification in basic.usermetadata.classifications:
-            self.labels["l"].append(f"{classification.labelset}/{classification.label}")
+            self.labels["l"].add(f"{classification.labelset}/{classification.label}")
+
+        # hidden
+        if basic.hidden:
+            _, p1, p2 = LABEL_HIDDEN.split("/")
+            self.labels[p1].add(p2)
 
-        self.compute_labels()
+        self.brain.ClearField("labels")
+        self.brain.labels.extend(flatten_resource_labels(self.labels))
 
     def process_field_metadata(
         self,
         field_key: str,
        metadata: FieldMetadata,
-        labels: dict[str, list[str]],
+        labels: dict[str, set[str]],
         relation_node_document: RelationNode,
-        user_canceled_labels: list[str],
+        user_canceled_labels: set[str],
    ):
+        if metadata.mime_type != "":
+            labels["mt"].add(metadata.mime_type)
+
+        base_classification_relation = Relation(
+            relation=Relation.ABOUT,
+            source=relation_node_document,
+            to=RelationNode(
+                ntype=RelationNode.NodeType.LABEL,
+            ),
+        )
         for classification in metadata.classifications:
             label = f"{classification.labelset}/{classification.label}"
             if label not in user_canceled_labels:
-                labels["l"].append(label)
-                relation_node_label = RelationNode(
-                    value=label,
-                    ntype=RelationNode.NodeType.LABEL,
-                )
-                self.brain.relations.append(
-                    Relation(
-                        relation=Relation.ABOUT,
-                        source=relation_node_document,
-                        to=relation_node_label,
-                    )
-                )
+                labels["l"].add(label)
+                relation = Relation()
+                relation.CopyFrom(base_classification_relation)
+                relation.to.value = label
+                self.brain.relations.append(relation)
+
+        # Data Augmentation + Processor entities
+        base_entity_relation = Relation(
+            relation=Relation.ENTITY,
+            source=relation_node_document,
+            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+        )
+        use_legacy_entities = True
+        for data_augmentation_task_id, entities in metadata.entities.items():
+            # If we recieved the entities from the processor here, we don't want to use the legacy entities
+            # TODO: Remove this when processor doesn't use this anymore
+            if data_augmentation_task_id == "processor":
+                use_legacy_entities = False
+
+            for ent in entities.entities:
+                entity_text = ent.text
+                entity_label = ent.label
+                # Seems like we don't care about where the entity is in the text
+                # entity_positions = entity.positions
+                labels["e"].add(
+                    f"{entity_label}/{entity_text}"
+                )  # Add data_augmentation_task_id as a prefix?
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity_text
+                relation.to.subtype = entity_label
+                self.brain.relations.append(relation)
 
-        for klass_entity, _ in metadata.positions.items():
-            labels["e"].append(klass_entity)
-            entity_array = klass_entity.split("/")
-            if len(entity_array) == 1:
+        # Legacy processor entities
+        # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+        def _parse_entity(klass_entity: str) -> tuple[str, str]:
+            try:
+                klass, entity = klass_entity.split("/", 1)
+                return klass, entity
+            except ValueError:
                raise AttributeError(f"Entity should be with type {klass_entity}")
-            elif len(entity_array) > 1:
-                klass = entity_array[0]
-                entity = "/".join(entity_array[1:])
-                relation_node_entity = RelationNode(
-                    value=entity, ntype=RelationNode.NodeType.ENTITY, subtype=klass
-                )
-                rel = Relation(
-                    relation=Relation.ENTITY,
-                    source=relation_node_document,
-                    to=relation_node_entity,
-                )
-                self.brain.relations.append(rel)
 
-    def process_keywordset_fields(self, field_key: str, field: FieldKeywordset):
-        # all field keywords
-        if field:
-            for keyword in field.keywords:
-                self.labels["f"].append(f"{field_key}/{keyword.value}")
-                self.labels["fg"].append(keyword.value)
+        if use_legacy_entities:
+            for klass_entity in metadata.positions.keys():
+                labels["e"].add(klass_entity)
+                klass, entity = _parse_entity(klass_entity)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity
+                relation.to.subtype = klass
+                self.brain.relations.append(relation)
 
     def apply_field_labels(
         self,
         field_key: str,
         metadata: Optional[FieldComputedMetadata],
         uuid: str,
+        generated_by: FieldAuthor,
         basic_user_metadata: Optional[UserMetadata] = None,
         basic_user_fieldmetadata: Optional[UserFieldMetadata] = None,
     ):
+        user_canceled_labels: set[str] = set()
         if basic_user_metadata is not None:
-            user_canceled_labels = [
-                f"/l/{classification.labelset}/{classification.label}"
+            user_canceled_labels.update(
+                f"{classification.labelset}/{classification.label}"
                 for classification in basic_user_metadata.classifications
                 if classification.cancelled_by_user
-            ]
-        else:
-            user_canceled_labels = []
-
-        relation_node_resource = RelationNode(
-            value=uuid, ntype=RelationNode.NodeType.RESOURCE
-        )
-        labels: dict[str, list[str]] = {"l": [], "e": []}
+            )
+        relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
+        labels: dict[str, set[str]] = {
+            "l": set(),  # classification labels
+            "e": set(),  # entities
+            "mt": set(),  # mime type
+            "g/da": set(),  # generated by
+        }
         if metadata is not None:
            for meta in metadata.split_metadata.values():
                 self.process_field_metadata(
@@ -601,7 +606,7 @@ class ResourceBrain:
         if basic_user_fieldmetadata is not None:
             for token in basic_user_fieldmetadata.token:
                 if token.cancelled_by_user is False:
-                    labels["e"].append(f"{token.klass}/{token.token}")
+                    labels["e"].add(f"{token.klass}/{token.token}")
                     relation_node_entity = RelationNode(
                         value=token.token,
                         ntype=RelationNode.NodeType.ENTITY,
@@ -629,36 +634,33 @@ class ResourceBrain:
                     self.brain.paragraphs[field_key].paragraphs[
                         paragraph_annotation.key
                     ].labels.append(label)
-        extend_unique(
-            self.brain.texts[field_key].labels, flatten_resource_labels(labels)  # type: ignore
-        )
 
-    def compute_labels(self):
-        extend_unique(self.brain.labels, flatten_resource_labels(self.labels))
+        if generated_by.WhichOneof("author") == "data_augmentation":
+            field_type, field_id = field_key.split("/")
+            da_task_id = ids.extract_data_augmentation_id(field_id)
+            if da_task_id is None:  # pragma: nocover
+                logger.warning(
+                    "Data augmentation field id has an unexpected format! Skipping label",
+                    extra={
+                        "rid": uuid,
+                        "field_id": field_id,
+                    },
+                )
+            else:
+                labels["g/da"].add(da_task_id)
 
-
-def get_paragraph_text(
-    extracted_text: ExtractedText, start: int, end: int, split: Optional[str] = None
-) -> str:
-    if split is not None:
-        text = extracted_text.split_text[split]
-    else:
-        text = extracted_text.text
-    return text[start:end]
+        self.brain.texts[field_key].labels.extend(flatten_resource_labels(labels))
 
 
 def is_paragraph_repeated_in_field(
     paragraph: Paragraph,
-    extracted_text: Optional[ExtractedText],
+    extracted_text: Optional[str],
     unique_paragraphs: set[str],
-    split: Optional[str] = None,
 ) -> bool:
     if extracted_text is None:
         return False
 
-    paragraph_text = get_paragraph_text(
-        extracted_text, start=paragraph.start, end=paragraph.end, split=split
-    )
+    paragraph_text = extracted_text[paragraph.start : paragraph.end]
     if len(paragraph_text) == 0:
         return False
 
@@ -695,12 +697,3 @@ class ParagraphPages:
         if len(self._materialized) > 0:
             return self._materialized[-1]
         return 0
-
-
-def extend_unique(a: list, b: list):
-    """
-    Prevents extending with duplicate elements
-    """
-    for item in b:
-        if item not in a:
-            a.append(item)
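
The new _apply_field_vector helper above truncates each stored sentence vector when matryoshka_vector_dimension is set: embeddings trained with matryoshka representation learning concentrate the most useful information in their leading dimensions, so indexing only a prefix of the vector shrinks the index at a small cost in recall. A minimal standalone sketch of that truncation step (illustrative names only, not nucliadb's actual types or API):

def truncate_vector(vector: list[float], matryoshka_dimension: int | None) -> list[float]:
    # Keep only the leading dimensions when a matryoshka dimension is configured;
    # otherwise keep the full embedding, mirroring the branch in _apply_field_vector.
    if matryoshka_dimension is not None:
        return vector[:matryoshka_dimension]
    return vector

full = [0.12, -0.40, 0.33, 0.08, 0.51, -0.27]
assert truncate_vector(full, 4) == [0.12, -0.40, 0.33, 0.08]
assert truncate_vector(full, None) == full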