nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,37 +18,48 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
- from typing import Any, Iterator, Optional, cast
21
+ from typing import Iterable, Union
22
22
 
23
- from nucliadb_protos.nodereader_pb2 import (
24
- DocumentScored,
25
- EntitiesSubgraphRequest,
26
- ParagraphResult,
27
- SearchResponse,
28
- )
29
-
30
- from nucliadb.common.maindb.driver import Transaction
31
- from nucliadb.ingest.serialize import managed_serialize
32
- from nucliadb.middleware.transaction import get_read_only_transaction
23
+ from nucliadb.common.external_index_providers.base import TextBlockMatch
24
+ from nucliadb.common.ids import ParagraphId, VectorId
33
25
  from nucliadb.search import SERVICE_NAME, logger
34
- from nucliadb.search.search.cache import get_resource_cache
26
+ from nucliadb.search.search.cut import cut_page
27
+ from nucliadb.search.search.hydrator import (
28
+ ResourceHydrationOptions,
29
+ TextBlockHydrationOptions,
30
+ hydrate_resource_metadata,
31
+ hydrate_text_block,
32
+ text_block_to_find_paragraph,
33
+ )
35
34
  from nucliadb.search.search.merge import merge_relations_results
35
+ from nucliadb.search.search.rank_fusion import RankFusionAlgorithm
36
+ from nucliadb.search.search.rerankers import (
37
+ RerankableItem,
38
+ Reranker,
39
+ RerankingOptions,
40
+ )
36
41
  from nucliadb_models.common import FieldTypeName
37
- from nucliadb_models.resource import ExtractedDataTypeName
42
+ from nucliadb_models.resource import ExtractedDataTypeName, Resource
38
43
  from nucliadb_models.search import (
39
44
  SCORE_TYPE,
40
45
  FindField,
41
- FindParagraph,
42
46
  FindResource,
43
47
  KnowledgeboxFindResults,
44
48
  MinScore,
45
49
  ResourceProperties,
46
- TempFindParagraph,
47
50
  TextPosition,
48
51
  )
52
+ from nucliadb_protos.nodereader_pb2 import (
53
+ DocumentScored,
54
+ EntitiesSubgraphRequest,
55
+ ParagraphResult,
56
+ ParagraphSearchResponse,
57
+ RelationSearchResponse,
58
+ SearchResponse,
59
+ VectorSearchResponse,
60
+ )
49
61
  from nucliadb_telemetry import metrics
50
62
 
51
- from . import paragraphs
52
63
  from .metrics import merge_observer
53
64
 
54
65
  FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
@@ -57,407 +68,413 @@ FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
57
68
  )
58
69
 
59
70
 
60
- def _round(x: float) -> float:
61
- return round(x, ndigits=3)
62
-
63
-
64
- @merge_observer.wrap({"type": "set_text_value"})
65
- async def set_text_value(
71
+ @merge_observer.wrap({"type": "find_merge"})
72
+ async def build_find_response(
73
+ search_responses: list[SearchResponse],
74
+ *,
66
75
  kbid: str,
67
- result_paragraph: TempFindParagraph,
68
- max_operations: asyncio.Semaphore,
76
+ query: str,
77
+ relation_subgraph_query: EntitiesSubgraphRequest,
78
+ top_k: int,
79
+ min_score_bm25: float,
80
+ min_score_semantic: float,
81
+ rank_fusion_algorithm: RankFusionAlgorithm,
82
+ reranker: Reranker,
83
+ show: list[ResourceProperties] = [],
84
+ extracted: list[ExtractedDataTypeName] = [],
85
+ field_type_filter: list[FieldTypeName] = [],
69
86
  highlight: bool = False,
70
- ematches: Optional[list[str]] = None,
71
- extracted_text_cache: Optional[paragraphs.ExtractedTextCache] = None,
72
- ):
73
- async with max_operations:
74
- assert result_paragraph.paragraph
75
- assert result_paragraph.paragraph.position
76
- result_paragraph.paragraph.text = await paragraphs.get_paragraph_text(
77
- kbid=kbid,
78
- rid=result_paragraph.rid,
79
- field=result_paragraph.field,
80
- start=result_paragraph.paragraph.position.start,
81
- end=result_paragraph.paragraph.position.end,
82
- split=result_paragraph.split,
83
- highlight=highlight,
84
- ematches=ematches,
85
- matches=[], # TODO
86
- extracted_text_cache=extracted_text_cache,
87
+ ) -> KnowledgeboxFindResults:
88
+ # merge
89
+ search_response = merge_shard_responses(search_responses)
90
+
91
+ keyword_results = keyword_results_to_text_block_matches(search_response.paragraph.results)
92
+ semantic_results = semantic_results_to_text_block_matches(
93
+ filter(
94
+ lambda x: x.score >= min_score_semantic,
95
+ search_response.vector.documents,
87
96
  )
97
+ )
98
+
99
+ merged_text_blocks: list[TextBlockMatch] = rank_fusion_algorithm.fuse(
100
+ keyword_results, semantic_results
101
+ )
102
+
103
+ # cut
104
+ # we assume pagination + predict reranker is forbidden and has been already
105
+ # enforced/validated by the query parsing.
106
+ if reranker.needs_extra_results:
107
+ assert reranker.window is not None, "Reranker definition must enforce this condition"
108
+ text_blocks_page, next_page = cut_page(merged_text_blocks, reranker.window)
109
+ else:
110
+ text_blocks_page, next_page = cut_page(merged_text_blocks, top_k)
111
+
112
+ # hydrate and rerank
113
+ resource_hydration_options = ResourceHydrationOptions(
114
+ show=show, extracted=extracted, field_type_filter=field_type_filter
115
+ )
116
+ text_block_hydration_options = TextBlockHydrationOptions(
117
+ highlight=highlight,
118
+ ematches=search_response.paragraph.ematches, # type: ignore
119
+ )
120
+ reranking_options = RerankingOptions(kbid=kbid, query=query)
121
+ text_blocks, resources, best_matches = await hydrate_and_rerank(
122
+ text_blocks_page,
123
+ kbid,
124
+ resource_hydration_options=resource_hydration_options,
125
+ text_block_hydration_options=text_block_hydration_options,
126
+ reranker=reranker,
127
+ reranking_options=reranking_options,
128
+ top_k=top_k,
129
+ )
130
+
131
+ # build relations graph
132
+ relations = await merge_relations_results([search_response.relation], relation_subgraph_query)
133
+
134
+ # compose response
135
+ find_resources = compose_find_resources(text_blocks, resources)
136
+
137
+ next_page = search_response.paragraph.next_page or next_page
138
+ total_paragraphs = search_response.paragraph.total
139
+
140
+ find_results = KnowledgeboxFindResults(
141
+ query=query,
142
+ resources=find_resources,
143
+ best_matches=best_matches,
144
+ relations=relations,
145
+ total=total_paragraphs,
146
+ page_number=0, # Bw/c with pagination
147
+ page_size=top_k,
148
+ next_page=next_page,
149
+ min_score=MinScore(bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)),
150
+ )
151
+ return find_results
152
+
153
+
154
def merge_shard_responses(
    responses: list[SearchResponse],
) -> SearchResponse:
    """Collapse per-shard search responses into one, as if the index had no
    shards.

    ATTENTION! This is a partial merge: only the keyword (paragraph), semantic
    (vector) and relation parts are combined, which is what a /find response
    needs.

    """
    # Split every shard response into its per-index parts, keeping shard order
    keyword_parts = [shard_response.paragraph for shard_response in responses]
    semantic_parts = [shard_response.vector for shard_response in responses]
    relation_parts = [shard_response.relation for shard_response in responses]

    return SearchResponse(
        paragraph=merge_shards_keyword_responses(keyword_parts),
        vector=merge_shards_semantic_responses(semantic_parts),
        relation=merge_shards_relation_responses(relation_parts),
    )
178
+
179
+
180
def merge_shards_keyword_responses(
    keyword_responses: list[ParagraphSearchResponse],
) -> ParagraphSearchResponse:
    """Combine the keyword (paragraph) responses of every shard into one.

    ATTENTION! This is a partial merge covering only what /find needs: the
    query, the next page flag, the total count, the results and the exact
    matches.

    """
    combined = ParagraphSearchResponse()
    for shard_response in keyword_responses:
        # the query of the last shard seen is the one that is kept
        combined.query = shard_response.query
        # once any shard reports a next page, the flag stays set
        if not combined.next_page:
            combined.next_page = shard_response.next_page
        combined.total = combined.total + shard_response.total
        combined.results.extend(shard_response.results)
        combined.ematches.extend(shard_response.ematches)

    return combined
199
+
200
+
201
def merge_shards_semantic_responses(
    semantic_responses: list[VectorSearchResponse],
) -> VectorSearchResponse:
    """Combine the semantic (vector) responses of every shard into one.

    ATTENTION! This is a partial merge: only the scored documents are
    gathered, in shard order, which is what /find needs.

    """
    combined = VectorSearchResponse()
    for shard_response in semantic_responses:
        shard_documents = shard_response.documents
        combined.documents.extend(shard_documents)

    return combined
216
+
217
+
218
def merge_shards_relation_responses(
    relation_responses: list[RelationSearchResponse],
) -> RelationSearchResponse:
    """Combine the relation responses of every shard into one.

    Prefix nodes and subgraph relations are concatenated in shard order; no
    other field of the response is merged.

    """
    combined = RelationSearchResponse()
    for shard_response in relation_responses:
        shard_nodes = shard_response.prefix.nodes
        shard_relations = shard_response.subgraph.relations
        combined.prefix.nodes.extend(shard_nodes)
        combined.subgraph.relations.extend(shard_relations)

    return combined
227
+
228
+
229
def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
    """Build a BM25-scored text block match from a keyword index result.

    The match is flagged as fuzzy when the result carries any entry in
    `matches`. `order` and `text` are placeholders at this stage.

    """
    metadata = item.metadata
    indexed_position = metadata.position
    representation = metadata.representation

    position = TextPosition(
        page_number=indexed_position.page_number,
        index=indexed_position.index,
        start=item.start,
        end=item.end,
        start_seconds=list(indexed_position.start_seconds),
        end_seconds=list(indexed_position.end_seconds),
    )

    return TextBlockMatch(
        paragraph_id=ParagraphId.from_string(item.paragraph),
        score=item.score.bm25,
        score_type=SCORE_TYPE.BM25,
        order=0,  # NOTE: this will be filled later
        text="",  # NOTE: this will be filled later too
        position=position,
        # XXX: we should split labels
        field_labels=[],
        paragraph_labels=list(item.labels),
        fuzzy_search=len(item.matches) > 0,
        is_a_table=representation.is_a_table,
        representation_file=representation.file,
        page_with_visual=metadata.page_with_visual,
    )
253
+
254
+
255
def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
    """Convert every keyword index result into a text block match, keeping
    the input order."""
    return list(map(keyword_result_to_text_block_match, items))
257
+
258
+
259
class InvalidDocId(Exception):
    """Raised while parsing an invalid id coming from semantic search.

    The offending id is available as `invalid_vector_id`; the exception
    message is "Invalid vector ID: <id>".

    """

    def __init__(self, invalid_vector_id: str):
        self.invalid_vector_id = invalid_vector_id
        super().__init__(f"Invalid vector ID: {invalid_vector_id}")

    def __reduce__(self):
        # The default exception copy/pickle protocol re-invokes __init__ with
        # `self.args`, i.e. with the already formatted message, which would
        # prefix the message a second time. Rebuild from the raw id instead.
        return (type(self), (self.invalid_vector_id,))
265
+
266
+
267
def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
    """Build a vector-scored text block match from a semantic index result.

    The paragraph id and the start/end offsets are derived from the vector id
    carried by the result. `order` and `text` are placeholders at this stage.

    Raises:
        InvalidDocId: if the vector id of the result cannot be parsed. The
            parsing error is attached as the exception cause.

    """
    try:
        vector_id = VectorId.from_string(item.doc_id.id)
    except (IndexError, ValueError) as exc:
        # chain explicitly so the original parsing error is reported as the
        # direct cause instead of as an error raised while handling another
        raise InvalidDocId(item.doc_id.id) from exc

    return TextBlockMatch(
        paragraph_id=ParagraphId.from_vector_id(vector_id),
        score=item.score,
        score_type=SCORE_TYPE.VECTOR,
        order=0,  # NOTE: this will be filled later
        text="",  # NOTE: this will be filled later too
        position=TextPosition(
            page_number=item.metadata.position.page_number,
            index=item.metadata.position.index,
            start=vector_id.vector_start,
            end=vector_id.vector_end,
            start_seconds=list(item.metadata.position.start_seconds),
            end_seconds=list(item.metadata.position.end_seconds),
        ),
        # XXX: we should split labels
        field_labels=[],
        paragraph_labels=list(item.labels),
        fuzzy_search=False,  # semantic search doesn't have fuzziness
        is_a_table=item.metadata.representation.is_a_table,
        representation_file=item.metadata.representation.file,
        page_with_visual=item.metadata.page_with_visual,
    )
295
+
296
+
297
def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
    """Convert semantic index results into text block matches, keeping the
    input order.

    Results whose id cannot be parsed are logged as a warning and left out.

    """
    converted: list[TextBlockMatch] = []
    for scored_document in items:
        try:
            match = semantic_result_to_text_block_match(scored_document)
        except InvalidDocId as exc:
            logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
        else:
            converted.append(match)
    return converted
307
+
308
+
309
@merge_observer.wrap({"type": "hydrate_and_rerank"})
async def hydrate_and_rerank(
    text_blocks: Iterable[TextBlockMatch],
    kbid: str,
    *,
    resource_hydration_options: ResourceHydrationOptions,
    text_block_hydration_options: TextBlockHydrationOptions,
    reranker: Reranker,
    reranking_options: RerankingOptions,
    top_k: int,
) -> tuple[list[TextBlockMatch], list[Resource], list[str]]:
    """Given a list of text blocks from a retrieval operation, hydrate and
    rerank the results.

    This function returns either the entire list or a subset of updated
    (hydrated and reranked) text blocks and their corresponding resource
    metadata. It also returns an ordered list of best matches.

    Args:
        text_blocks: matches to hydrate and rerank.
        kbid: knowledge box the matches belong to.
        resource_hydration_options: forwarded to `hydrate_resource_metadata`.
        text_block_hydration_options: forwarded to `hydrate_text_block`.
        reranker: reranks the hydrated text blocks. Its `needs_extra_results`
            flag decides whether resource metadata is fetched together with
            the text blocks or only after the results have been cut.
        reranking_options: forwarded to `reranker.rerank`.
        top_k: maximum number of reranked results kept.

    Returns:
        A tuple with the kept text blocks (score, score type and order
        updated, sorted by descending score), the hydrated resources that
        were found, and the paragraph ids of the kept text blocks in the same
        order.

    """
    # every hydration task shares this semaphore as its concurrency control
    max_operations = asyncio.Semaphore(50)

    resource_hydration_ops: dict[str, asyncio.Task] = {}

    def schedule_resource_hydration(rid: str) -> None:
        # Create the metadata hydration task of a resource, at most once
        if rid in resource_hydration_ops:
            return
        resource_hydration_ops[rid] = asyncio.create_task(
            hydrate_resource_metadata(
                kbid,
                rid,
                options=resource_hydration_options,
                concurrency_control=max_operations,
                service_name=SERVICE_NAME,
            )
        )

    # Iterate text blocks and create text block and resource metadata hydration
    # tasks depending on the reranker
    text_blocks_by_id: dict[str, TextBlockMatch] = {}  # useful for faster access to text blocks later
    text_block_hydration_ops = []
    for text_block in text_blocks:
        rid = text_block.paragraph_id.rid
        paragraph_id = text_block.paragraph_id.full()

        # If we find multiple results (from different indexes) with different
        # metadata, this statement will only get the metadata from the first on
        # the list. We assume metadata is the same on all indexes, otherwise
        # this would be a BUG
        text_blocks_by_id.setdefault(paragraph_id, text_block)

        # rerankers that need extra results may end with less resources than the
        # ones we see now, so we'll skip this step and recompute the resources
        # later
        if not reranker.needs_extra_results:
            schedule_resource_hydration(rid)

        text_block_hydration_ops.append(
            asyncio.create_task(
                hydrate_text_block(
                    kbid,
                    text_block,
                    text_block_hydration_options,
                    concurrency_control=max_operations,
                )
            )
        )

    # hydrate only the strictly needed before rerank
    hydrated_text_blocks: list[TextBlockMatch]
    hydrated_resources: list[Union[Resource, None]]

    ops = [
        *text_block_hydration_ops,
        *resource_hydration_ops.values(),
    ]
    FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
    results = await asyncio.gather(*ops)

    # gather keeps the order of its arguments: text blocks first, then resources
    hydrated_text_blocks = results[: len(text_block_hydration_ops)]  # type: ignore
    hydrated_resources = results[len(text_block_hydration_ops) :]  # type: ignore

    # with the hydrated text, rerank and apply new scores to the text blocks
    to_rerank = [
        RerankableItem(
            id=text_block.paragraph_id.full(),
            score=text_block.score,
            score_type=text_block.score_type,
            content=text_block.text or "",  # TODO: add a warning, this shouldn't usually happen
        )
        for text_block in hydrated_text_blocks
    ]
    reranked = await reranker.rerank(to_rerank, reranking_options)

    # after reranking, we can cut to the number of results the user wants, so we
    # don't hydrate unnecessary stuff
    reranked = reranked[:top_k]

    matches = []
    for item in reranked:
        paragraph_id = item.id
        score = item.score
        score_type = item.score_type

        text_block = text_blocks_by_id[paragraph_id]
        text_block.score = score
        text_block.score_type = score_type

        matches.append((paragraph_id, score))

    matches.sort(key=lambda x: x[1], reverse=True)

    best_matches = []
    best_text_blocks = []
    for order, (paragraph_id, _) in enumerate(matches):
        text_block = text_blocks_by_id[paragraph_id]
        text_block.order = order
        best_matches.append(paragraph_id)
        best_text_blocks.append(text_block)

        # now we have removed the text block surplus, fetch resource metadata
        if reranker.needs_extra_results:
            schedule_resource_hydration(ParagraphId.from_string(paragraph_id).rid)

    # Finally, fetch resource metadata if we haven't already done it
    if reranker.needs_extra_results:
        ops = list(resource_hydration_ops.values())
        FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
        hydrated_resources = await asyncio.gather(*ops)  # type: ignore

    # hydration yields None for a resource that could not be found; drop those
    resources = [resource for resource in hydrated_resources if resource is not None]

    return best_text_blocks, resources, best_matches
384
447
 
385
- @merge_observer.wrap({"type": "find_merge"})
386
- async def find_merge_results(
387
- search_responses: list[SearchResponse],
388
- count: int,
389
- page: int,
390
- kbid: str,
391
- show: list[ResourceProperties],
392
- field_type_filter: list[FieldTypeName],
393
- extracted: list[ExtractedDataTypeName],
394
- requested_relations: EntitiesSubgraphRequest,
395
- min_score_bm25: float,
396
- min_score_semantic: float,
397
- highlight: bool = False,
398
- ) -> KnowledgeboxFindResults:
399
- # force getting transaction on current asyncio task
400
- # so all sub tasks will use the same transaction
401
- # this is contextvar magic that is probably not ideal
402
- await get_read_only_transaction()
403
448
 
404
- paragraphs: list[list[ParagraphResult]] = []
405
- vectors: list[list[DocumentScored]] = []
406
- relations = []
449
def compose_find_resources(
    text_blocks: list[TextBlockMatch],
    resources: list[Resource],
) -> dict[str, FindResource]:
    """Arrange hydrated resources and text blocks into the resource → field →
    paragraph structure, keyed by resource id.

    Text blocks whose resource is not among `resources` are left out.

    """
    by_rid: dict[str, FindResource] = {}

    for resource in resources:
        find_resource = by_rid.get(resource.id)
        if find_resource is None:
            find_resource = FindResource(id=resource.id, fields={})
            by_rid[resource.id] = find_resource
        find_resource.updated_from(resource)

    for text_block in text_blocks:
        block_id = text_block.paragraph_id
        find_resource = by_rid.get(block_id.rid)
        if find_resource is None:
            # resource not found in db, skipping
            continue

        field_key = block_id.field_id.short_without_subfield()
        find_field = find_resource.fields.setdefault(field_key, FindField(paragraphs={}))
        find_field.paragraphs[block_id.full()] = text_block_to_find_paragraph(text_block)

    return by_rid
445
477
 
446
- await fetch_find_metadata(
447
- api_results.resources,
448
- api_results.best_matches,
449
- result_paragraphs,
450
- kbid,
451
- show,
452
- field_type_filter,
453
- extracted,
454
- highlight,
455
- ematches,
456
- )
457
- api_results.relations = await merge_relations_results(
458
- relations, requested_relations
459
- )
460
478
 
461
- return api_results
462
- finally:
463
- rcache.clear()
479
+ def _round(x: float) -> float:
480
+ return round(x, ndigits=3)