nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry, as they appear in that registry. It is provided for informational purposes only.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
nucliadb/search/search/find_merge.py
@@ -18,37 +18,48 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
-from typing import Any, Iterator, Optional, cast
+from typing import Iterable, Union
 
-from nucliadb_protos.nodereader_pb2 import (
-    DocumentScored,
-    EntitiesSubgraphRequest,
-    ParagraphResult,
-    SearchResponse,
-)
-
-from nucliadb.common.maindb.driver import Transaction
-from nucliadb.ingest.serialize import managed_serialize
-from nucliadb.middleware.transaction import get_read_only_transaction
+from nucliadb.common.external_index_providers.base import TextBlockMatch
+from nucliadb.common.ids import ParagraphId, VectorId
 from nucliadb.search import SERVICE_NAME, logger
-from nucliadb.search.search.cache import get_resource_cache
+from nucliadb.search.search.cut import cut_page
+from nucliadb.search.search.hydrator import (
+    ResourceHydrationOptions,
+    TextBlockHydrationOptions,
+    hydrate_resource_metadata,
+    hydrate_text_block,
+    text_block_to_find_paragraph,
+)
 from nucliadb.search.search.merge import merge_relations_results
+from nucliadb.search.search.rank_fusion import RankFusionAlgorithm
+from nucliadb.search.search.rerankers import (
+    RerankableItem,
+    Reranker,
+    RerankingOptions,
+)
 from nucliadb_models.common import FieldTypeName
-from nucliadb_models.resource import ExtractedDataTypeName
+from nucliadb_models.resource import ExtractedDataTypeName, Resource
 from nucliadb_models.search import (
     SCORE_TYPE,
     FindField,
-    FindParagraph,
     FindResource,
     KnowledgeboxFindResults,
     MinScore,
     ResourceProperties,
-    TempFindParagraph,
     TextPosition,
 )
+from nucliadb_protos.nodereader_pb2 import (
+    DocumentScored,
+    EntitiesSubgraphRequest,
+    ParagraphResult,
+    ParagraphSearchResponse,
+    RelationSearchResponse,
+    SearchResponse,
+    VectorSearchResponse,
+)
 from nucliadb_telemetry import metrics
 
-from . import paragraphs
 from .metrics import merge_observer
 
 FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
@@ -57,407 +68,413 @@ FIND_FETCH_OPS_DISTRIBUTION = metrics.Histogram(
 )
 
 
-def _round(x: float) -> float:
-    return round(x, ndigits=3)
-
-
-@merge_observer.wrap({"type": "set_text_value"})
-async def set_text_value(
+@merge_observer.wrap({"type": "find_merge"})
+async def build_find_response(
+    search_responses: list[SearchResponse],
+    *,
     kbid: str,
-    result_paragraph: TempFindParagraph,
-    max_operations: asyncio.Semaphore,
+    query: str,
+    relation_subgraph_query: EntitiesSubgraphRequest,
+    top_k: int,
+    min_score_bm25: float,
+    min_score_semantic: float,
+    rank_fusion_algorithm: RankFusionAlgorithm,
+    reranker: Reranker,
+    show: list[ResourceProperties] = [],
+    extracted: list[ExtractedDataTypeName] = [],
+    field_type_filter: list[FieldTypeName] = [],
     highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-    extracted_text_cache: Optional[paragraphs.ExtractedTextCache] = None,
-):
-    async with max_operations:
-        assert result_paragraph.paragraph
-        assert result_paragraph.paragraph.position
-        result_paragraph.paragraph.text = await paragraphs.get_paragraph_text(
-            kbid=kbid,
-            rid=result_paragraph.rid,
-            field=result_paragraph.field,
-            start=result_paragraph.paragraph.position.start,
-            end=result_paragraph.paragraph.position.end,
-            split=result_paragraph.split,
-            highlight=highlight,
-            ematches=ematches,
-            matches=[],  # TODO
-            extracted_text_cache=extracted_text_cache,
+) -> KnowledgeboxFindResults:
+    # merge
+    search_response = merge_shard_responses(search_responses)
+
+    keyword_results = keyword_results_to_text_block_matches(search_response.paragraph.results)
+    semantic_results = semantic_results_to_text_block_matches(
+        filter(
+            lambda x: x.score >= min_score_semantic,
+            search_response.vector.documents,
         )
+    )
+
+    merged_text_blocks: list[TextBlockMatch] = rank_fusion_algorithm.fuse(
+        keyword_results, semantic_results
+    )
+
+    # cut
+    # we assume pagination + predict reranker is forbidden and has been already
+    # enforced/validated by the query parsing.
+    if reranker.needs_extra_results:
+        assert reranker.window is not None, "Reranker definition must enforce this condition"
+        text_blocks_page, next_page = cut_page(merged_text_blocks, reranker.window)
+    else:
+        text_blocks_page, next_page = cut_page(merged_text_blocks, top_k)
+
+    # hydrate and rerank
+    resource_hydration_options = ResourceHydrationOptions(
+        show=show, extracted=extracted, field_type_filter=field_type_filter
+    )
+    text_block_hydration_options = TextBlockHydrationOptions(
+        highlight=highlight,
+        ematches=search_response.paragraph.ematches,  # type: ignore
+    )
+    reranking_options = RerankingOptions(kbid=kbid, query=query)
+    text_blocks, resources, best_matches = await hydrate_and_rerank(
+        text_blocks_page,
+        kbid,
+        resource_hydration_options=resource_hydration_options,
+        text_block_hydration_options=text_block_hydration_options,
+        reranker=reranker,
+        reranking_options=reranking_options,
+        top_k=top_k,
+    )
+
+    # build relations graph
+    relations = await merge_relations_results([search_response.relation], relation_subgraph_query)
+
+    # compose response
+    find_resources = compose_find_resources(text_blocks, resources)
+
+    next_page = search_response.paragraph.next_page or next_page
+    total_paragraphs = search_response.paragraph.total
+
+    find_results = KnowledgeboxFindResults(
+        query=query,
+        resources=find_resources,
+        best_matches=best_matches,
+        relations=relations,
+        total=total_paragraphs,
+        page_number=0,  # Bw/c with pagination
+        page_size=top_k,
+        next_page=next_page,
+        min_score=MinScore(bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)),
+    )
+    return find_results
+
+
+def merge_shard_responses(
+    responses: list[SearchResponse],
+) -> SearchResponse:
+    """Merge search responses into a single response as if there were no shards
+    involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    paragraphs = []
+    vectors = []
+    relations = []
+    for response in responses:
+        paragraphs.append(response.paragraph)
+        vectors.append(response.vector)
+        relations.append(response.relation)
 
-
-@merge_observer.wrap({"type": "set_resource_metadada_value"})
-async def set_resource_metadata_value(
-    txn: Transaction,
-    kbid: str,
-    resource: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    find_resources: dict[str, FindResource],
-    max_operations: asyncio.Semaphore,
-):
-    async with max_operations:
-        serialized_resource = await managed_serialize(
-            txn,
-            kbid,
-            resource,
-            show,
-            field_type_filter=field_type_filter,
-            extracted=extracted,
-            service_name=SERVICE_NAME,
-        )
-        if serialized_resource is not None:
-            find_resources[resource].updated_from(serialized_resource)
-        else:
-            logger.warning(f"Resource {resource} not found in {kbid}")
-            find_resources.pop(resource, None)
-
-
-class Orderer:
-    def __init__(self):
-        self.boosted_items = []
-        self.items = []
-
-    def add(self, key: Any):
-        self.items.append(key)
-
-    def add_boosted(self, key: Any):
-        self.boosted_items.append(key)
-
-    def sorted_by_score(self) -> Iterator[Any]:
-        for key in sorted(self.items, key=lambda value: value[3], reverse=True):
-            yield key
-
-    def sorted_by_insertion(self) -> Iterator[Any]:
-        returned = set()
-        for key in self.boosted_items:
-            if key in returned:
-                continue
-            returned.add(key)
-            yield key
-
-        for key in self.items:
-            if key in returned:
-                continue
-            returned.add(key)
-            yield key
-
-
-@merge_observer.wrap({"type": "fetch_find_metadata"})
-async def fetch_find_metadata(
-    find_resources: dict[str, FindResource],
-    best_matches: list[str],
-    result_paragraphs: list[TempFindParagraph],
+    merged = SearchResponse(
+        paragraph=merge_shards_keyword_responses(paragraphs),
+        vector=merge_shards_semantic_responses(vectors),
+        relation=merge_shards_relation_responses(relations),
+    )
+    return merged
+
+
+def merge_shards_keyword_responses(
+    keyword_responses: list[ParagraphSearchResponse],
+) -> ParagraphSearchResponse:
+    """Merge keyword (paragraph) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = ParagraphSearchResponse()
+    for response in keyword_responses:
+        merged.query = response.query
+        merged.next_page = merged.next_page or response.next_page
+        merged.total += response.total
+        merged.results.extend(response.results)
+        merged.ematches.extend(response.ematches)
+
+    return merged
+
+
+def merge_shards_semantic_responses(
+    semantic_responses: list[VectorSearchResponse],
+) -> VectorSearchResponse:
+    """Merge semantic (vector) search responses into a single response as if
+    there were no shards involved.
+
+    ATENTION! This is not a complete merge, we are only merging the fields
+    needed to compose a /find response.
+
+    """
+    merged = VectorSearchResponse()
+    for response in semantic_responses:
+        merged.documents.extend(response.documents)
+
+    return merged
+
+
+def merge_shards_relation_responses(
+    relation_responses: list[RelationSearchResponse],
+) -> RelationSearchResponse:
+    merged = RelationSearchResponse()
+    for response in relation_responses:
+        merged.prefix.nodes.extend(response.prefix.nodes)
+        merged.subgraph.relations.extend(response.subgraph.relations)
+
+    return merged
+
+
+def keyword_result_to_text_block_match(item: ParagraphResult) -> TextBlockMatch:
+    fuzzy_result = len(item.matches) > 0
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_string(item.paragraph),
+        score=item.score.bm25,
+        score_type=SCORE_TYPE.BM25,
+        order=0,  # NOTE: this will be filled later
+        text="",  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=item.start,
+            end=item.end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=fuzzy_result,
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def keyword_results_to_text_block_matches(items: Iterable[ParagraphResult]) -> list[TextBlockMatch]:
+    return [keyword_result_to_text_block_match(item) for item in items]
+
+
+class InvalidDocId(Exception):
+    """Raised while parsing an invalid id coming from semantic search"""
+
+    def __init__(self, invalid_vector_id: str):
+        self.invalid_vector_id = invalid_vector_id
+        super().__init__(f"Invalid vector ID: {invalid_vector_id}")
+
+
+def semantic_result_to_text_block_match(item: DocumentScored) -> TextBlockMatch:
+    try:
+        vector_id = VectorId.from_string(item.doc_id.id)
+    except (IndexError, ValueError):
+        raise InvalidDocId(item.doc_id.id)
+
+    return TextBlockMatch(
+        paragraph_id=ParagraphId.from_vector_id(vector_id),
+        score=item.score,
+        score_type=SCORE_TYPE.VECTOR,
+        order=0,  # NOTE: this will be filled later
+        text="",  # NOTE: this will be filled later too
+        position=TextPosition(
+            page_number=item.metadata.position.page_number,
+            index=item.metadata.position.index,
+            start=vector_id.vector_start,
+            end=vector_id.vector_end,
+            start_seconds=[x for x in item.metadata.position.start_seconds],
+            end_seconds=[x for x in item.metadata.position.end_seconds],
+        ),
+        # XXX: we should split labels
+        field_labels=[],
+        paragraph_labels=list(item.labels),
+        fuzzy_search=False,  # semantic search doesn't have fuzziness
+        is_a_table=item.metadata.representation.is_a_table,
+        representation_file=item.metadata.representation.file,
+        page_with_visual=item.metadata.page_with_visual,
+    )
+
+
+def semantic_results_to_text_block_matches(items: Iterable[DocumentScored]) -> list[TextBlockMatch]:
+    text_blocks: list[TextBlockMatch] = []
+    for item in items:
+        try:
+            text_block = semantic_result_to_text_block_match(item)
+        except InvalidDocId as exc:
+            logger.warning(f"Skipping invalid doc_id: {exc.invalid_vector_id}")
+            continue
+        text_blocks.append(text_block)
+    return text_blocks
+
+
+@merge_observer.wrap({"type": "hydrate_and_rerank"})
+async def hydrate_and_rerank(
+    text_blocks: Iterable[TextBlockMatch],
     kbid: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    highlight: bool = False,
-    ematches: Optional[list[str]] = None,
-):
-    txn = await get_read_only_transaction()
-    resources = set()
-    operations = []
+    *,
+    resource_hydration_options: ResourceHydrationOptions,
+    text_block_hydration_options: TextBlockHydrationOptions,
+    reranker: Reranker,
+    reranking_options: RerankingOptions,
+    top_k: int,
+) -> tuple[list[TextBlockMatch], list[Resource], list[str]]:
+    """Given a list of text blocks from a retrieval operation, hydrate and
+    rerank the results.
+
+    This function returns either the entire list or a subset of updated
+    (hydrated and reranked) text blocks and their corresponding resource
+    metadata. It also returns an ordered list of best matches.
+
+    """
     max_operations = asyncio.Semaphore(50)
-    orderer = Orderer()
-    etcache = paragraphs.ExtractedTextCache()
-    for result_paragraph in result_paragraphs:
-        if result_paragraph.paragraph is not None:
-            find_resource = find_resources.setdefault(
-                result_paragraph.rid, FindResource(id=result_paragraph.id, fields={})
-            )
-            find_field = find_resource.fields.setdefault(
-                result_paragraph.field, FindField(paragraphs={})
-            )
-
-            if result_paragraph.paragraph.id in find_field.paragraphs:
-                # Its a multiple match, push the score
-                # find_field.paragraphs[result_paragraph.paragraph.id].score = 25
-                if (
-                    find_field.paragraphs[result_paragraph.paragraph.id].score
-                    < result_paragraph.paragraph.score
-                ):
-                    # Use Vector score if there are both
-                    find_field.paragraphs[result_paragraph.paragraph.id].score = (
-                        result_paragraph.paragraph.score * 2
-                    )
-                    orderer.add(
-                        (
-                            result_paragraph.rid,
-                            result_paragraph.field,
-                            result_paragraph.paragraph.id,
-                            result_paragraph.paragraph.score,
-                        )
-                    )
-                find_field.paragraphs[result_paragraph.paragraph.id].score_type = (
-                    SCORE_TYPE.BOTH
-                )
 
-            else:
-                find_field.paragraphs[result_paragraph.paragraph.id] = (
-                    result_paragraph.paragraph
-                )
-                orderer.add(
-                    (
-                        result_paragraph.rid,
-                        result_paragraph.field,
-                        result_paragraph.paragraph.id,
-                        result_paragraph.paragraph.score,
+    # Iterate text blocks and create text block and resource metadata hydration
+    # tasks depending on the reranker
+    text_blocks_by_id: dict[str, TextBlockMatch] = {}  # useful for faster access to text blocks later
+    resource_hydration_ops = {}
+    text_block_hydration_ops = []
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        paragraph_id = text_block.paragraph_id.full()
+
+        # If we find multiple results (from different indexes) with different
+        # metadata, this statement will only get the metadata from the first on
+        # the list. We assume metadata is the same on all indexes, otherwise
+        # this would be a BUG
+        text_blocks_by_id.setdefault(paragraph_id, text_block)
+
+        # rerankers that need extra results may end with less resources than the
+        # ones we see now, so we'll skip this step and recompute the resources
+        # later
+        if not reranker.needs_extra_results:
+            if rid not in resource_hydration_ops:
+                resource_hydration_ops[rid] = asyncio.create_task(
+                    hydrate_resource_metadata(
+                        kbid,
+                        rid,
+                        options=resource_hydration_options,
+                        concurrency_control=max_operations,
+                        service_name=SERVICE_NAME,
                     )
                 )
 
-            operations.append(
-                asyncio.create_task(
-                    set_text_value(
-                        kbid=kbid,
-                        result_paragraph=result_paragraph,
-                        highlight=highlight,
-                        ematches=ematches,
-                        max_operations=max_operations,
-                        extracted_text_cache=etcache,
-                    )
-                )
-            )
-            resources.add(result_paragraph.rid)
-    etcache.clear()
-
-    for order, (rid, field_id, paragraph_id, _) in enumerate(orderer.sorted_by_score()):
-        find_resources[rid].fields[field_id].paragraphs[paragraph_id].order = order
-        best_matches.append(paragraph_id)
-
-    for resource in resources:
-        operations.append(
+        text_block_hydration_ops.append(
             asyncio.create_task(
-                set_resource_metadata_value(
-                    txn,
-                    kbid=kbid,
-                    resource=resource,
-                    show=show,
-                    field_type_filter=field_type_filter,
-                    extracted=extracted,
-                    find_resources=find_resources,
-                    max_operations=max_operations,
+                hydrate_text_block(
+                    kbid,
+                    text_block,
+                    text_block_hydration_options,
+                    concurrency_control=max_operations,
                 )
             )
         )
 
-    FIND_FETCH_OPS_DISTRIBUTION.observe(len(operations))
-    if len(operations) > 0:
-        done, _ = await asyncio.wait(operations)  # type: ignore
-        for task in done:
-            if task.exception() is not None:  # pragma: no cover
-                logger.error("Error fetching find metadata", exc_info=task.exception())
-
-
-@merge_observer.wrap({"type": "merge_paragraphs_vectors"})
-def merge_paragraphs_vectors(
-    paragraphs_shards: list[list[ParagraphResult]],
-    vectors_shards: list[list[DocumentScored]],
-    count: int,
-    page: int,
-    min_score: float,
-) -> tuple[list[TempFindParagraph], bool]:
-    merged_paragrahs: list[TempFindParagraph] = []
-
-    # We assume that paragraphs_shards and vectors_shards are already ordered
-    for paragraphs_shard in paragraphs_shards:
-        for paragraph in paragraphs_shard:
-            fuzzy_result = len(paragraph.matches) > 0
-            merged_paragrahs.append(
-                TempFindParagraph(
-                    paragraph_index=paragraph,
-                    field=paragraph.field,
-                    rid=paragraph.uuid,
-                    score=paragraph.score.bm25,
-                    start=paragraph.start,
-                    split=paragraph.split,
-                    end=paragraph.end,
-                    id=paragraph.paragraph,
-                    fuzzy_result=fuzzy_result,
-                    page_with_visual=paragraph.metadata.page_with_visual,
-                    reference=paragraph.metadata.representation.file,
-                    is_a_table=paragraph.metadata.representation.is_a_table,
+    # hydrate only the strictly needed before rerank
+    hydrated_text_blocks: list[TextBlockMatch]
+    hydrated_resources: list[Union[Resource, None]]
+
+    ops = [
+        *text_block_hydration_ops,
+        *resource_hydration_ops.values(),
+    ]
+    FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+    results = await asyncio.gather(*ops)
+
+    hydrated_text_blocks = results[: len(text_block_hydration_ops)]  # type: ignore
+    hydrated_resources = results[len(text_block_hydration_ops) :]  # type: ignore
+
+    # with the hydrated text, rerank and apply new scores to the text blocks
+    to_rerank = [
+        RerankableItem(
+            id=text_block.paragraph_id.full(),
+            score=text_block.score,
+            score_type=text_block.score_type,
+            content=text_block.text or "",  # TODO: add a warning, this shouldn't usually happen
+        )
+        for text_block in hydrated_text_blocks
+    ]
+    reranked = await reranker.rerank(to_rerank, reranking_options)
+
+    # after reranking, we can cut to the number of results the user wants, so we
+    # don't hydrate unnecessary stuff
+    reranked = reranked[:top_k]
+
+    matches = []
+    for item in reranked:
+        paragraph_id = item.id
+        score = item.score
+        score_type = item.score_type
+
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.score = score
+        text_block.score_type = score_type
+
+        matches.append((paragraph_id, score))
+
+    matches.sort(key=lambda x: x[1], reverse=True)
+
+    best_matches = []
+    best_text_blocks = []
+    resource_hydration_ops = {}
+    for order, (paragraph_id, _) in enumerate(matches):
+        text_block = text_blocks_by_id[paragraph_id]
+        text_block.order = order
+        best_matches.append(paragraph_id)
+        best_text_blocks.append(text_block)
+
+        # now we have removed the text block surplus, fetch resource metadata
+        if reranker.needs_extra_results:
+            rid = ParagraphId.from_string(paragraph_id).rid
+            if rid not in resource_hydration_ops:
+                resource_hydration_ops[rid] = asyncio.create_task(
+                    hydrate_resource_metadata(
+                        kbid,
+                        rid,
+                        options=resource_hydration_options,
+                        concurrency_control=max_operations,
+                        service_name=SERVICE_NAME,
+                    )
                 )
-            )
 
-    # merged_paragrahs.sort(key=lambda r: r.score, reverse=True)
+    # Finally, fetch resource metadata if we haven't already done it
+    if reranker.needs_extra_results:
+        ops = list(resource_hydration_ops.values())
+        FIND_FETCH_OPS_DISTRIBUTION.observe(len(ops))
+        hydrated_resources = await asyncio.gather(*ops)  # type: ignore
 
-    nextpos = 1
-    for vectors_shard in vectors_shards:
-        for vector in vectors_shard:
-            if vector.score < min_score:
-                logger.warning(
-                    f"Skipping low score vector: {vector.doc_id.id}. This should not happen"
-                )
-                continue
-            doc_id_split = vector.doc_id.id.split("/")
-            split = None
-            if len(doc_id_split) == 5:
-                rid, field_type, field, index, position = doc_id_split
-                paragraph_id = f"{rid}/{field_type}/{field}/{position}"
-            elif len(doc_id_split) == 6:
-                rid, field_type, field, split, index, position = doc_id_split
-                paragraph_id = f"{rid}/{field_type}/{field}/{split}/{position}"
-            else:
-                logger.warning(f"Skipping invalid doc_id: {vector.doc_id.id}")
-                continue
-            start, end = position.split("-")
-            merged_paragrahs.insert(
-                nextpos,
-                TempFindParagraph(
-                    vector_index=vector,
-                    rid=rid,
-                    field=f"/{field_type}/{field}",
-                    score=vector.score,
-                    start=int(start),
-                    end=int(end),
-                    split=split,
-                    id=paragraph_id,
-                ),
-            )
-            nextpos += 3
-
-    # merged_paragrahs.sort(key=lambda r: r.score, reverse=True)
-    init_position = count * page
-    end_position = init_position + count
-    next_page = len(merged_paragrahs) > end_position
-    merged_paragrahs = merged_paragrahs[init_position:end_position]
-
-    for merged_paragraph in merged_paragrahs:
-        if merged_paragraph.vector_index is not None:
-            merged_paragraph.paragraph = FindParagraph(
-                score=merged_paragraph.vector_index.score,
-                score_type=SCORE_TYPE.VECTOR,
-                text="",
-                labels=[],  # TODO: Get labels from index
-                page_with_visual=merged_paragraph.vector_index.metadata.page_with_visual,
-                reference=merged_paragraph.vector_index.metadata.representation.file,
-                is_a_table=merged_paragraph.vector_index.metadata.representation.is_a_table,
-                position=TextPosition(
-                    page_number=merged_paragraph.vector_index.metadata.position.page_number,
-                    index=merged_paragraph.vector_index.metadata.position.index,
-                    start=merged_paragraph.start,
-                    end=merged_paragraph.end,
-                    start_seconds=[
-                        x
-                        for x in merged_paragraph.vector_index.metadata.position.start_seconds
-                    ],
-                    end_seconds=[
-                        x
-                        for x in merged_paragraph.vector_index.metadata.position.end_seconds
-                    ],
-                ),
-                id=merged_paragraph.id,
-                # Vector searches don't have fuzziness
-                fuzzy_result=False,
-            )
-        elif merged_paragraph.paragraph_index is not None:
-            merged_paragraph.paragraph = FindParagraph(
-                score=merged_paragraph.paragraph_index.score.bm25,
-                score_type=SCORE_TYPE.BM25,
-                text="",
-                labels=[x for x in merged_paragraph.paragraph_index.labels],
-                page_with_visual=merged_paragraph.paragraph_index.metadata.page_with_visual,
-                reference=merged_paragraph.paragraph_index.metadata.representation.file,
-                is_a_table=merged_paragraph.paragraph_index.metadata.representation.is_a_table,
-                position=TextPosition(
-                    page_number=merged_paragraph.paragraph_index.metadata.position.page_number,
-                    index=merged_paragraph.paragraph_index.metadata.position.index,
-                    start=merged_paragraph.start,
-                    end=merged_paragraph.end,
-                    start_seconds=[
-                        x
-                        for x in merged_paragraph.paragraph_index.metadata.position.start_seconds
-                    ],
-                    end_seconds=[
-                        x
-                        for x in merged_paragraph.paragraph_index.metadata.position.end_seconds
-                    ],
-                ),
-                id=merged_paragraph.id,
-                fuzzy_result=merged_paragraph.fuzzy_result,
-            )
-    return merged_paragrahs, next_page
+    resources = [resource for resource in hydrated_resources if resource is not None]
 
+    return best_text_blocks, resources, best_matches
 
-@merge_observer.wrap({"type": "find_merge"})
-async def find_merge_results(
-    search_responses: list[SearchResponse],
-    count: int,
-    page: int,
-    kbid: str,
-    show: list[ResourceProperties],
-    field_type_filter: list[FieldTypeName],
-    extracted: list[ExtractedDataTypeName],
-    requested_relations: EntitiesSubgraphRequest,
-    min_score_bm25: float,
-    min_score_semantic: float,
-    highlight: bool = False,
-) -> KnowledgeboxFindResults:
-    # force getting transaction on current asyncio task
-    # so all sub tasks will use the same transaction
-    # this is contextvar magic that is probably not ideal
-    await get_read_only_transaction()
 
-    paragraphs: list[list[ParagraphResult]] = []
-    vectors: list[list[DocumentScored]] = []
-    relations = []
+def compose_find_resources(
+    text_blocks: list[TextBlockMatch],
+    resources: list[Resource],
+) -> dict[str, FindResource]:
+    find_resources: dict[str, FindResource] = {}
 
-    next_page = True
-    ematches: list[str] = []
-    real_query = ""
-    total_paragraphs = 0
-    for response in search_responses:
-        # Iterate over answers from different logic shards
+    for resource in resources:
+        rid = resource.id
+        if rid not in find_resources:
+            find_resources[rid] = FindResource(id=rid, fields={})
+        find_resources[rid].updated_from(resource)
 
-        ematches.extend(response.paragraph.ematches)
-        real_query = response.paragraph.query
-        next_page = next_page and response.paragraph.next_page
-        total_paragraphs += response.paragraph.total
+    for text_block in text_blocks:
+        rid = text_block.paragraph_id.rid
+        if rid not in find_resources:
+            # resource not found in db, skipping
+            continue
 
-        paragraphs.append(cast(list[ParagraphResult], response.paragraph.results))
-        vectors.append(cast(list[DocumentScored], response.vector.documents))
+        find_resource = find_resources[rid]
+        field_id = text_block.paragraph_id.field_id.short_without_subfield()
+        find_field = find_resource.fields.setdefault(field_id, FindField(paragraphs={}))
 
-        relations.append(response.relation)
+        paragraph_id = text_block.paragraph_id.full()
+        find_paragraph = text_block_to_find_paragraph(text_block)
 
-    rcache = get_resource_cache(clear=True)
+        find_field.paragraphs[paragraph_id] = find_paragraph
 
-    try:
-        result_paragraphs, merged_next_page = merge_paragraphs_vectors(
-            paragraphs, vectors, count, page, min_score_semantic
-        )
-        next_page = next_page or merged_next_page
-
-        api_results = KnowledgeboxFindResults(
-            resources={},
-            query=real_query,
-            total=total_paragraphs,
-            page_number=page,
-            page_size=count,
-            next_page=next_page,
-            min_score=MinScore(
-                bm25=_round(min_score_bm25), semantic=_round(min_score_semantic)
-            ),
-            best_matches=[],
-        )
+    return find_resources
 
-        await fetch_find_metadata(
-            api_results.resources,
-            api_results.best_matches,
-            result_paragraphs,
-            kbid,
-            show,
-            field_type_filter,
-            extracted,
-            highlight,
-            ematches,
-        )
-        api_results.relations = await merge_relations_results(
-            relations, requested_relations
-        )
 
-        return api_results
-    finally:
-        rcache.clear()
+def _round(x: float) -> float:
+    return round(x, ndigits=3)
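
To make the refactor easier to follow: the new build_find_response replaces the old merge_paragraphs_vectors/fetch_find_metadata pair with an explicit pipeline of merging shard responses, converting keyword and semantic hits into text block matches, applying rank fusion, cutting a page, hydrating and reranking, and finally composing the resources. Below is a minimal, self-contained sketch of that control flow using toy stand-in types; Match, toy_fuse and toy_find_pipeline are hypothetical names for illustration only, not nucliadb APIs (the real implementations are in nucliadb/search/search/rank_fusion.py, rerankers.py and hydrator.py).

# Toy sketch of the new /find merge pipeline; all names here are
# hypothetical stand-ins for the real nucliadb types shown in the diff.
from dataclasses import dataclass


@dataclass
class Match:  # stand-in for TextBlockMatch
    paragraph_id: str
    score: float
    source: str  # "keyword" or "semantic"
    text: str = ""
    order: int = 0


def toy_fuse(keyword: list[Match], semantic: list[Match]) -> list[Match]:
    # Stand-in for RankFusionAlgorithm.fuse: merge both rankings into one
    # (naively by raw score here, just to show where fusion happens).
    return sorted(keyword + semantic, key=lambda m: m.score, reverse=True)


def toy_find_pipeline(keyword: list[Match], semantic: list[Match], top_k: int) -> list[Match]:
    fused = toy_fuse(keyword, semantic)  # 1. rank fusion
    page = fused[:top_k]  # 2. cut (cut_page)
    for match in page:  # 3. hydrate text blocks
        match.text = f"<extracted text for {match.paragraph_id}>"
    page.sort(key=lambda m: m.score, reverse=True)  # 4. rerank (trivial here)
    for order, match in enumerate(page):  # 5. assign order / best matches
        match.order = order
    return page


if __name__ == "__main__":
    kw = [Match("rid1/f/field/0-20", 1.25, "keyword")]
    sem = [Match("rid2/f/field/0-15", 0.87, "semantic")]
    for match in toy_find_pipeline(kw, sem, top_k=2):
        print(match.order, match.paragraph_id, round(match.score, 3))

One design point visible in the diff itself: when a reranker declares needs_extra_results, hydrate_and_rerank defers resource metadata hydration until after reranking has cut the candidate list to top_k, so metadata is only fetched for resources that actually survive the cut.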