nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -19,24 +19,19 @@
19
19
  #
20
20
  import asyncio
21
21
  import json
22
+ import string
22
23
  from datetime import datetime
23
24
  from typing import Any, Awaitable, Optional, Union
24
25
 
25
26
  from async_lru import alru_cache
26
- from nucliadb_protos.noderesources_pb2 import Resource
27
27
 
28
28
  from nucliadb.common import datamanagers
29
- from nucliadb.ingest.orm.synonyms import Synonyms
30
- from nucliadb.middleware.transaction import get_read_only_transaction
29
+ from nucliadb.common.maindb.utils import get_driver
31
30
  from nucliadb.search import logger
32
- from nucliadb.search.predict import (
33
- PredictVectorMissing,
34
- SendToPredictError,
35
- convert_relations,
36
- )
31
+ from nucliadb.search.predict import SendToPredictError, convert_relations
37
32
  from nucliadb.search.search.filters import (
38
33
  convert_to_node_filters,
39
- flat_filter_labels,
34
+ flatten_filter_literals,
40
35
  has_classification_label_filters,
41
36
  split_labels_by_type,
42
37
  translate_label,
@@ -46,27 +41,30 @@ from nucliadb.search.search.metrics import (
46
41
  node_features,
47
42
  query_parse_dependency_observer,
48
43
  )
44
+ from nucliadb.search.search.rank_fusion import (
45
+ RankFusionAlgorithm,
46
+ )
47
+ from nucliadb.search.search.rerankers import (
48
+ Reranker,
49
+ )
49
50
  from nucliadb.search.utilities import get_predict
50
- from nucliadb_models.labels import translate_system_to_alias_label
51
+ from nucliadb_models.internal.predict import QueryInfo
52
+ from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
51
53
  from nucliadb_models.metadata import ResourceProcessingStatus
52
54
  from nucliadb_models.search import (
53
55
  Filter,
56
+ MaxTokens,
54
57
  MinScore,
55
- QueryInfo,
56
58
  SearchOptions,
57
- SentenceSearch,
58
59
  SortField,
59
- SortFieldMap,
60
60
  SortOptions,
61
61
  SortOrder,
62
62
  SortOrderMap,
63
63
  SuggestOptions,
64
- TokenSearch,
65
64
  )
66
65
  from nucliadb_models.security import RequestSecurity
67
66
  from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2
68
- from nucliadb_utils import const
69
- from nucliadb_utils.utilities import has_feature
67
+ from nucliadb_protos.noderesources_pb2 import Resource
70
68
 
71
69
  from .exceptions import InvalidQueryError
72
70
 
@@ -75,6 +73,8 @@ INDEX_SORTABLE_FIELDS = [
75
73
  SortField.MODIFIED,
76
74
  ]
77
75
 
76
+ DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
77
+
78
78
 
79
79
  class QueryParser:
80
80
  """
@@ -86,14 +86,14 @@ class QueryParser:
86
86
  query parsing.
87
87
  """
88
88
 
89
- _min_score_task: Optional[asyncio.Task] = None
90
89
  _query_information_task: Optional[asyncio.Task] = None
91
- _convert_vectors_task: Optional[asyncio.Task] = None
90
+ _get_vectorset_task: Optional[asyncio.Task] = None
92
91
  _detected_entities_task: Optional[asyncio.Task] = None
93
92
  _entities_meta_cache_task: Optional[asyncio.Task] = None
94
93
  _deleted_entities_groups_task: Optional[asyncio.Task] = None
95
94
  _synonyms_task: Optional[asyncio.Task] = None
96
95
  _get_classification_labels_task: Optional[asyncio.Task] = None
96
+ _get_matryoshka_dimension_task: Optional[asyncio.Task] = None
97
97
 
98
98
  def __init__(
99
99
  self,
@@ -101,9 +101,9 @@ class QueryParser:
101
101
  kbid: str,
102
102
  features: list[SearchOptions],
103
103
  query: str,
104
- filters: Union[list[str], list[Filter]],
105
- page_number: int,
106
- page_size: int,
104
+ label_filters: Union[list[str], list[Filter]],
105
+ keyword_filters: Union[list[str], list[Filter]],
106
+ top_k: int,
107
107
  min_score: MinScore,
108
108
  faceted: Optional[list[str]] = None,
109
109
  sort: Optional[SortOptions] = None,
@@ -121,16 +121,28 @@ class QueryParser:
121
121
  key_filters: Optional[list[str]] = None,
122
122
  security: Optional[RequestSecurity] = None,
123
123
  generative_model: Optional[str] = None,
124
- rephrase: Optional[bool] = False,
124
+ rephrase: bool = False,
125
+ rephrase_prompt: Optional[str] = None,
126
+ max_tokens: Optional[MaxTokens] = None,
127
+ hidden: Optional[bool] = None,
128
+ rank_fusion: Optional[RankFusionAlgorithm] = None,
129
+ reranker: Optional[Reranker] = None,
125
130
  ):
126
131
  self.kbid = kbid
127
132
  self.features = features
128
133
  self.query = query
129
- self.filters: dict[str, Any] = convert_to_node_filters(filters)
130
- self.flat_filter_labels: list[str] = []
134
+ self.hidden = hidden
135
+ if self.hidden is not None:
136
+ if self.hidden:
137
+ label_filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
138
+ else:
139
+ label_filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore
140
+
141
+ self.label_filters: dict[str, Any] = convert_to_node_filters(label_filters)
142
+ self.flat_label_filters: list[str] = []
143
+ self.keyword_filters: dict[str, Any] = convert_to_node_filters(keyword_filters)
131
144
  self.faceted = faceted or []
132
- self.page_number = page_number
133
- self.page_size = page_size
145
+ self.top_k = top_k
134
146
  self.min_score = min_score
135
147
  self.sort = sort
136
148
  self.range_creation_start = range_creation_start
@@ -148,65 +160,81 @@ class QueryParser:
148
160
  self.security = security
149
161
  self.generative_model = generative_model
150
162
  self.rephrase = rephrase
151
- self.query_endpoint_enabled = has_feature(
152
- const.Features.PREDICT_QUERY_ENDPOINT,
153
- default=False,
154
- context={"kbid": self.kbid},
163
+ self.rephrase_prompt = rephrase_prompt
164
+ self.query_endpoint_used = False
165
+ if len(self.label_filters) > 0:
166
+ self.label_filters = translate_label_filters(self.label_filters)
167
+ self.flat_label_filters = flatten_filter_literals(self.label_filters)
168
+ self.max_tokens = max_tokens
169
+ self.rank_fusion = rank_fusion
170
+ self.reranker = reranker
171
+
172
+ @property
173
+ def has_vector_search(self) -> bool:
174
+ return SearchOptions.SEMANTIC in self.features
175
+
176
+ @property
177
+ def has_relations_search(self) -> bool:
178
+ return SearchOptions.RELATIONS in self.features
179
+
180
+ def _get_query_information(self) -> Awaitable[QueryInfo]:
181
+ if self._query_information_task is None: # pragma: no cover
182
+ self._query_information_task = asyncio.create_task(self._query_information())
183
+ return self._query_information_task
184
+
185
+ async def _query_information(self) -> QueryInfo:
186
+ vectorset = await self.select_query_vectorset()
187
+ return await query_information(
188
+ self.kbid, self.query, vectorset, self.generative_model, self.rephrase, self.rephrase_prompt
155
189
  )
156
190
 
157
- if len(self.filters) > 0:
158
- self.filters = translate_label_filters(self.filters)
159
- self.flat_filter_labels = flat_filter_labels(self.filters)
191
+ def _get_vectorset(self) -> Awaitable[Optional[str]]:
192
+ if self._get_vectorset_task is None:
193
+ self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
194
+ return self._get_vectorset_task
160
195
 
161
- def _get_default_semantic_min_score(self) -> Awaitable[float]:
162
- if self._min_score_task is None: # pragma: no cover
163
- self._min_score_task = asyncio.create_task(
164
- get_default_semantic_min_score(self.kbid)
165
- )
166
- return self._min_score_task
196
+ async def _select_vectorset(self) -> Optional[str]:
197
+ if self.vectorset:
198
+ return self.vectorset
167
199
 
168
- def _get_converted_vectors(self) -> Awaitable[list[float]]:
169
- if self._convert_vectors_task is None: # pragma: no cover
170
- self._convert_vectors_task = asyncio.create_task(
171
- convert_vectors(self.kbid, self.query)
172
- )
173
- return self._convert_vectors_task
200
+ # When vectorset is not provided we get the default from Predict API
174
201
 
175
- def _get_query_information(self) -> Awaitable[QueryInfo]:
176
- if self.query_endpoint_enabled is False:
177
- # XXX Can be removed once query endpoint is fully enabled
178
- async def static_query():
179
- return QueryInfo(
180
- visual_llm=False,
181
- max_context=300_000,
182
- entities=TokenSearch(tokens=[], time=0.0),
183
- sentence=SentenceSearch(data=[], time=0.0),
184
- query=self.query,
185
- )
202
+ try:
203
+ query_information = await self._get_query_information()
204
+ except SendToPredictError:
205
+ return None
186
206
 
187
- return static_query()
188
- if self._query_information_task is None: # pragma: no cover
189
- self._query_information_task = asyncio.create_task(
190
- query_information(
191
- self.kbid, self.query, self.generative_model, self.rephrase
192
- )
207
+ if query_information.sentence is None:
208
+ logger.error(
209
+ "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
193
210
  )
194
- return self._query_information_task
211
+ return None
212
+
213
+ for vectorset in query_information.sentence.vectors.keys():
214
+ self.vectorset = vectorset
215
+ break
216
+
217
+ return self.vectorset
218
+
219
+ def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
220
+ if self._get_matryoshka_dimension_task is None:
221
+ self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
222
+ return self._get_matryoshka_dimension_task
223
+
224
+ async def _matryoshka_dimension(self) -> Optional[int]:
225
+ vectorset = await self._select_vectorset()
226
+ return await get_matryoshka_dimension_cached(self.kbid, vectorset)
195
227
 
196
228
  def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
197
229
  if self._detected_entities_task is None: # pragma: no cover
198
- self._detected_entities_task = asyncio.create_task(
199
- detect_entities(self.kbid, self.query)
200
- )
230
+ self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
201
231
  return self._detected_entities_task
202
232
 
203
233
  def _get_entities_meta_cache(
204
234
  self,
205
235
  ) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
206
236
  if self._entities_meta_cache_task is None:
207
- self._entities_meta_cache_task = asyncio.create_task(
208
- get_entities_meta_cache(self.kbid)
209
- )
237
+ self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
210
238
  return self._entities_meta_cache_task
211
239
 
212
240
  def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
@@ -233,28 +261,17 @@ class QueryParser:
233
261
  This will schedule concurrent tasks for different data that needs to be pulled
234
262
  for the sake of the query being performed
235
263
  """
236
- if len(self.filters) > 0 and has_classification_label_filters(
237
- self.flat_filter_labels
238
- ):
264
+ if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
239
265
  asyncio.ensure_future(self._get_classification_labels())
240
- if self.min_score.semantic is None:
241
- asyncio.ensure_future(self._get_default_semantic_min_score())
242
266
 
243
- if SearchOptions.VECTOR in self.features and self.user_vector is None:
244
- if self.query_endpoint_enabled:
245
- asyncio.ensure_future(self._get_query_information())
246
- else:
247
- asyncio.ensure_future(self._get_converted_vectors())
248
-
249
- if (SearchOptions.RELATIONS in self.features or self.autofilter) and len(
250
- self.query
251
- ) > 0:
252
- if (
253
- not self.query_endpoint_enabled
254
- or SearchOptions.VECTOR not in self.features
255
- or self.user_vector is not None
256
- ):
257
- self.query_endpoint_enabled = False
267
+ if self.has_vector_search and self.user_vector is None:
268
+ self.query_endpoint_used = True
269
+ asyncio.ensure_future(self._get_query_information())
270
+ asyncio.ensure_future(self._get_matryoshka_dimension())
271
+
272
+ if (self.has_relations_search or self.autofilter) and len(self.query) > 0:
273
+ if not self.query_endpoint_used:
274
+ # If we only need to detect entities, we don't need the query endpoint
258
275
  asyncio.ensure_future(self._get_detected_entities())
259
276
  asyncio.ensure_future(self._get_entities_meta_cache())
260
277
  asyncio.ensure_future(self._get_deleted_entity_groups())
@@ -273,6 +290,8 @@ class QueryParser:
273
290
  request.body = self.query
274
291
  request.with_duplicates = self.with_duplicates
275
292
 
293
+ self.parse_sorting(request)
294
+
276
295
  await self._schedule_dependency_tasks()
277
296
 
278
297
  await self.parse_filters(request)
@@ -281,30 +300,29 @@ class QueryParser:
281
300
  incomplete = await self.parse_vector_search(request)
282
301
  autofilters = await self.parse_relation_search(request)
283
302
  await self.parse_synonyms(request)
284
-
285
- self.parse_sorting(request)
286
- await self.parse_min_score(request)
287
-
303
+ await self.parse_min_score(request, incomplete)
304
+ await self.adjust_page_size(request, self.rank_fusion, self.reranker)
288
305
  return request, incomplete, autofilters
289
306
 
290
307
  async def parse_filters(self, request: nodereader_pb2.SearchRequest) -> None:
291
- if len(self.filters) > 0:
292
- field_labels = self.flat_filter_labels
308
+ if len(self.label_filters) > 0:
309
+ field_labels = self.flat_label_filters
293
310
  paragraph_labels: list[str] = []
294
- if has_classification_label_filters(self.flat_filter_labels):
311
+ if has_classification_label_filters(self.flat_label_filters):
295
312
  classification_labels = await self._get_classification_labels()
296
313
  field_labels, paragraph_labels = split_labels_by_type(
297
- self.flat_filter_labels, classification_labels
314
+ self.flat_label_filters, classification_labels
298
315
  )
299
- check_supported_filters(self.filters, paragraph_labels)
316
+ check_supported_filters(self.label_filters, paragraph_labels)
300
317
 
301
318
  request.filter.field_labels.extend(field_labels)
302
319
  request.filter.paragraph_labels.extend(paragraph_labels)
303
- request.filter.expression = json.dumps(self.filters)
320
+ request.filter.labels_expression = json.dumps(self.label_filters)
304
321
 
305
- request.faceted.labels.extend(
306
- [translate_label(facet) for facet in self.faceted]
307
- )
322
+ if len(self.keyword_filters) > 0:
323
+ request.filter.keywords_expression = json.dumps(self.keyword_filters)
324
+
325
+ request.faceted.labels.extend([translate_label(facet) for facet in self.faceted])
308
326
  request.fields.extend(self.fields)
309
327
 
310
328
  if self.security is not None and len(self.security.groups) > 0:
@@ -354,9 +372,7 @@ class QueryParser:
354
372
  order=SortOrder.DESC,
355
373
  limit=None,
356
374
  )
357
- elif (
358
- self.sort.field not in INDEX_SORTABLE_FIELDS and self.sort.limit is None
359
- ):
375
+ elif self.sort.field not in INDEX_SORTABLE_FIELDS and self.sort.limit is None:
360
376
  raise InvalidQueryError(
361
377
  "sort_field",
362
378
  f"Sort by '{self.sort.field}' requires setting a sort limit",
@@ -369,95 +385,133 @@ class QueryParser:
369
385
  # have consistent results, we must limit them
370
386
  request.result_per_page = self.sort.limit
371
387
  else:
372
- request.result_per_page = self.page_number * self.page_size + self.page_size
388
+ request.result_per_page = self.top_k
373
389
 
374
- sort_field = SortFieldMap[self.sort.field] if self.sort else None
390
+ sort_field = get_sort_field_proto(self.sort.field) if self.sort else None
375
391
  if sort_field is not None:
376
392
  request.order.sort_by = sort_field
377
393
  request.order.type = SortOrderMap[self.sort.order] # type: ignore
378
394
 
379
- async def parse_min_score(self, request: nodereader_pb2.SearchRequest) -> None:
380
- if self.min_score.semantic is None:
381
- self.min_score.semantic = await self._get_default_semantic_min_score()
395
+ async def parse_min_score(self, request: nodereader_pb2.SearchRequest, incomplete: bool) -> None:
396
+ semantic_min_score = DEFAULT_GENERIC_SEMANTIC_THRESHOLD
397
+ if self.min_score.semantic is not None:
398
+ semantic_min_score = self.min_score.semantic
399
+ elif self.has_vector_search and not incomplete:
400
+ query_information = await self._get_query_information()
401
+ vectorset = await self._select_vectorset()
402
+ if vectorset is not None:
403
+ semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
404
+ if semantic_threshold is not None:
405
+ semantic_min_score = semantic_threshold
406
+ else:
407
+ logger.warning(
408
+ "Semantic threshold not found in query information, using default",
409
+ extra={"kbid": self.kbid},
410
+ )
411
+ else:
412
+ logger.warning(
413
+ "Vectorset unset by user or predict, using default semantic threshold",
414
+ extra={"kbid": self.kbid},
415
+ )
416
+ self.min_score.semantic = semantic_min_score
382
417
  request.min_score_semantic = self.min_score.semantic
383
418
  request.min_score_bm25 = self.min_score.bm25
384
419
 
385
420
  def parse_document_search(self, request: nodereader_pb2.SearchRequest) -> None:
386
- if SearchOptions.DOCUMENT in self.features:
421
+ if SearchOptions.FULLTEXT in self.features:
387
422
  request.document = True
388
423
  node_features.inc({"type": "documents"})
389
424
 
390
425
  def parse_paragraph_search(self, request: nodereader_pb2.SearchRequest) -> None:
391
- if SearchOptions.PARAGRAPH in self.features:
426
+ if SearchOptions.KEYWORD in self.features:
392
427
  request.paragraph = True
393
428
  node_features.inc({"type": "paragraphs"})
394
429
 
430
+ async def select_query_vectorset(self) -> Optional[str]:
431
+ """Set and return the requested vectorset parameter (if used) validated
432
+ for the current KB.
433
+
434
+ """
435
+ if not self.vectorset:
436
+ return None
437
+
438
+ # validate vectorset
439
+ async with datamanagers.with_ro_transaction() as txn:
440
+ if not await datamanagers.vectorsets.exists(
441
+ txn, kbid=self.kbid, vectorset_id=self.vectorset
442
+ ):
443
+ raise InvalidQueryError(
444
+ "vectorset",
445
+ f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
446
+ )
447
+ return self.vectorset
448
+
395
449
  async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
396
- if SearchOptions.VECTOR not in self.features:
450
+ if not self.has_vector_search:
397
451
  return False
398
452
 
399
453
  node_features.inc({"type": "vectors"})
400
454
 
401
455
  incomplete = False
402
- if self.vectorset is not None:
403
- request.vectorset = self.vectorset
404
- node_features.inc({"type": "vectorset"})
405
456
 
457
+ vectorset = await self._select_vectorset()
458
+ if vectorset is not None:
459
+ request.vectorset = vectorset
460
+
461
+ query_vector = None
406
462
  if self.user_vector is None:
407
- if self.query_endpoint_enabled:
408
- try:
409
- query_info = await self._get_query_information()
410
- if query_info and query_info.sentence:
411
- request.vector.extend(query_info.sentence.data)
412
- else:
413
- incomplete = True
414
- except SendToPredictError as err:
415
- logger.warning(
416
- f"Errors on predict api trying to embedd query: {err}"
417
- )
418
- incomplete = True
419
- except PredictVectorMissing:
420
- logger.warning("Predict api returned an empty vector")
421
- incomplete = True
463
+ try:
464
+ query_info = await self._get_query_information()
465
+ except SendToPredictError as err:
466
+ logger.warning(f"Errors on predict api trying to embedd query: {err}")
467
+ incomplete = True
422
468
  else:
423
- try:
424
- request.vector.extend(await self._get_converted_vectors())
425
- except SendToPredictError as err:
426
- logger.warning(
427
- f"Errors on predict api trying to embedd query: {err}"
428
- )
429
- incomplete = True
430
- except PredictVectorMissing:
431
- logger.warning("Predict api returned an empty vector")
469
+ if query_info and query_info.sentence:
470
+ if vectorset:
471
+ if vectorset in query_info.sentence.vectors:
472
+ query_vector = query_info.sentence.vectors[vectorset]
473
+ else:
474
+ incomplete = True
475
+ else:
476
+ for vectorset_id, vector in query_info.sentence.vectors.items():
477
+ if vector:
478
+ query_vector = vector
479
+ break
480
+ else:
481
+ incomplete = True
482
+
483
+ else:
432
484
  incomplete = True
433
485
  else:
434
- request.vector.extend(self.user_vector)
486
+ query_vector = self.user_vector
487
+
488
+ if query_vector is not None:
489
+ matryoshka_dimension = await self._get_matryoshka_dimension()
490
+ if matryoshka_dimension is not None:
491
+ # KB using a matryoshka embeddings model, cut the query vector
492
+ # accordingly
493
+ query_vector = query_vector[:matryoshka_dimension]
494
+ request.vector.extend(query_vector)
495
+
435
496
  return incomplete
436
497
 
437
- async def parse_relation_search(
438
- self, request: nodereader_pb2.SearchRequest
439
- ) -> list[str]:
498
+ async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
440
499
  autofilters = []
441
- relations_search = SearchOptions.RELATIONS in self.features
442
- if relations_search or self.autofilter:
443
- if not self.query_endpoint_enabled:
500
+ if self.has_relations_search or self.autofilter:
501
+ if not self.query_endpoint_used:
444
502
  detected_entities = await self._get_detected_entities()
445
503
  else:
446
504
  query_info_result = await self._get_query_information()
447
505
  if query_info_result.entities:
448
- detected_entities = convert_relations(
449
- query_info_result.entities.dict()
450
- )
506
+ detected_entities = convert_relations(query_info_result.entities.model_dump())
451
507
  else:
452
508
  detected_entities = []
453
509
  meta_cache = await self._get_entities_meta_cache()
454
510
  detected_entities = expand_entities(meta_cache, detected_entities)
455
- if relations_search:
511
+ if self.has_relations_search:
456
512
  request.relation_subgraph.entry_points.extend(detected_entities)
457
513
  request.relation_subgraph.depth = 1
458
- request.relation_subgraph.deleted_groups.extend(
459
- await self._get_deleted_entity_groups()
460
- )
514
+ request.relation_subgraph.deleted_groups.extend(await self._get_deleted_entity_groups())
461
515
  for group_id, deleted_entities in meta_cache.deleted_entities.items():
462
516
  request.relation_subgraph.deleted_entities.append(
463
517
  nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
@@ -467,67 +521,111 @@ class QueryParser:
467
521
  node_features.inc({"type": "relations"})
468
522
  if self.autofilter:
469
523
  entity_filters = parse_entities_to_filters(request, detected_entities)
470
- autofilters.extend(
471
- [translate_system_to_alias_label(e) for e in entity_filters]
472
- )
524
+ autofilters.extend([translate_system_to_alias_label(e) for e in entity_filters])
473
525
  return autofilters
474
526
 
475
527
  async def parse_synonyms(self, request: nodereader_pb2.SearchRequest) -> None:
476
- if not self.with_synonyms:
528
+ """
529
+ Replace the terms in the query with an expression that will make it match with the configured synonyms.
530
+ We're using the Tantivy's query language here: https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html
531
+
532
+ Example:
533
+ - Synonyms: Foo -> Bar, Baz
534
+ - Query: "What is Foo?"
535
+ - Advanced Query: "What is (Foo OR Bar OR Baz)?"
536
+ """
537
+ if not self.with_synonyms or not self.query:
538
+ # Nothing to do
477
539
  return
478
540
 
479
- if (
480
- SearchOptions.VECTOR in self.features
481
- or SearchOptions.RELATIONS in self.features
482
- ):
541
+ if self.has_vector_search or self.has_relations_search:
483
542
  raise InvalidQueryError(
484
543
  "synonyms",
485
544
  "Search with custom synonyms is only supported on paragraph and document search",
486
545
  )
487
546
 
488
- if not self.query:
489
- # Nothing to do
490
- return
491
-
492
547
  synonyms = await self._get_synomyns()
493
548
  if synonyms is None:
494
549
  # No synonyms found
495
550
  return
496
551
 
497
- synonyms_found: list[str] = []
498
- advanced_query = []
499
- for term in self.query.split(" "):
500
- advanced_query.append(term)
501
- term_synonyms = synonyms.terms.get(term)
502
- if term_synonyms is None or len(term_synonyms.synonyms) == 0:
503
- # No synonyms found for this term
504
- continue
505
- synonyms_found.extend(term_synonyms.synonyms)
506
-
507
- if len(synonyms_found):
508
- request.advanced_query = " OR ".join(advanced_query + synonyms_found)
552
+ # Calculate term variants: 'term' -> '(term OR synonym1 OR synonym2)'
553
+ variants: dict[str, str] = {}
554
+ for term, term_synonyms in synonyms.terms.items():
555
+ if len(term_synonyms.synonyms) > 0:
556
+ variants[term] = "({})".format(" OR ".join([term] + list(term_synonyms.synonyms)))
557
+
558
+ # Split the query into terms
559
+ query_terms = self.query.split()
560
+
561
+ # Remove punctuation from the query terms
562
+ clean_query_terms = [term.strip(string.punctuation) for term in query_terms]
563
+
564
+ # Replace the original terms with the variants if the cleaned term is in the variants
565
+ term_with_synonyms_found = False
566
+ for index, clean_term in enumerate(clean_query_terms):
567
+ if clean_term in variants:
568
+ term_with_synonyms_found = True
569
+ query_terms[index] = query_terms[index].replace(clean_term, variants[clean_term])
570
+
571
+ if term_with_synonyms_found:
572
+ request.advanced_query = " ".join(query_terms)
509
573
  request.ClearField("body")
510
574
 
511
575
  async def get_visual_llm_enabled(self) -> bool:
512
576
  return (await self._get_query_information()).visual_llm
513
577
 
514
- async def get_max_context(self) -> int:
515
- # Multiple by 3 is to have a good margin and guess
516
- # between characters and tokens. This will be fully properly
517
- # cut at the NUA API.
518
- return (await self._get_query_information()).max_context * 3
578
+ async def get_max_tokens_context(self) -> int:
579
+ model_max = (await self._get_query_information()).max_context
580
+ if self.max_tokens is not None and self.max_tokens.context is not None:
581
+ if self.max_tokens.context > model_max:
582
+ raise InvalidQueryError(
583
+ "max_tokens.context",
584
+ f"Max context tokens is higher than the model's limit of {model_max}",
585
+ )
586
+ return self.max_tokens.context
587
+ return model_max
588
+
589
+ def get_max_tokens_answer(self) -> Optional[int]:
590
+ if self.max_tokens is not None and self.max_tokens.answer is not None:
591
+ return self.max_tokens.answer
592
+ return None
593
+
594
+ async def adjust_page_size(
595
+ self,
596
+ request: nodereader_pb2.SearchRequest,
597
+ rank_fusion: Optional[RankFusionAlgorithm],
598
+ reranker: Optional[Reranker],
599
+ ):
600
+ """Adjust requested page size depending on rank fusion and reranking algorithms.
601
+
602
+ Some rerankers want more results than the requested by the user so
603
+ reranking can have more choices.
604
+
605
+ """
606
+ rank_fusion_window = 0
607
+ if rank_fusion is not None:
608
+ rank_fusion_window = rank_fusion.window
609
+
610
+ reranker_window = 0
611
+ if reranker is not None:
612
+ reranker_window = reranker.window or 0
613
+
614
+ request.result_per_page = max(
615
+ request.result_per_page,
616
+ rank_fusion_window,
617
+ reranker_window,
618
+ )
519
619
 
520
620
 
521
621
  async def paragraph_query_to_pb(
522
622
  kbid: str,
523
- features: list[SearchOptions],
524
623
  rid: str,
525
624
  query: str,
526
625
  fields: list[str],
527
626
  filters: list[str],
528
627
  faceted: list[str],
529
- page_number: int,
530
- page_size: int,
628
+ top_k: int,
531
629
  range_creation_start: Optional[datetime] = None,
532
630
  range_creation_end: Optional[datetime] = None,
533
631
  range_modification_start: Optional[datetime] = None,
@@ -535,13 +633,37 @@ async def paragraph_query_to_pb(
535
633
  sort: Optional[str] = None,
536
634
  sort_ord: str = SortOrder.DESC.value,
537
635
  with_duplicates: bool = False,
538
- ) -> nodereader_pb2.ParagraphSearchRequest:
539
- request = nodereader_pb2.ParagraphSearchRequest()
540
- request.with_duplicates = with_duplicates
636
+ ) -> nodereader_pb2.SearchRequest:
637
+ request = nodereader_pb2.SearchRequest()
638
+ request.paragraph = True
541
639
 
542
640
  # We need to ask for all and cut later
543
641
  request.page_number = 0
544
- request.result_per_page = page_number * page_size + page_size
642
+ request.result_per_page = top_k
643
+
644
+ request.body = query
645
+
646
+ # we don't have a specific filter only for resource_ids but key_filters
647
+ # parse "rid" and "rid/field" like ids, so it does the job
648
+ request.key_filters.append(rid)
649
+
650
+ if len(filters) > 0:
651
+ field_labels = filters
652
+ paragraph_labels: list[str] = []
653
+ if has_classification_label_filters(filters):
654
+ classification_labels = await get_classification_labels(kbid)
655
+ field_labels, paragraph_labels = split_labels_by_type(filters, classification_labels)
656
+ request.filter.field_labels.extend(field_labels)
657
+ request.filter.paragraph_labels.extend(paragraph_labels)
658
+
659
+ request.faceted.labels.extend([translate_label(facet) for facet in faceted])
660
+ request.fields.extend(fields)
661
+
662
+ if sort:
663
+ request.order.field = sort
664
+ request.order.type = sort_ord # type: ignore
665
+
666
+ request.with_duplicates = with_duplicates
545
667
 
546
668
  if range_creation_start is not None:
547
669
  request.timestamps.from_created.FromDatetime(range_creation_start)
@@ -555,44 +677,20 @@ async def paragraph_query_to_pb(
555
677
  if range_modification_end is not None:
556
678
  request.timestamps.to_modified.FromDatetime(range_modification_end)
557
679
 
558
- if SearchOptions.PARAGRAPH in features:
559
- request.uuid = rid
560
- request.body = query
561
- if len(filters) > 0:
562
- field_labels = filters
563
- paragraph_labels: list[str] = []
564
- if has_classification_label_filters(filters):
565
- classification_labels = await get_classification_labels(kbid)
566
- field_labels, paragraph_labels = split_labels_by_type(
567
- filters, classification_labels
568
- )
569
- request.filter.field_labels.extend(field_labels)
570
- request.filter.paragraph_labels.extend(paragraph_labels)
571
-
572
- request.faceted.labels.extend([translate_label(facet) for facet in faceted])
573
- if sort:
574
- request.order.field = sort
575
- request.order.type = sort_ord # type: ignore
576
- request.fields.extend(fields)
577
-
578
680
  return request
579
681
 
580
682
 
581
- @query_parse_dependency_observer.wrap({"type": "convert_vectors"})
582
- async def convert_vectors(kbid: str, query: str) -> list[float]:
583
- predict = get_predict()
584
- return await predict.convert_sentence_to_vector(kbid, query)
585
-
586
-
587
683
  @query_parse_dependency_observer.wrap({"type": "query_information"})
588
684
  async def query_information(
589
685
  kbid: str,
590
686
  query: str,
687
+ semantic_model: Optional[str],
591
688
  generative_model: Optional[str] = None,
592
689
  rephrase: bool = False,
690
+ rephrase_prompt: Optional[str] = None,
593
691
  ) -> QueryInfo:
594
692
  predict = get_predict()
595
- return await predict.query(kbid, query, generative_model, rephrase)
693
+ return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
596
694
 
597
695
 
598
696
  @query_parse_dependency_observer.wrap({"type": "detect_entities"})
@@ -632,9 +730,7 @@ def expand_entities(
632
730
  )
633
731
 
634
732
  if entity.value in duplicated_entities_by_value[entity.subtype]:
635
- source_duplicate = duplicated_entities_by_value[entity.subtype][
636
- entity.value
637
- ]
733
+ source_duplicate = duplicated_entities_by_value[entity.subtype][entity.value]
638
734
  result_entities[source_duplicate] = utils_pb2.RelationNode(
639
735
  ntype=utils_pb2.RelationNode.NodeType.ENTITY,
640
736
  subtype=entity.subtype,
@@ -667,13 +763,16 @@ def parse_entities_to_filters(
667
763
  if entity_filter not in request.filter.field_labels:
668
764
  request.filter.field_labels.append(entity_filter)
669
765
  added_filters.append(entity_filter)
766
+
670
767
  # We need to expand the filter expression with the automatically detected entities.
671
768
  if len(added_filters) > 0:
769
+ # So far, autofilters feature will only yield 'and' expressions with the detected entities.
770
+ # More complex autofilters can be added here if we leverage the query endpoint.
672
771
  expanded_expression = {"and": [{"literal": entity} for entity in added_filters]}
673
- if request.filter.expression:
674
- expression = json.loads(request.filter.expression)
675
- expanded_expression["and"].extend(expression)
676
- request.filter.expression = json.dumps(expanded_expression)
772
+ if request.filter.labels_expression:
773
+ expression = json.loads(request.filter.labels_expression)
774
+ expanded_expression["and"].append(expression)
775
+ request.filter.labels_expression = json.dumps(expanded_expression)
677
776
  return added_filters
678
777
 
679
778
 
@@ -687,6 +786,7 @@ def suggest_query_to_pb(
687
786
  range_creation_end: Optional[datetime] = None,
688
787
  range_modification_start: Optional[datetime] = None,
689
788
  range_modification_end: Optional[datetime] = None,
789
+ hidden: Optional[bool] = None,
690
790
  ) -> nodereader_pb2.SuggestRequest:
691
791
  request = nodereader_pb2.SuggestRequest()
692
792
 
@@ -696,10 +796,21 @@ def suggest_query_to_pb(
696
796
 
697
797
  if SuggestOptions.PARAGRAPH in features:
698
798
  request.features.append(nodereader_pb2.SuggestFeatures.PARAGRAPHS)
699
- filters = [translate_label(fltr) for fltr in filters]
700
- request.filter.field_labels.extend(filters)
701
799
  request.fields.extend(fields)
702
800
 
801
+ if hidden is not None:
802
+ if hidden:
803
+ filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
804
+ else:
805
+ filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore
806
+
807
+ expression = convert_to_node_filters(filters)
808
+ if expression:
809
+ expression = translate_label_filters(expression)
810
+
811
+ request.filter.field_labels.extend(flatten_filter_literals(expression))
812
+ request.filter.labels_expression = json.dumps(expression)
813
+
703
814
  if range_creation_start is not None:
704
815
  request.timestamps.from_created.FromDatetime(range_creation_start)
705
816
  if range_creation_end is not None:
@@ -722,49 +833,28 @@ PROCESSING_STATUS_TO_PB_MAP = {
722
833
  }
723
834
 
724
835
 
725
- @query_parse_dependency_observer.wrap({"type": "min_score"})
726
- async def get_kb_model_default_min_score(kbid: str) -> Optional[float]:
727
- txn = await get_read_only_transaction()
728
- model = await datamanagers.kb.get_model_metadata(txn, kbid=kbid)
729
- if model.HasField("default_min_score"):
730
- return model.default_min_score
731
- else:
732
- return None
733
-
734
-
735
- @alru_cache(maxsize=None)
736
- async def get_default_semantic_min_score(kbid: str) -> float:
737
- fallback = 0.7
738
- model_min_score = await get_kb_model_default_min_score(kbid)
739
- if model_min_score is not None:
740
- return model_min_score
741
- return fallback
742
-
743
-
744
836
  @query_parse_dependency_observer.wrap({"type": "synonyms"})
745
837
  async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
746
- txn = await get_read_only_transaction()
747
- return await Synonyms(txn, kbid).get()
838
+ async with get_driver().transaction(read_only=True) as txn:
839
+ return await datamanagers.synonyms.get(txn, kbid=kbid)
748
840
 
749
841
 
750
842
  @query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
751
843
  async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
752
- txn = await get_read_only_transaction()
753
- return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
844
+ async with get_driver().transaction(read_only=True) as txn:
845
+ return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
754
846
 
755
847
 
756
848
  @query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
757
849
  async def get_deleted_entity_groups(kbid: str) -> list[str]:
758
- txn = await get_read_only_transaction()
759
- return list(
760
- (await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups
761
- )
850
+ async with get_driver().transaction(read_only=True) as txn:
851
+ return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
762
852
 
763
853
 
764
854
  @query_parse_dependency_observer.wrap({"type": "classification_labels"})
765
855
  async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
766
- txn = await get_read_only_transaction()
767
- return await datamanagers.labels.get_labels(txn, kbid=kbid)
856
+ async with get_driver().transaction(read_only=True) as txn:
857
+ return await datamanagers.labels.get_labels(txn, kbid=kbid)
768
858
 
769
859
 
770
860
  def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
@@ -783,9 +873,48 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
783
873
  "Paragraph labels can only be used with 'all' filter",
784
874
  )
785
875
  for term in filters["and"]:
786
- # Nested expressions are not allowed with paragraph labels
787
- if "literal" not in term:
876
+ # Nested expressions are not allowed with paragraph labels (only "literal" and "not(literal)")
877
+ if "not" in term:
878
+ subterm = term["not"]
879
+ if "literal" not in subterm:
880
+ # AND (NOT( X )) where X is anything other than a literal
881
+ raise InvalidQueryError(
882
+ "filters",
883
+ "Paragraph labels can only be used with 'all' filter",
884
+ )
885
+ elif "literal" not in term:
788
886
  raise InvalidQueryError(
789
887
  "filters",
790
888
  "Paragraph labels can only be used with 'all' filter",
791
889
  )
890
+
891
+
892
+ @alru_cache(maxsize=None)
893
+ async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
894
+ # This can be safely cached as the matryoshka dimension is not expected to change
895
+ return await get_matryoshka_dimension(kbid, vectorset)
896
+
897
+
898
+ @query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
899
+ async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
900
+ async with get_driver().transaction(read_only=True) as txn:
901
+ matryoshka_dimension = None
902
+ if not vectorset:
903
+ # XXX this should be migrated once we remove the "default" vectorset
904
+ # concept
905
+ matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
906
+ else:
907
+ vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
908
+ if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
909
+ matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
910
+
911
+ return matryoshka_dimension
912
+
913
+
914
+ def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
915
+ return {
916
+ SortField.SCORE: None,
917
+ SortField.CREATED: nodereader_pb2.OrderBy.OrderField.CREATED,
918
+ SortField.MODIFIED: nodereader_pb2.OrderBy.OrderField.MODIFIED,
919
+ SortField.TITLE: None,
920
+ }[obj]