nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418)
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/search/search/query.py +305 -150

@@ -19,19 +19,19 @@
 #
 import asyncio
 import json
+import string
 from datetime import datetime
 from typing import Any, Awaitable, Optional, Union
 
 from async_lru import alru_cache
-from nucliadb_protos.noderesources_pb2 import Resource
 
 from nucliadb.common import datamanagers
-from nucliadb.middleware.transaction import get_read_only_transaction
+from nucliadb.common.maindb.utils import get_driver
 from nucliadb.search import logger
 from nucliadb.search.predict import SendToPredictError, convert_relations
 from nucliadb.search.search.filters import (
     convert_to_node_filters,
-    flat_filter_labels,
+    flatten_filter_literals,
     has_classification_label_filters,
     split_labels_by_type,
     translate_label,
@@ -41,17 +41,22 @@ from nucliadb.search.search.metrics import (
     node_features,
     query_parse_dependency_observer,
 )
+from nucliadb.search.search.rank_fusion import (
+    RankFusionAlgorithm,
+)
+from nucliadb.search.search.rerankers import (
+    Reranker,
+)
 from nucliadb.search.utilities import get_predict
-from nucliadb_models.labels import translate_system_to_alias_label
+from nucliadb_models.internal.predict import QueryInfo
+from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
 from nucliadb_models.metadata import ResourceProcessingStatus
 from nucliadb_models.search import (
     Filter,
     MaxTokens,
     MinScore,
-    QueryInfo,
     SearchOptions,
     SortField,
-    SortFieldMap,
     SortOptions,
     SortOrder,
     SortOrderMap,
@@ -59,6 +64,7 @@ from nucliadb_models.search import (
 )
 from nucliadb_models.security import RequestSecurity
 from nucliadb_protos import knowledgebox_pb2, nodereader_pb2, utils_pb2
+from nucliadb_protos.noderesources_pb2 import Resource
 
 from .exceptions import InvalidQueryError
 
@@ -67,7 +73,6 @@ INDEX_SORTABLE_FIELDS = [
     SortField.MODIFIED,
 ]
 
-MAX_VECTOR_RESULTS_ALLOWED = 2000
 DEFAULT_GENERIC_SEMANTIC_THRESHOLD = 0.7
 
 
@@ -82,6 +87,7 @@ class QueryParser:
     """
 
     _query_information_task: Optional[asyncio.Task] = None
+    _get_vectorset_task: Optional[asyncio.Task] = None
    _detected_entities_task: Optional[asyncio.Task] = None
    _entities_meta_cache_task: Optional[asyncio.Task] = None
    _deleted_entities_groups_task: Optional[asyncio.Task] = None
@@ -95,9 +101,9 @@ class QueryParser:
         kbid: str,
         features: list[SearchOptions],
         query: str,
-        filters: Union[list[str], list[Filter]],
-        page_number: int,
-        page_size: int,
+        label_filters: Union[list[str], list[Filter]],
+        keyword_filters: Union[list[str], list[Filter]],
+        top_k: int,
         min_score: MinScore,
         faceted: Optional[list[str]] = None,
         sort: Optional[SortOptions] = None,
@@ -107,6 +113,7 @@ class QueryParser:
         range_modification_end: Optional[datetime] = None,
         fields: Optional[list[str]] = None,
         user_vector: Optional[list[float]] = None,
+        vectorset: Optional[str] = None,
         with_duplicates: bool = False,
         with_status: Optional[ResourceProcessingStatus] = None,
         with_synonyms: bool = False,
@@ -114,17 +121,28 @@ class QueryParser:
         key_filters: Optional[list[str]] = None,
         security: Optional[RequestSecurity] = None,
         generative_model: Optional[str] = None,
-        rephrase: Optional[bool] = False,
+        rephrase: bool = False,
+        rephrase_prompt: Optional[str] = None,
         max_tokens: Optional[MaxTokens] = None,
+        hidden: Optional[bool] = None,
+        rank_fusion: Optional[RankFusionAlgorithm] = None,
+        reranker: Optional[Reranker] = None,
     ):
         self.kbid = kbid
         self.features = features
         self.query = query
-        self.filters: dict[str, Any] = convert_to_node_filters(filters)
-        self.flat_filter_labels: list[str] = []
+        self.hidden = hidden
+        if self.hidden is not None:
+            if self.hidden:
+                label_filters.append(Filter(all=[LABEL_HIDDEN]))  # type: ignore
+            else:
+                label_filters.append(Filter(none=[LABEL_HIDDEN]))  # type: ignore
+
+        self.label_filters: dict[str, Any] = convert_to_node_filters(label_filters)
+        self.flat_label_filters: list[str] = []
+        self.keyword_filters: dict[str, Any] = convert_to_node_filters(keyword_filters)
         self.faceted = faceted or []
-        self.page_number = page_number
-        self.page_size = page_size
+        self.top_k = top_k
         self.min_score = min_score
         self.sort = sort
         self.range_creation_start = range_creation_start
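
The new `hidden` parameter is folded into the label filters before they are converted into a node filter expression: `hidden=True` restricts the search to resources carrying the `LABEL_HIDDEN` system label, while `hidden=False` excludes them. A minimal sketch of that mapping (the helper name is illustrative, not part of the package):

    from typing import Optional

    from nucliadb_models.labels import LABEL_HIDDEN
    from nucliadb_models.search import Filter


    def hidden_flag_to_filter(hidden: Optional[bool]) -> Optional[Filter]:
        # None means no preference: hidden and visible resources can both match
        if hidden is None:
            return None
        if hidden:
            return Filter(all=[LABEL_HIDDEN])  # only resources labelled as hidden
        return Filter(none=[LABEL_HIDDEN])     # exclude hidden resources
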
@@ -133,6 +151,7 @@ class QueryParser:
         self.range_modification_end = range_modification_end
         self.fields = fields or []
         self.user_vector = user_vector
+        self.vectorset = vectorset
         self.with_duplicates = with_duplicates
         self.with_status = with_status
         self.with_synonyms = with_synonyms
@@ -141,15 +160,18 @@ class QueryParser:
         self.security = security
         self.generative_model = generative_model
         self.rephrase = rephrase
+        self.rephrase_prompt = rephrase_prompt
         self.query_endpoint_used = False
-        if len(self.filters) > 0:
-            self.filters = translate_label_filters(self.filters)
-            self.flat_filter_labels = flat_filter_labels(self.filters)
+        if len(self.label_filters) > 0:
+            self.label_filters = translate_label_filters(self.label_filters)
+            self.flat_label_filters = flatten_filter_literals(self.label_filters)
         self.max_tokens = max_tokens
+        self.rank_fusion = rank_fusion
+        self.reranker = reranker
 
     @property
     def has_vector_search(self) -> bool:
-        return SearchOptions.VECTOR in self.features
+        return SearchOptions.SEMANTIC in self.features
 
     @property
     def has_relations_search(self) -> bool:
@@ -157,34 +179,62 @@ class QueryParser:
 
     def _get_query_information(self) -> Awaitable[QueryInfo]:
         if self._query_information_task is None:  # pragma: no cover
-            self._query_information_task = asyncio.create_task(
-                query_information(
-                    self.kbid, self.query, self.generative_model, self.rephrase
-                )
-            )
+            self._query_information_task = asyncio.create_task(self._query_information())
         return self._query_information_task
 
+    async def _query_information(self) -> QueryInfo:
+        vectorset = await self.select_query_vectorset()
+        return await query_information(
+            self.kbid, self.query, vectorset, self.generative_model, self.rephrase, self.rephrase_prompt
+        )
+
+    def _get_vectorset(self) -> Awaitable[Optional[str]]:
+        if self._get_vectorset_task is None:
+            self._get_vectorset_task = asyncio.create_task(self._select_vectorset())
+        return self._get_vectorset_task
+
+    async def _select_vectorset(self) -> Optional[str]:
+        if self.vectorset:
+            return self.vectorset
+
+        # When vectorset is not provided we get the default from Predict API
+
+        try:
+            query_information = await self._get_query_information()
+        except SendToPredictError:
+            return None
+
+        if query_information.sentence is None:
+            logger.error(
+                "Asking for a vectorset but /query didn't return one", extra={"kbid": self.kbid}
+            )
+            return None
+
+        for vectorset in query_information.sentence.vectors.keys():
+            self.vectorset = vectorset
+            break
+
+        return self.vectorset
+
     def _get_matryoshka_dimension(self) -> Awaitable[Optional[int]]:
         if self._get_matryoshka_dimension_task is None:
-            self._get_matryoshka_dimension_task = asyncio.create_task(
-                get_matryoshka_dimension_cached(self.kbid)
-            )
+            self._get_matryoshka_dimension_task = asyncio.create_task(self._matryoshka_dimension())
         return self._get_matryoshka_dimension_task
 
+    async def _matryoshka_dimension(self) -> Optional[int]:
+        vectorset = await self._select_vectorset()
+        return await get_matryoshka_dimension_cached(self.kbid, vectorset)
+
     def _get_detected_entities(self) -> Awaitable[list[utils_pb2.RelationNode]]:
         if self._detected_entities_task is None:  # pragma: no cover
-            self._detected_entities_task = asyncio.create_task(
-                detect_entities(self.kbid, self.query)
-            )
+            self._detected_entities_task = asyncio.create_task(detect_entities(self.kbid, self.query))
         return self._detected_entities_task
 
     def _get_entities_meta_cache(
         self,
     ) -> Awaitable[datamanagers.entities.EntitiesMetaCache]:
         if self._entities_meta_cache_task is None:
-            self._entities_meta_cache_task = asyncio.create_task(
-                get_entities_meta_cache(self.kbid)
-            )
+            self._entities_meta_cache_task = asyncio.create_task(get_entities_meta_cache(self.kbid))
         return self._entities_meta_cache_task
 
     def _get_deleted_entity_groups(self) -> Awaitable[list[str]]:
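
Each `_get_*` helper above memoizes its work in an `asyncio.Task`, so different parsing steps can await the same Predict or maindb lookup without issuing it twice. A standalone sketch of that pattern (class and field names are made up for illustration):

    import asyncio
    from typing import Awaitable, Optional


    class LazyQueryData:
        _info_task: Optional[asyncio.Task] = None

        def _get_info(self) -> Awaitable[dict]:
            # The first caller creates the task; later callers await the same one.
            if self._info_task is None:
                self._info_task = asyncio.create_task(self._fetch_info())
            return self._info_task

        async def _fetch_info(self) -> dict:
            await asyncio.sleep(0.1)  # stand-in for the Predict API round trip
            return {"sentence": {"vectors": {"my-vectorset": [0.1, 0.2]}}}


    async def main() -> None:
        data = LazyQueryData()
        first, second = await asyncio.gather(data._get_info(), data._get_info())
        assert first is second  # the expensive call ran only once


    asyncio.run(main())
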
@@ -211,9 +261,7 @@ class QueryParser:
         This will schedule concurrent tasks for different data that needs to be pulled
         for the sake of the query being performed
         """
-        if len(self.filters) > 0 and has_classification_label_filters(
-            self.flat_filter_labels
-        ):
+        if len(self.label_filters) > 0 and has_classification_label_filters(self.flat_label_filters):
             asyncio.ensure_future(self._get_classification_labels())
 
         if self.has_vector_search and self.user_vector is None:
@@ -253,26 +301,28 @@ class QueryParser:
         autofilters = await self.parse_relation_search(request)
         await self.parse_synonyms(request)
         await self.parse_min_score(request, incomplete)
+        await self.adjust_page_size(request, self.rank_fusion, self.reranker)
         return request, incomplete, autofilters
 
     async def parse_filters(self, request: nodereader_pb2.SearchRequest) -> None:
-        if len(self.filters) > 0:
-            field_labels = self.flat_filter_labels
+        if len(self.label_filters) > 0:
+            field_labels = self.flat_label_filters
             paragraph_labels: list[str] = []
-            if has_classification_label_filters(self.flat_filter_labels):
+            if has_classification_label_filters(self.flat_label_filters):
                 classification_labels = await self._get_classification_labels()
                 field_labels, paragraph_labels = split_labels_by_type(
-                    self.flat_filter_labels, classification_labels
+                    self.flat_label_filters, classification_labels
                 )
-            check_supported_filters(self.filters, paragraph_labels)
+            check_supported_filters(self.label_filters, paragraph_labels)
 
             request.filter.field_labels.extend(field_labels)
             request.filter.paragraph_labels.extend(paragraph_labels)
-            request.filter.expression = json.dumps(self.filters)
+            request.filter.labels_expression = json.dumps(self.label_filters)
 
-        request.faceted.labels.extend(
-            [translate_label(facet) for facet in self.faceted]
-        )
+        if len(self.keyword_filters) > 0:
+            request.filter.keywords_expression = json.dumps(self.keyword_filters)
+
+        request.faceted.labels.extend([translate_label(facet) for facet in self.faceted])
         request.fields.extend(self.fields)
 
         if self.security is not None and len(self.security.groups) > 0:
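
`parse_filters` now serializes label and keyword filters as two separate boolean expressions (`labels_expression` and `keywords_expression`) instead of the old single `expression` field. A hedged example of the JSON shape produced by `convert_to_node_filters` for a simple request (the label paths are invented for illustration):

    import json

    # e.g. "resources labelled sports, but not politics"
    labels_expression = {
        "and": [
            {"literal": "/classification.labels/topics/sports"},
            {"not": {"literal": "/classification.labels/topics/politics"}},
        ]
    }
    print(json.dumps(labels_expression))  # what ends up in request.filter.labels_expression
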
@@ -322,9 +372,7 @@ class QueryParser:
                 order=SortOrder.DESC,
                 limit=None,
             )
-        elif (
-            self.sort.field not in INDEX_SORTABLE_FIELDS and self.sort.limit is None
-        ):
+        elif self.sort.field not in INDEX_SORTABLE_FIELDS and self.sort.limit is None:
             raise InvalidQueryError(
                 "sort_field",
                 f"Sort by '{self.sort.field}' requires setting a sort limit",
@@ -337,35 +385,32 @@ class QueryParser:
             # have consistent results, we must limit them
             request.result_per_page = self.sort.limit
         else:
-            request.result_per_page = self.page_number * self.page_size + self.page_size
+            request.result_per_page = self.top_k
 
-        sort_field = SortFieldMap[self.sort.field] if self.sort else None
+        sort_field = get_sort_field_proto(self.sort.field) if self.sort else None
         if sort_field is not None:
             request.order.sort_by = sort_field
             request.order.type = SortOrderMap[self.sort.order]  # type: ignore
 
-        if (
-            self.has_vector_search
-            and request.result_per_page > MAX_VECTOR_RESULTS_ALLOWED
-        ):
-            raise InvalidQueryError(
-                "page_size",
-                f"Pagination of semantic results limit reached: {MAX_VECTOR_RESULTS_ALLOWED}. If you want to paginate through all results, please disable the vector search feature.",  # noqa: E501
-            )
-
-    async def parse_min_score(
-        self, request: nodereader_pb2.SearchRequest, incomplete: bool
-    ) -> None:
+    async def parse_min_score(self, request: nodereader_pb2.SearchRequest, incomplete: bool) -> None:
         semantic_min_score = DEFAULT_GENERIC_SEMANTIC_THRESHOLD
         if self.min_score.semantic is not None:
             semantic_min_score = self.min_score.semantic
         elif self.has_vector_search and not incomplete:
             query_information = await self._get_query_information()
-            if query_information.semantic_threshold is not None:
-                semantic_min_score = query_information.semantic_threshold
+            vectorset = await self._select_vectorset()
+            if vectorset is not None:
+                semantic_threshold = query_information.semantic_thresholds.get(vectorset, None)
+                if semantic_threshold is not None:
+                    semantic_min_score = semantic_threshold
+                else:
+                    logger.warning(
+                        "Semantic threshold not found in query information, using default",
+                        extra={"kbid": self.kbid},
+                    )
             else:
                 logger.warning(
-                    "Semantic threshold not found in query information, using default",
+                    "Vectorset unset by user or predict, using default semantic threshold",
                     extra={"kbid": self.kbid},
                 )
         self.min_score.semantic = semantic_min_score
@@ -373,15 +418,34 @@ class QueryParser:
         request.min_score_bm25 = self.min_score.bm25
 
     def parse_document_search(self, request: nodereader_pb2.SearchRequest) -> None:
-        if SearchOptions.DOCUMENT in self.features:
+        if SearchOptions.FULLTEXT in self.features:
             request.document = True
             node_features.inc({"type": "documents"})
 
     def parse_paragraph_search(self, request: nodereader_pb2.SearchRequest) -> None:
-        if SearchOptions.PARAGRAPH in self.features:
+        if SearchOptions.KEYWORD in self.features:
             request.paragraph = True
             node_features.inc({"type": "paragraphs"})
 
+    async def select_query_vectorset(self) -> Optional[str]:
+        """Set and return the requested vectorset parameter (if used) validated
+        for the current KB.
+
+        """
+        if not self.vectorset:
+            return None
+
+        # validate vectorset
+        async with datamanagers.with_ro_transaction() as txn:
+            if not await datamanagers.vectorsets.exists(
+                txn, kbid=self.kbid, vectorset_id=self.vectorset
+            ):
+                raise InvalidQueryError(
+                    "vectorset",
+                    f"Vectorset {self.vectorset} doesn't exist in you Knowledge Box",
+                )
+        return self.vectorset
+
     async def parse_vector_search(self, request: nodereader_pb2.SearchRequest) -> bool:
         if not self.has_vector_search:
             return False
@@ -389,6 +453,11 @@ class QueryParser:
         node_features.inc({"type": "vectors"})
 
         incomplete = False
+
+        vectorset = await self._select_vectorset()
+        if vectorset is not None:
+            request.vectorset = vectorset
+
         query_vector = None
         if self.user_vector is None:
             try:
@@ -398,11 +467,24 @@ class QueryParser:
                 incomplete = True
             else:
                 if query_info and query_info.sentence:
-                    query_vector = query_info.sentence.data
+                    if vectorset:
+                        if vectorset in query_info.sentence.vectors:
+                            query_vector = query_info.sentence.vectors[vectorset]
+                        else:
+                            incomplete = True
+                    else:
+                        for vectorset_id, vector in query_info.sentence.vectors.items():
+                            if vector:
+                                query_vector = vector
+                                break
+                        else:
+                            incomplete = True
+
                 else:
                     incomplete = True
         else:
             query_vector = self.user_vector
+
         if query_vector is not None:
             matryoshka_dimension = await self._get_matryoshka_dimension()
             if matryoshka_dimension is not None:
@@ -410,11 +492,10 @@ class QueryParser:
                 # accordingly
                 query_vector = query_vector[:matryoshka_dimension]
             request.vector.extend(query_vector)
+
         return incomplete
 
-    async def parse_relation_search(
-        self, request: nodereader_pb2.SearchRequest
-    ) -> list[str]:
+    async def parse_relation_search(self, request: nodereader_pb2.SearchRequest) -> list[str]:
         autofilters = []
         if self.has_relations_search or self.autofilter:
             if not self.query_endpoint_used:
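
When the knowledge box uses matryoshka embeddings, the query vector returned by Predict can be longer than the dimension the index was built with, so it is truncated before being copied into the request. A toy illustration of that truncation (values are made up):

    query_vector = [0.12, -0.08, 0.33, 0.91, -0.27, 0.05]  # 6-d embedding from Predict
    matryoshka_dimension = 4                                # dimension stored in the index
    query_vector = query_vector[:matryoshka_dimension]
    assert query_vector == [0.12, -0.08, 0.33, 0.91]
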
@@ -422,9 +503,7 @@ class QueryParser:
             else:
                 query_info_result = await self._get_query_information()
                 if query_info_result.entities:
-                    detected_entities = convert_relations(
-                        query_info_result.entities.dict()
-                    )
+                    detected_entities = convert_relations(query_info_result.entities.model_dump())
                 else:
                     detected_entities = []
             meta_cache = await self._get_entities_meta_cache()
@@ -432,9 +511,7 @@ class QueryParser:
             if self.has_relations_search:
                 request.relation_subgraph.entry_points.extend(detected_entities)
                 request.relation_subgraph.depth = 1
-                request.relation_subgraph.deleted_groups.extend(
-                    await self._get_deleted_entity_groups()
-                )
+                request.relation_subgraph.deleted_groups.extend(await self._get_deleted_entity_groups())
                 for group_id, deleted_entities in meta_cache.deleted_entities.items():
                     request.relation_subgraph.deleted_entities.append(
                         nodereader_pb2.EntitiesSubgraphRequest.DeletedEntities(
@@ -444,13 +521,21 @@ class QueryParser:
                 node_features.inc({"type": "relations"})
             if self.autofilter:
                 entity_filters = parse_entities_to_filters(request, detected_entities)
-                autofilters.extend(
-                    [translate_system_to_alias_label(e) for e in entity_filters]
-                )
+                autofilters.extend([translate_system_to_alias_label(e) for e in entity_filters])
         return autofilters
 
     async def parse_synonyms(self, request: nodereader_pb2.SearchRequest) -> None:
-        if not self.with_synonyms:
+        """
+        Replace the terms in the query with an expression that will make it match with the configured synonyms.
+        We're using the Tantivy's query language here: https://docs.rs/tantivy/latest/tantivy/query/struct.QueryParser.html
+
+        Example:
+        - Synonyms: Foo -> Bar, Baz
+        - Query: "What is Foo?"
+        - Advanced Query: "What is (Foo OR Bar OR Baz)?"
+        """
+        if not self.with_synonyms or not self.query:
+            # Nothing to do
             return
 
         if self.has_vector_search or self.has_relations_search:
@@ -459,27 +544,32 @@ class QueryParser:
                 "Search with custom synonyms is only supported on paragraph and document search",
             )
 
-        if not self.query:
-            # Nothing to do
-            return
-
         synonyms = await self._get_synomyns()
         if synonyms is None:
             # No synonyms found
             return
 
-        synonyms_found: list[str] = []
-        advanced_query = []
-        for term in self.query.split(" "):
-            advanced_query.append(term)
-            term_synonyms = synonyms.terms.get(term)
-            if term_synonyms is None or len(term_synonyms.synonyms) == 0:
-                # No synonyms found for this term
-                continue
-            synonyms_found.extend(term_synonyms.synonyms)
-
-        if len(synonyms_found):
-            request.advanced_query = " OR ".join(advanced_query + synonyms_found)
+        # Calculate term variants: 'term' -> '(term OR synonym1 OR synonym2)'
+        variants: dict[str, str] = {}
+        for term, term_synonyms in synonyms.terms.items():
+            if len(term_synonyms.synonyms) > 0:
+                variants[term] = "({})".format(" OR ".join([term] + list(term_synonyms.synonyms)))
+
+        # Split the query into terms
+        query_terms = self.query.split()
+
+        # Remove punctuation from the query terms
+        clean_query_terms = [term.strip(string.punctuation) for term in query_terms]
+
+        # Replace the original terms with the variants if the cleaned term is in the variants
+        term_with_synonyms_found = False
+        for index, clean_term in enumerate(clean_query_terms):
+            if clean_term in variants:
+                term_with_synonyms_found = True
+                query_terms[index] = query_terms[index].replace(clean_term, variants[clean_term])
+
+        if term_with_synonyms_found:
+            request.advanced_query = " ".join(query_terms)
             request.ClearField("body")
 
     async def get_visual_llm_enabled(self) -> bool:
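
The rewritten `parse_synonyms` substitutes each query term with a parenthesised OR-group of its synonyms, keeping the surrounding punctuation, rather than appending every synonym to the end of the query. A self-contained sketch of the same expansion, reusing the example from the docstring (the helper is illustrative, not part of the package):

    import string


    def expand_synonyms(query: str, synonyms: dict[str, list[str]]) -> str:
        # 'term' -> '(term OR synonym1 OR synonym2)'
        variants = {
            term: "({})".format(" OR ".join([term] + syns))
            for term, syns in synonyms.items()
            if syns
        }
        query_terms = query.split()
        clean_terms = [term.strip(string.punctuation) for term in query_terms]
        for index, clean_term in enumerate(clean_terms):
            if clean_term in variants:
                query_terms[index] = query_terms[index].replace(clean_term, variants[clean_term])
        return " ".join(query_terms)


    print(expand_synonyms("What is Foo?", {"Foo": ["Bar", "Baz"]}))
    # -> What is (Foo OR Bar OR Baz)?
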
@@ -501,17 +591,41 @@ class QueryParser:
             return self.max_tokens.answer
         return None
 
+    async def adjust_page_size(
+        self,
+        request: nodereader_pb2.SearchRequest,
+        rank_fusion: Optional[RankFusionAlgorithm],
+        reranker: Optional[Reranker],
+    ):
+        """Adjust requested page size depending on rank fusion and reranking algorithms.
+
+        Some rerankers want more results than the requested by the user so
+        reranking can have more choices.
+
+        """
+        rank_fusion_window = 0
+        if rank_fusion is not None:
+            rank_fusion_window = rank_fusion.window
+
+        reranker_window = 0
+        if reranker is not None:
+            reranker_window = reranker.window or 0
+
+        request.result_per_page = max(
+            request.result_per_page,
+            rank_fusion_window,
+            reranker_window,
+        )
+
 
 async def paragraph_query_to_pb(
     kbid: str,
-    features: list[SearchOptions],
     rid: str,
     query: str,
     fields: list[str],
     filters: list[str],
     faceted: list[str],
-    page_number: int,
-    page_size: int,
+    top_k: int,
     range_creation_start: Optional[datetime] = None,
     range_creation_end: Optional[datetime] = None,
     range_modification_start: Optional[datetime] = None,
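
`adjust_page_size` widens `result_per_page` so that rank fusion and the reranker receive enough candidates to work with: the index is asked for the largest of the user's `top_k`, the rank fusion window and the reranker window. For example (window sizes are illustrative):

    top_k = 20               # what the user asked for (request.result_per_page)
    rank_fusion_window = 60  # candidates wanted by the rank fusion algorithm
    reranker_window = 100    # candidates wanted by the reranker

    result_per_page = max(top_k, rank_fusion_window, reranker_window)
    assert result_per_page == 100
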
@@ -519,13 +633,37 @@ async def paragraph_query_to_pb(
     sort: Optional[str] = None,
     sort_ord: str = SortOrder.DESC.value,
     with_duplicates: bool = False,
-) -> nodereader_pb2.ParagraphSearchRequest:
-    request = nodereader_pb2.ParagraphSearchRequest()
-    request.with_duplicates = with_duplicates
+) -> nodereader_pb2.SearchRequest:
+    request = nodereader_pb2.SearchRequest()
+    request.paragraph = True
 
     # We need to ask for all and cut later
     request.page_number = 0
-    request.result_per_page = page_number * page_size + page_size
+    request.result_per_page = top_k
+
+    request.body = query
+
+    # we don't have a specific filter only for resource_ids but key_filters
+    # parse "rid" and "rid/field" like ids, so it does the job
+    request.key_filters.append(rid)
+
+    if len(filters) > 0:
+        field_labels = filters
+        paragraph_labels: list[str] = []
+        if has_classification_label_filters(filters):
+            classification_labels = await get_classification_labels(kbid)
+            field_labels, paragraph_labels = split_labels_by_type(filters, classification_labels)
+        request.filter.field_labels.extend(field_labels)
+        request.filter.paragraph_labels.extend(paragraph_labels)
+
+    request.faceted.labels.extend([translate_label(facet) for facet in faceted])
+    request.fields.extend(fields)
+
+    if sort:
+        request.order.field = sort
+        request.order.type = sort_ord  # type: ignore
+
+    request.with_duplicates = with_duplicates
 
     if range_creation_start is not None:
         request.timestamps.from_created.FromDatetime(range_creation_start)
@@ -539,26 +677,6 @@ async def paragraph_query_to_pb(
     if range_modification_end is not None:
         request.timestamps.to_modified.FromDatetime(range_modification_end)
 
-    if SearchOptions.PARAGRAPH in features:
-        request.uuid = rid
-        request.body = query
-        if len(filters) > 0:
-            field_labels = filters
-            paragraph_labels: list[str] = []
-            if has_classification_label_filters(filters):
-                classification_labels = await get_classification_labels(kbid)
-                field_labels, paragraph_labels = split_labels_by_type(
-                    filters, classification_labels
-                )
-            request.filter.field_labels.extend(field_labels)
-            request.filter.paragraph_labels.extend(paragraph_labels)
-
-        request.faceted.labels.extend([translate_label(facet) for facet in faceted])
-        if sort:
-            request.order.field = sort
-            request.order.type = sort_ord  # type: ignore
-        request.fields.extend(fields)
-
     return request
 
 
@@ -566,11 +684,13 @@ async def paragraph_query_to_pb(
 async def query_information(
     kbid: str,
     query: str,
+    semantic_model: Optional[str],
     generative_model: Optional[str] = None,
     rephrase: bool = False,
+    rephrase_prompt: Optional[str] = None,
 ) -> QueryInfo:
     predict = get_predict()
-    return await predict.query(kbid, query, generative_model, rephrase)
+    return await predict.query(kbid, query, semantic_model, generative_model, rephrase, rephrase_prompt)
 
 
 @query_parse_dependency_observer.wrap({"type": "detect_entities"})
@@ -610,9 +730,7 @@ def expand_entities(
                 )
 
         if entity.value in duplicated_entities_by_value[entity.subtype]:
-            source_duplicate = duplicated_entities_by_value[entity.subtype][
-                entity.value
-            ]
+            source_duplicate = duplicated_entities_by_value[entity.subtype][entity.value]
             result_entities[source_duplicate] = utils_pb2.RelationNode(
                 ntype=utils_pb2.RelationNode.NodeType.ENTITY,
                 subtype=entity.subtype,
@@ -651,10 +769,10 @@ def parse_entities_to_filters(
     # So far, autofilters feature will only yield 'and' expressions with the detected entities.
    # More complex autofilters can be added here if we leverage the query endpoint.
    expanded_expression = {"and": [{"literal": entity} for entity in added_filters]}
-    if request.filter.expression:
-        expression = json.loads(request.filter.expression)
+    if request.filter.labels_expression:
+        expression = json.loads(request.filter.labels_expression)
         expanded_expression["and"].append(expression)
-    request.filter.expression = json.dumps(expanded_expression)
+    request.filter.labels_expression = json.dumps(expanded_expression)
     return added_filters
 
 
@@ -668,6 +786,7 @@ def suggest_query_to_pb(
     range_creation_end: Optional[datetime] = None,
     range_modification_start: Optional[datetime] = None,
     range_modification_end: Optional[datetime] = None,
+    hidden: Optional[bool] = None,
 ) -> nodereader_pb2.SuggestRequest:
     request = nodereader_pb2.SuggestRequest()
 
@@ -677,10 +796,21 @@ def suggest_query_to_pb(
 
     if SuggestOptions.PARAGRAPH in features:
         request.features.append(nodereader_pb2.SuggestFeatures.PARAGRAPHS)
-        filters = [translate_label(fltr) for fltr in filters]
-        request.filter.field_labels.extend(filters)
         request.fields.extend(fields)
 
+    if hidden is not None:
+        if hidden:
+            filters.append(Filter(all=[LABEL_HIDDEN]))  # type: ignore
+        else:
+            filters.append(Filter(none=[LABEL_HIDDEN]))  # type: ignore
+
+    expression = convert_to_node_filters(filters)
+    if expression:
+        expression = translate_label_filters(expression)
+
+    request.filter.field_labels.extend(flatten_filter_literals(expression))
+    request.filter.labels_expression = json.dumps(expression)
+
     if range_creation_start is not None:
         request.timestamps.from_created.FromDatetime(range_creation_start)
     if range_creation_end is not None:
@@ -705,28 +835,26 @@ PROCESSING_STATUS_TO_PB_MAP = {
 
 @query_parse_dependency_observer.wrap({"type": "synonyms"})
 async def get_kb_synonyms(kbid: str) -> Optional[knowledgebox_pb2.Synonyms]:
-    txn = await get_read_only_transaction()
-    return await datamanagers.synonyms.get(txn, kbid=kbid)
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.synonyms.get(txn, kbid=kbid)
 
 
 @query_parse_dependency_observer.wrap({"type": "entities_meta_cache"})
 async def get_entities_meta_cache(kbid: str) -> datamanagers.entities.EntitiesMetaCache:
-    txn = await get_read_only_transaction()
-    return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.entities.get_entities_meta_cache(txn, kbid=kbid)
 
 
 @query_parse_dependency_observer.wrap({"type": "deleted_entities_groups"})
 async def get_deleted_entity_groups(kbid: str) -> list[str]:
-    txn = await get_read_only_transaction()
-    return list(
-        (await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups
-    )
+    async with get_driver().transaction(read_only=True) as txn:
+        return list((await datamanagers.entities.get_deleted_groups(txn, kbid=kbid)).entities_groups)
 
 
 @query_parse_dependency_observer.wrap({"type": "classification_labels"})
 async def get_classification_labels(kbid: str) -> knowledgebox_pb2.Labels:
-    txn = await get_read_only_transaction()
-    return await datamanagers.labels.get_labels(txn, kbid=kbid)
+    async with get_driver().transaction(read_only=True) as txn:
+        return await datamanagers.labels.get_labels(txn, kbid=kbid)
 
 
 def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]):
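
With `nucliadb.middleware.transaction` removed, every dependency helper now opens its own short-lived read-only transaction through the maindb driver. The pattern, sketched for one of the calls above:

    from nucliadb.common import datamanagers
    from nucliadb.common.maindb.utils import get_driver


    async def fetch_labels(kbid: str):
        # The async context manager opens the transaction and releases it on exit,
        # replacing the old request-scoped get_read_only_transaction() helper.
        async with get_driver().transaction(read_only=True) as txn:
            return await datamanagers.labels.get_labels(txn, kbid=kbid)
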
@@ -745,8 +873,16 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
                 "Paragraph labels can only be used with 'all' filter",
             )
         for term in filters["and"]:
-            # Nested expressions are not allowed with paragraph labels
-            if "literal" not in term:
+            # Nested expressions are not allowed with paragraph labels (only "literal" and "not(literal)")
+            if "not" in term:
+                subterm = term["not"]
+                if "literal" not in subterm:
+                    # AND (NOT( X )) where X is anything other than a literal
+                    raise InvalidQueryError(
+                        "filters",
+                        "Paragraph labels can only be used with 'all' filter",
+                    )
+            elif "literal" not in term:
                 raise InvalidQueryError(
                     "filters",
                     "Paragraph labels can only be used with 'all' filter",
@@ -754,12 +890,31 @@ def check_supported_filters(filters: dict[str, Any], paragraph_labels: list[str]
 
 
 @alru_cache(maxsize=None)
-async def get_matryoshka_dimension_cached(kbid: str) -> Optional[int]:
+async def get_matryoshka_dimension_cached(kbid: str, vectorset: Optional[str]) -> Optional[int]:
     # This can be safely cached as the matryoshka dimension is not expected to change
-    return await get_matryoshka_dimension(kbid)
+    return await get_matryoshka_dimension(kbid, vectorset)
 
 
 @query_parse_dependency_observer.wrap({"type": "matryoshka_dimension"})
-async def get_matryoshka_dimension(kbid: str) -> Optional[int]:
-    txn = await get_read_only_transaction()
-    return await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
+async def get_matryoshka_dimension(kbid: str, vectorset: Optional[str]) -> Optional[int]:
+    async with get_driver().transaction(read_only=True) as txn:
+        matryoshka_dimension = None
+        if not vectorset:
+            # XXX this should be migrated once we remove the "default" vectorset
+            # concept
+            matryoshka_dimension = await datamanagers.kb.get_matryoshka_vector_dimension(txn, kbid=kbid)
+        else:
+            vectorset_config = await datamanagers.vectorsets.get(txn, kbid=kbid, vectorset_id=vectorset)
+            if vectorset_config is not None and vectorset_config.vectorset_index_config.vector_dimension:
+                matryoshka_dimension = vectorset_config.vectorset_index_config.vector_dimension
+
+        return matryoshka_dimension
+
+
+def get_sort_field_proto(obj: SortField) -> Optional[nodereader_pb2.OrderBy.OrderField.ValueType]:
+    return {
+        SortField.SCORE: None,
+        SortField.CREATED: nodereader_pb2.OrderBy.OrderField.CREATED,
+        SortField.MODIFIED: nodereader_pb2.OrderBy.OrderField.MODIFIED,
+        SortField.TITLE: None,
+    }[obj]
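
`get_sort_field_proto` replaces the old `SortFieldMap` lookup: only creation and modification dates map to an index-level order field, while score and title sorting return None. A quick usage sketch:

    from nucliadb_models.search import SortField
    from nucliadb_protos import nodereader_pb2

    assert get_sort_field_proto(SortField.CREATED) == nodereader_pb2.OrderBy.OrderField.CREATED
    assert get_sort_field_proto(SortField.SCORE) is None  # no index order field for score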