nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,33 +18,37 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import json
21
- from datetime import datetime
22
21
  from typing import Optional, Union
23
22
 
24
23
  from fastapi import Body, Header, Query, Request, Response
25
24
  from fastapi.openapi.models import Example
26
25
  from fastapi_versioning import version
27
- from pydantic.error_wrappers import ValidationError
26
+ from pydantic import ValidationError
28
27
 
29
28
  from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
30
29
  from nucliadb.models.responses import HTTPClientError
31
30
  from nucliadb.search import predict
32
31
  from nucliadb.search.api.v1.router import KB_PREFIX, api
33
32
  from nucliadb.search.api.v1.utils import fastapi_query
33
+ from nucliadb.search.search import cache
34
34
  from nucliadb.search.search.exceptions import InvalidQueryError
35
35
  from nucliadb.search.search.find import find
36
- from nucliadb.search.search.utils import min_score_from_query_params
36
+ from nucliadb.search.search.utils import maybe_log_request_payload, min_score_from_query_params
37
37
  from nucliadb_models.common import FieldTypeName
38
38
  from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
39
39
  from nucliadb_models.search import (
40
40
  FindRequest,
41
41
  KnowledgeboxFindResults,
42
42
  NucliaDBClientType,
43
+ RankFusionName,
44
+ Reranker,
45
+ RerankerName,
43
46
  ResourceProperties,
44
47
  SearchOptions,
45
48
  SearchParamDefaults,
46
49
  )
47
50
  from nucliadb_models.security import RequestSecurity
51
+ from nucliadb_models.utils import DateTime
48
52
  from nucliadb_utils.authentication import requires
49
53
  from nucliadb_utils.exceptions import LimitsExceededError
50
54
 
@@ -54,7 +58,7 @@ FIND_EXAMPLES = {
54
58
  description="Perform a hybrid search that will return text and semantic results matching the query",
55
59
  value={
56
60
  "query": "How can I be an effective product manager?",
57
- "features": [SearchOptions.PARAGRAPH, SearchOptions.VECTOR],
61
+ "features": [SearchOptions.KEYWORD, SearchOptions.SEMANTIC],
58
62
  },
59
63
  )
60
64
  }
@@ -63,7 +67,7 @@ FIND_EXAMPLES = {
63
67
  @api.get(
64
68
  f"/{KB_PREFIX}/{{kbid}}/find",
65
69
  status_code=200,
66
- name="Find Knowledge Box",
70
+ summary="Find Knowledge Box",
67
71
  description="Find on a Knowledge Box",
68
72
  response_model=KnowledgeboxFindResults,
69
73
  response_model_exclude_unset=True,
@@ -78,39 +82,35 @@ async def find_knowledgebox(
78
82
  query: str = fastapi_query(SearchParamDefaults.query),
79
83
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
80
84
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
81
- page_number: int = fastapi_query(SearchParamDefaults.page_number),
82
- page_size: int = fastapi_query(SearchParamDefaults.page_size),
85
+ top_k: Optional[int] = fastapi_query(SearchParamDefaults.top_k),
83
86
  min_score: Optional[float] = Query(
84
87
  default=None,
85
- description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/docs/using/search/#minimum-score", # noqa: E501
88
+ description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
86
89
  deprecated=True,
87
90
  ),
88
91
  min_score_semantic: Optional[float] = Query(
89
92
  default=None,
90
- description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/docs/using/search/#minimum-score", # noqa: E501
93
+ description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
91
94
  ),
92
95
  min_score_bm25: float = Query(
93
96
  default=0,
94
97
  description="Minimum bm25 score to filter paragraph and document index results",
95
98
  ge=0,
96
99
  ),
97
- range_creation_start: Optional[datetime] = fastapi_query(
98
- SearchParamDefaults.range_creation_start
99
- ),
100
- range_creation_end: Optional[datetime] = fastapi_query(
101
- SearchParamDefaults.range_creation_end
102
- ),
103
- range_modification_start: Optional[datetime] = fastapi_query(
100
+ vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
101
+ range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
102
+ range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
103
+ range_modification_start: Optional[DateTime] = fastapi_query(
104
104
  SearchParamDefaults.range_modification_start
105
105
  ),
106
- range_modification_end: Optional[datetime] = fastapi_query(
106
+ range_modification_end: Optional[DateTime] = fastapi_query(
107
107
  SearchParamDefaults.range_modification_end
108
108
  ),
109
109
  features: list[SearchOptions] = fastapi_query(
110
110
  SearchParamDefaults.search_features,
111
111
  default=[
112
- SearchOptions.PARAGRAPH,
113
- SearchOptions.VECTOR,
112
+ SearchOptions.KEYWORD,
113
+ SearchOptions.SEMANTIC,
114
114
  ],
115
115
  ),
116
116
  debug: bool = fastapi_query(SearchParamDefaults.debug),
@@ -119,13 +119,14 @@ async def find_knowledgebox(
119
119
  field_type_filter: list[FieldTypeName] = fastapi_query(
120
120
  SearchParamDefaults.field_type_filter, alias="field_type"
121
121
  ),
122
- extracted: list[ExtractedDataTypeName] = fastapi_query(
123
- SearchParamDefaults.extracted
124
- ),
122
+ extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
125
123
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
126
124
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
127
125
  autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
128
126
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
127
+ show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
128
+ rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
129
+ reranker: Union[RerankerName, Reranker] = fastapi_query(SearchParamDefaults.reranker),
129
130
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
130
131
  x_nucliadb_user: str = Header(""),
131
132
  x_forwarded_for: str = Header(""),
@@ -138,11 +139,9 @@ async def find_knowledgebox(
138
139
  query=query,
139
140
  fields=fields,
140
141
  filters=filters,
141
- page_number=page_number,
142
- page_size=page_size,
143
- min_score=min_score_from_query_params(
144
- min_score_bm25, min_score_semantic, min_score
145
- ),
142
+ top_k=top_k, # type: ignore
143
+ min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
144
+ vectorset=vectorset,
146
145
  range_creation_end=range_creation_end,
147
146
  range_creation_start=range_creation_start,
148
147
  range_modification_end=range_modification_end,
@@ -157,20 +156,21 @@ async def find_knowledgebox(
157
156
  with_synonyms=with_synonyms,
158
157
  autofilter=autofilter,
159
158
  security=security,
159
+ show_hidden=show_hidden,
160
+ rank_fusion=rank_fusion,
161
+ reranker=reranker,
160
162
  )
161
163
  except ValidationError as exc:
162
164
  detail = json.loads(exc.json())
163
165
  return HTTPClientError(status_code=422, detail=detail)
164
166
 
165
- return await _find_endpoint(
166
- response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
167
- )
167
+ return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
168
168
 
169
169
 
170
170
  @api.post(
171
171
  f"/{KB_PREFIX}/{{kbid}}/find",
172
172
  status_code=200,
173
- name="Find Knowledge Box",
173
+ summary="Find Knowledge Box",
174
174
  description="Find on a Knowledge Box",
175
175
  response_model=KnowledgeboxFindResults,
176
176
  response_model_exclude_unset=True,
@@ -187,9 +187,7 @@ async def find_post_knowledgebox(
187
187
  x_nucliadb_user: str = Header(""),
188
188
  x_forwarded_for: str = Header(""),
189
189
  ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
190
- return await _find_endpoint(
191
- response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
192
- )
190
+ return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
193
191
 
194
192
 
195
193
  async def _find_endpoint(
@@ -201,11 +199,13 @@ async def _find_endpoint(
201
199
  x_forwarded_for: str,
202
200
  ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
203
201
  try:
204
- results, incomplete, _ = await find(
205
- kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
206
- )
207
- response.status_code = 206 if incomplete else 200
208
- return results
202
+ maybe_log_request_payload(kbid, "/find", item)
203
+ with cache.request_caches():
204
+ results, incomplete, _ = await find(
205
+ kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
206
+ )
207
+ response.status_code = 206 if incomplete else 200
208
+ return results
209
209
  except KnowledgeBoxNotFound:
210
210
  return HTTPClientError(status_code=404, detail="Knowledge Box not found")
211
211
  except LimitsExceededError as exc:
@@ -214,6 +214,6 @@ async def _find_endpoint(
214
214
  return HTTPClientError(status_code=412, detail=str(exc))
215
215
  except predict.ProxiedPredictAPIError as err:
216
216
  return HTTPClientError(
217
- status_code=503,
218
- detail=f"Inference service unavailable. {err.status}: {err.detail}",
217
+ status_code=err.status,
218
+ detail=err.detail,
219
219
  )
@@ -24,29 +24,35 @@ from fastapi import HTTPException, Request
24
24
  from fastapi_versioning import version
25
25
  from grpc import StatusCode as GrpcStatusCode
26
26
  from grpc.aio import AioRpcError
27
- from nucliadb_protos.noderesources_pb2 import Shard
28
- from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
29
- from nucliadb_protos.writer_pb2 import Shards
30
27
 
31
28
  from nucliadb.common import datamanagers
32
29
  from nucliadb.common.cluster.exceptions import ShardsNotFound
33
30
  from nucliadb.common.cluster.manager import choose_node
34
31
  from nucliadb.common.cluster.utils import get_shard_manager
32
+ from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
33
+ from nucliadb.common.counters import IndexCounts
34
+ from nucliadb.common.external_index_providers.manager import get_external_index_manager
35
+ from nucliadb.common.models_utils import from_proto
35
36
  from nucliadb.search import logger
36
37
  from nucliadb.search.api.v1.router import KB_PREFIX, api
37
38
  from nucliadb.search.api.v1.utils import fastapi_query
38
39
  from nucliadb.search.search.shards import get_shard
39
40
  from nucliadb.search.settings import settings
41
+ from nucliadb_models.internal.shards import KnowledgeboxShards
40
42
  from nucliadb_models.resource import NucliaDBRoles
41
43
  from nucliadb_models.search import (
42
44
  KnowledgeboxCounters,
43
- KnowledgeboxShards,
44
45
  SearchParamDefaults,
45
46
  )
47
+ from nucliadb_protos.noderesources_pb2 import Shard
48
+ from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
49
+ from nucliadb_protos.writer_pb2 import Shards
46
50
  from nucliadb_telemetry import errors
51
+ from nucliadb_utils import const
47
52
  from nucliadb_utils.authentication import requires, requires_one
53
+ from nucliadb_utils.utilities import has_feature
48
54
 
49
- AVG_PARAGRAPH_SIZE_BYTES = 10_000
55
+ MAX_PARAGRAPHS_FOR_SMALL_KB = 250_000
50
56
 
51
57
 
52
58
  @api.get(
@@ -68,7 +74,7 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
68
74
  status_code=404,
69
75
  detail="The knowledgebox or its shards configuration is missing",
70
76
  )
71
- return KnowledgeboxShards.from_message(shards)
77
+ return from_proto.kb_shards(shards)
72
78
 
73
79
 
74
80
  @api.get(
@@ -84,24 +90,85 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
84
90
  async def knowledgebox_counters(
85
91
  request: Request,
86
92
  kbid: str,
87
- vectorset: str = fastapi_query(SearchParamDefaults.vectorset),
88
93
  debug: bool = fastapi_query(SearchParamDefaults.debug),
89
94
  ) -> KnowledgeboxCounters:
90
- shard_manager = get_shard_manager()
91
-
92
95
  try:
93
- shard_groups: list[PBShardObject] = await shard_manager.get_shards_by_kbid(kbid)
96
+ return await _kb_counters(kbid, debug=debug)
94
97
  except ShardsNotFound:
95
98
  raise HTTPException(
96
99
  status_code=404,
97
100
  detail="The knowledgebox or its shards configuration is missing",
98
101
  )
99
102
 
103
+
104
+ async def _kb_counters(
105
+ kbid: str,
106
+ debug: bool = False,
107
+ ) -> KnowledgeboxCounters:
108
+ """
109
+ Resources count is calculated from maindb and cached
110
+ Field count is calculated from the index node cluster
111
+ Paragraphs and Sentences count is calculated from the index node cluster or the external index provider.
112
+ Index size is estimated from the paragraphs count.
113
+ """
114
+ counters = KnowledgeboxCounters(
115
+ resources=0,
116
+ paragraphs=0,
117
+ fields=0,
118
+ sentences=0,
119
+ index_size=0,
120
+ )
121
+ external_index_manager = await get_external_index_manager(kbid)
122
+ if external_index_manager is not None:
123
+ index_counts = await external_index_manager.get_index_counts()
124
+ counters.paragraphs = index_counts.paragraphs
125
+ counters.sentences = index_counts.sentences
126
+ is_small_kb = index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
127
+ resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
128
+ # TODO: Find a way to query the fields count from the external index provider or use the catalog
129
+ counters.resources = counters.fields = resource_count
130
+ else:
131
+ node_index_counts, queried_shards = await get_node_index_counts(kbid)
132
+ counters.fields = node_index_counts.fields
133
+ counters.paragraphs = node_index_counts.paragraphs
134
+ counters.sentences = node_index_counts.sentences
135
+ is_small_kb = node_index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
136
+ resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
137
+ counters.resources = resource_count
138
+ counters.index_size = counters.paragraphs * AVG_PARAGRAPH_SIZE_BYTES
139
+ if debug and queried_shards is not None:
140
+ counters.shards = queried_shards
141
+ return counters
142
+
143
+
144
+ async def get_resources_count(kbid: str, force_calculate: bool = False) -> int:
145
+ async with datamanagers.with_ro_transaction() as txn:
146
+ if force_calculate:
147
+ # For small kbs, this is faster and more up to date
148
+ resource_count = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
149
+ else:
150
+ resource_count = await datamanagers.resources.get_number_of_resources(txn, kbid=kbid)
151
+ if resource_count == -1:
152
+ # WARNING: standalone, this value will never be cached
153
+ resource_count = await datamanagers.resources.calculate_number_of_resources(
154
+ txn, kbid=kbid
155
+ )
156
+ return resource_count
157
+
158
+
159
+ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
160
+ """
161
+ Get the index counts for a knowledgebox that has an index in the index node cluster.
162
+ """
163
+ shard_manager = get_shard_manager()
164
+ shard_groups: list[PBShardObject] = await shard_manager.get_shards_by_kbid(kbid)
100
165
  ops = []
101
166
  queried_shards = []
102
167
  for shard_object in shard_groups:
103
168
  try:
104
- node, shard_id = choose_node(shard_object)
169
+ node, shard_id = choose_node(
170
+ shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
171
+ )
105
172
  except KeyError:
106
173
  raise HTTPException(
107
174
  status_code=500,
@@ -111,7 +178,7 @@ async def knowledgebox_counters(
111
178
  if shard_id is not None:
112
179
  # At least one node is alive for this shard group
113
180
  # let's add it ot the query list if has a valid value
114
- ops.append(get_shard(node, shard_id, vectorset=vectorset))
181
+ ops.append(get_shard(node, shard_id))
115
182
  queried_shards.append(shard_id)
116
183
 
117
184
  if not ops:
@@ -122,7 +189,7 @@ async def knowledgebox_counters(
122
189
  )
123
190
 
124
191
  try:
125
- results: Optional[list[Shard]] = await asyncio.wait_for( # type: ignore
192
+ results: Optional[list[Shard]] = await asyncio.wait_for(
126
193
  asyncio.gather(*ops, return_exceptions=True), # type: ignore
127
194
  timeout=settings.search_timeout,
128
195
  )
@@ -139,56 +206,17 @@ async def knowledgebox_counters(
139
206
  if results is None:
140
207
  raise HTTPException(status_code=503, detail=f"No shards found")
141
208
 
142
- field_count = 0
143
- paragraph_count = 0
144
- sentence_count = 0
145
-
209
+ counts = IndexCounts(
210
+ fields=0,
211
+ paragraphs=0,
212
+ sentences=0,
213
+ )
146
214
  for shard in results:
147
215
  if isinstance(shard, Exception):
148
216
  logger.error("Error getting shard info", exc_info=shard)
149
217
  errors.capture_exception(shard)
150
- raise HTTPException(
151
- status_code=500, detail=f"Error while geting shard data"
152
- )
153
-
154
- field_count += shard.fields
155
- paragraph_count += shard.paragraphs
156
- sentence_count += shard.sentences
157
-
158
- async with datamanagers.with_transaction() as txn:
159
- try:
160
- if len(shard_groups) <= 1:
161
- # for smaller kbs, this is faster and more up to date
162
- resource_count = (
163
- await datamanagers.resources.calculate_number_of_resources(
164
- txn, kbid=kbid
165
- )
166
- )
167
- else:
168
- resource_count = await datamanagers.resources.get_number_of_resources(
169
- txn, kbid=kbid
170
- )
171
- if resource_count == -1:
172
- # WARNING: standalone, this value will never be cached
173
- resource_count = (
174
- await datamanagers.resources.calculate_number_of_resources(
175
- txn, kbid=kbid
176
- )
177
- )
178
- except Exception as exc:
179
- errors.capture_exception(exc)
180
- raise HTTPException(
181
- status_code=500, detail="Couldn't retrieve counters right now"
182
- )
183
-
184
- counters = KnowledgeboxCounters(
185
- resources=resource_count,
186
- paragraphs=paragraph_count,
187
- fields=field_count,
188
- sentences=sentence_count,
189
- index_size=paragraph_count * AVG_PARAGRAPH_SIZE_BYTES,
190
- )
191
-
192
- if debug:
193
- counters.shards = queried_shards
194
- return counters
218
+ raise HTTPException(status_code=500, detail=f"Error while geting shard data")
219
+ counts.fields += shard.fields
220
+ counts.paragraphs += shard.paragraphs
221
+ counts.sentences += shard.sentences
222
+ return counts, queried_shards
@@ -39,7 +39,7 @@ DESCRIPTION = "Convenience endpoint that proxies requests to the Predict API. It
39
39
  @api.get(
40
40
  path=f"/{KB_PREFIX}/{{kbid}}/predict/{{endpoint}}",
41
41
  status_code=200,
42
- name="Predict API Proxy",
42
+ summary="Predict API Proxy",
43
43
  description=DESCRIPTION,
44
44
  response_model=None,
45
45
  tags=["Search"],
@@ -47,7 +47,7 @@ DESCRIPTION = "Convenience endpoint that proxies requests to the Predict API. It
47
47
  @api.post(
48
48
  path=f"/{KB_PREFIX}/{{kbid}}/predict/{{endpoint}}",
49
49
  status_code=200,
50
- name="Predict API Proxy",
50
+ summary="Predict API Proxy",
51
51
  description=DESCRIPTION,
52
52
  response_model=None,
53
53
  tags=["Search"],
@@ -17,146 +17,95 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
- from typing import Union
20
+ from typing import Optional, Union
21
21
 
22
- from fastapi import Body, Header, Request, Response
23
- from fastapi.openapi.models import Example
22
+ from fastapi import Header, Request, Response
24
23
  from fastapi_versioning import version
25
- from nucliadb_protos.resources_pb2 import FieldComputedMetadata
26
- from nucliadb_protos.utils_pb2 import ExtractedText
24
+ from starlette.responses import StreamingResponse
27
25
 
28
- from nucliadb.common.maindb.utils import get_driver
29
- from nucliadb.ingest.orm.knowledgebox import KnowledgeBox
26
+ from nucliadb.common import datamanagers
30
27
  from nucliadb.models.responses import HTTPClientError
31
- from nucliadb.search import SERVICE_NAME, logger
32
- from nucliadb.search.api.v1.router import KB_PREFIX, api
33
- from nucliadb.search.predict import SendToPredictError
34
- from nucliadb.search.search.exceptions import InvalidQueryError, ResourceNotFoundError
35
- from nucliadb.search.utilities import get_predict
28
+ from nucliadb.search.api.v1.router import KB_PREFIX, RESOURCE_SLUG_PREFIX, api
36
29
  from nucliadb_models.resource import NucliaDBRoles
37
- from nucliadb_models.search import AskRequest, AskResponse, TextBlocks
38
- from nucliadb_utils import const
30
+ from nucliadb_models.search import AskRequest, NucliaDBClientType, SyncAskResponse
39
31
  from nucliadb_utils.authentication import requires
40
- from nucliadb_utils.exceptions import LimitsExceededError
41
- from nucliadb_utils.utilities import get_storage, has_feature
42
32
 
43
- ASK_EXAMPLES = {
44
- "Ask a Resource": Example(
45
- summary="Ask a question to the document",
46
- description="Ask a question to the document. The whole document is sent as context to the generative AI",
47
- value={
48
- "question": "Does this document contain personal information?",
49
- },
50
- )
51
- }
33
+ from ..ask import create_ask_response
52
34
 
53
35
 
54
36
  @api.post(
55
37
  f"/{KB_PREFIX}/{{kbid}}/resource/{{rid}}/ask",
56
38
  status_code=200,
57
- name="Ask a Resource",
58
- summary="Ask a question to a resource",
59
- description="Ask to the complete content of the resource",
39
+ summary="Ask a resource (by id)",
40
+ description="Ask questions to a resource",
60
41
  tags=["Search"],
61
- response_model=None,
62
- # TODO: set to True once feature is fully enabled
63
- include_in_schema=False,
42
+ response_model=SyncAskResponse,
64
43
  )
65
44
  @requires(NucliaDBRoles.READER)
66
45
  @version(1)
67
- async def resource_ask_endpoint(
46
+ async def resource_ask_endpoint_by_uuid(
68
47
  request: Request,
69
- response: Response,
70
48
  kbid: str,
71
49
  rid: str,
72
- item: AskRequest = Body(
73
- openapi_examples=ASK_EXAMPLES, description="Ask a question payload"
50
+ item: AskRequest,
51
+ x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
52
+ x_nucliadb_user: str = Header(""),
53
+ x_forwarded_for: str = Header(""),
54
+ x_synchronous: bool = Header(
55
+ False,
56
+ description="When set to true, outputs response as JSON in a non-streaming way. "
57
+ "This is slower and requires waiting for entire answer to be ready.",
74
58
  ),
75
- x_nucliadb_user: str = Header("", description="User Id", include_in_schema=False),
76
- ) -> Union[AskResponse, HTTPClientError]:
77
- if not has_feature(const.Features.ASK_YOUR_DOCUMENTS):
78
- return HTTPClientError(status_code=404, detail="Feature not yet available")
79
-
80
- try:
81
- return await resource_ask(kbid, rid, item, user_id=x_nucliadb_user)
82
- except ResourceNotFoundError:
83
- return HTTPClientError(status_code=404, detail="Resource not found")
84
- except LimitsExceededError as exc:
85
- return HTTPClientError(status_code=exc.status_code, detail=exc.detail)
86
- except SendToPredictError:
87
- return HTTPClientError(status_code=503, detail="Ask service not available")
88
- except InvalidQueryError as exc:
89
- return HTTPClientError(status_code=412, detail=str(exc))
59
+ ) -> Union[StreamingResponse, HTTPClientError, Response]:
60
+ return await create_ask_response(
61
+ kbid,
62
+ item,
63
+ x_nucliadb_user,
64
+ x_ndb_client,
65
+ x_forwarded_for,
66
+ x_synchronous,
67
+ resource=rid,
68
+ )
90
69
 
91
70
 
92
- async def resource_ask(
71
+ @api.post(
72
+ f"/{KB_PREFIX}/{{kbid}}/{RESOURCE_SLUG_PREFIX}/{{slug}}/ask",
73
+ status_code=200,
74
+ summary="Ask a resource (by slug)",
75
+ description="Ask questions to a resource",
76
+ tags=["Search"],
77
+ response_model=SyncAskResponse,
78
+ )
79
+ @requires(NucliaDBRoles.READER)
80
+ @version(1)
81
+ async def resource_ask_endpoint_by_slug(
82
+ request: Request,
93
83
  kbid: str,
94
- rid: str,
84
+ slug: str,
95
85
  item: AskRequest,
96
- user_id: str,
97
- ) -> AskResponse:
98
- blocks = await get_resource_text_blocks(kbid, rid)
99
-
100
- predict = get_predict()
101
- answer = await predict.ask_document(kbid, item.question, blocks, user_id)
102
-
103
- return AskResponse(answer=answer)
104
-
105
-
106
- async def get_resource_text_blocks(kbid: str, rid: str) -> TextBlocks:
107
- """
108
- Iterate over all fields of the resource and get its extracted text.
109
- Slice file extracted texts by paragraphs.
110
- """
111
- blocks = []
112
- driver = get_driver()
113
- storage = await get_storage(service_name=SERVICE_NAME)
114
- async with driver.transaction() as txn:
115
- kb = KnowledgeBox(txn, storage, kbid)
116
- orm_resource = await kb.get(rid)
117
- if orm_resource is None:
118
- raise ResourceNotFoundError()
119
-
120
- for field_type, field_id in await orm_resource.get_fields_ids():
121
- field_obj = await orm_resource.get_field(field_id, field_type, load=False)
122
- etxt = await field_obj.get_extracted_text()
123
- if etxt is None:
124
- logger.warning(
125
- f"Skipping field {field_id}, as it does not have extracted text yet!"
126
- )
127
- continue
128
-
129
- fcm = await field_obj.get_field_metadata()
130
- if fcm is None:
131
- logger.warning(f"Field metadata not found for {field_id}")
132
- blocks.append(get_field_blocks(etxt))
133
- else:
134
- blocks.append(get_field_blocks_split_by_paragraphs(etxt, fcm))
135
- return blocks
136
-
137
-
138
- def get_field_blocks_split_by_paragraphs(
139
- etxt: ExtractedText, fcm: FieldComputedMetadata
140
- ) -> list[str]:
141
- block = []
142
- for paragraph in fcm.metadata.paragraphs:
143
- block.append(etxt.text[paragraph.start : paragraph.end])
144
-
145
- for split, metadata in fcm.split_metadata.items():
146
- for split_paragraph in metadata.paragraphs:
147
- split_text = etxt.split_text.get(split)
148
- if split_text is None:
149
- logger.warning(f"Split {split} not found in extracted text")
150
- continue
151
- block.append(split_text[split_paragraph.start : split_paragraph.end])
152
- return block
86
+ x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
87
+ x_nucliadb_user: str = Header(""),
88
+ x_forwarded_for: str = Header(""),
89
+ x_synchronous: bool = Header(
90
+ False,
91
+ description="When set to true, outputs response as JSON in a non-streaming way. "
92
+ "This is slower and requires waiting for entire answer to be ready.",
93
+ ),
94
+ ) -> Union[StreamingResponse, HTTPClientError, Response]:
95
+ resource_id = await get_resource_uuid_by_slug(kbid, slug)
96
+ if resource_id is None:
97
+ return HTTPClientError(status_code=404, detail="Resource not found")
98
+ return await create_ask_response(
99
+ kbid,
100
+ item,
101
+ x_nucliadb_user,
102
+ x_ndb_client,
103
+ x_forwarded_for,
104
+ x_synchronous,
105
+ resource=resource_id,
106
+ )
153
107
 
154
108
 
155
- def get_field_blocks(etxt: ExtractedText) -> list[str]:
156
- blocks = []
157
- if etxt.text:
158
- blocks.append(etxt.text)
159
- for split_etxt in etxt.split_text.values():
160
- if split_etxt:
161
- blocks.append(split_etxt)
162
- return blocks
109
+ async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
110
+ async with datamanagers.with_ro_transaction() as txn:
111
+ return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)