nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -402
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +64 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +114 -113
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +25 -127
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,184 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from time import time
21
+ from typing import Optional, Union
22
+
23
+ from fastapi import Request, Response
24
+ from fastapi_versioning import version
25
+
26
+ from nucliadb.common.datamanagers.exceptions import KnowledgeBoxNotFound
27
+ from nucliadb.common.maindb.pg import PGDriver
28
+ from nucliadb.common.maindb.utils import get_driver
29
+ from nucliadb.models.responses import HTTPClientError
30
+ from nucliadb.search import logger
31
+ from nucliadb.search.api.v1.router import KB_PREFIX, api
32
+ from nucliadb.search.api.v1.utils import fastapi_query
33
+ from nucliadb.search.search import cache
34
+ from nucliadb.search.search.exceptions import InvalidQueryError
35
+ from nucliadb.search.search.merge import fetch_resources
36
+ from nucliadb.search.search.pgcatalog import pgcatalog_search
37
+ from nucliadb.search.search.query_parser.parser import parse_catalog
38
+ from nucliadb.search.search.utils import (
39
+ maybe_log_request_payload,
40
+ )
41
+ from nucliadb_models.common import FieldTypeName
42
+ from nucliadb_models.metadata import ResourceProcessingStatus
43
+ from nucliadb_models.resource import NucliaDBRoles
44
+ from nucliadb_models.search import (
45
+ CatalogRequest,
46
+ CatalogResponse,
47
+ KnowledgeboxSearchResults,
48
+ ResourceProperties,
49
+ SearchParamDefaults,
50
+ SortField,
51
+ SortOptions,
52
+ SortOrder,
53
+ )
54
+ from nucliadb_models.utils import DateTime
55
+ from nucliadb_utils.authentication import requires
56
+ from nucliadb_utils.exceptions import LimitsExceededError
57
+
58
+
59
+ @api.get(
60
+ f"/{KB_PREFIX}/{{kbid}}/catalog",
61
+ status_code=200,
62
+ summary="List resources of a Knowledge Box",
63
+ description="List resources of a Knowledge Box",
64
+ response_model=KnowledgeboxSearchResults,
65
+ response_model_exclude_unset=True,
66
+ tags=["Search"],
67
+ )
68
+ @requires(NucliaDBRoles.READER)
69
+ @version(1)
70
+ async def catalog_get(
71
+ request: Request,
72
+ response: Response,
73
+ kbid: str,
74
+ query: str = fastapi_query(SearchParamDefaults.query),
75
+ filters: list[str] = fastapi_query(SearchParamDefaults.filters),
76
+ faceted: list[str] = fastapi_query(SearchParamDefaults.faceted),
77
+ sort_field: SortField = fastapi_query(SearchParamDefaults.sort_field),
78
+ sort_limit: Optional[int] = fastapi_query(SearchParamDefaults.sort_limit),
79
+ sort_order: SortOrder = fastapi_query(SearchParamDefaults.sort_order),
80
+ page_number: int = fastapi_query(SearchParamDefaults.catalog_page_number),
81
+ page_size: int = fastapi_query(SearchParamDefaults.catalog_page_size),
82
+ shards: list[str] = fastapi_query(SearchParamDefaults.shards, deprecated=True),
83
+ with_status: Optional[ResourceProcessingStatus] = fastapi_query(
84
+ SearchParamDefaults.with_status, deprecated="Use filters instead"
85
+ ),
86
+ debug: bool = fastapi_query(SearchParamDefaults.debug, include_in_schema=False),
87
+ range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
88
+ range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
89
+ range_modification_start: Optional[DateTime] = fastapi_query(
90
+ SearchParamDefaults.range_modification_start
91
+ ),
92
+ range_modification_end: Optional[DateTime] = fastapi_query(
93
+ SearchParamDefaults.range_modification_end
94
+ ),
95
+ hidden: Optional[bool] = fastapi_query(SearchParamDefaults.hidden),
96
+ ) -> Union[KnowledgeboxSearchResults, HTTPClientError]:
97
+ item = CatalogRequest(
98
+ query=query,
99
+ filters=filters,
100
+ faceted=faceted,
101
+ page_number=page_number,
102
+ page_size=page_size,
103
+ shards=shards,
104
+ debug=debug,
105
+ with_status=with_status,
106
+ range_creation_start=range_creation_start,
107
+ range_creation_end=range_creation_end,
108
+ range_modification_start=range_modification_start,
109
+ range_modification_end=range_modification_end,
110
+ hidden=hidden,
111
+ )
112
+ if sort_field:
113
+ item.sort = SortOptions(field=sort_field, limit=sort_limit, order=sort_order)
114
+ return await catalog(kbid, item)
115
+
116
+
117
+ @api.post(
118
+ f"/{KB_PREFIX}/{{kbid}}/catalog",
119
+ status_code=200,
120
+ summary="List resources of a Knowledge Box",
121
+ description="List resources of a Knowledge Box",
122
+ response_model=KnowledgeboxSearchResults,
123
+ response_model_exclude_unset=True,
124
+ tags=["Search"],
125
+ )
126
+ @requires(NucliaDBRoles.READER)
127
+ @version(1)
128
+ async def catalog_post(
129
+ request: Request,
130
+ kbid: str,
131
+ item: CatalogRequest,
132
+ ) -> Union[CatalogResponse, HTTPClientError]:
133
+ return await catalog(kbid, item)
134
+
135
+
136
+ async def catalog(
137
+ kbid: str,
138
+ item: CatalogRequest,
139
+ ):
140
+ """
141
+ Catalog endpoint is a simplified version of the search endpoint, it only
142
+ returns bm25 results on titles and it does not support vector search.
143
+ It is useful for listing resources in a knowledge box.
144
+ """
145
+ if not pgcatalog_enabled(): # pragma: no cover
146
+ return HTTPClientError(status_code=501, detail="PG driver is needed for catalog search")
147
+
148
+ maybe_log_request_payload(kbid, "/catalog", item)
149
+ start_time = time()
150
+ try:
151
+ with cache.request_caches():
152
+ query_parser = parse_catalog(kbid, item)
153
+
154
+ catalog_results = CatalogResponse()
155
+ catalog_results.fulltext = await pgcatalog_search(query_parser)
156
+ catalog_results.resources = await fetch_resources(
157
+ resources=[r.rid for r in catalog_results.fulltext.results],
158
+ kbid=kbid,
159
+ show=[ResourceProperties.BASIC, ResourceProperties.ERRORS],
160
+ field_type_filter=list(FieldTypeName),
161
+ extracted=[],
162
+ )
163
+ return catalog_results
164
+ except InvalidQueryError as exc:
165
+ return HTTPClientError(status_code=412, detail=str(exc))
166
+ except KnowledgeBoxNotFound:
167
+ return HTTPClientError(status_code=404, detail="Knowledge Box not found")
168
+ except LimitsExceededError as exc:
169
+ return HTTPClientError(status_code=exc.status_code, detail=exc.detail)
170
+ finally:
171
+ duration = time() - start_time
172
+ if duration > 2: # pragma: no cover
173
+ logger.warning(
174
+ "Slow catalog request",
175
+ extra={
176
+ "kbid": kbid,
177
+ "duration": duration,
178
+ "query": item.model_dump_json(),
179
+ },
180
+ )
181
+
182
+
183
+ def pgcatalog_enabled():
184
+ return isinstance(get_driver(), PGDriver)
@@ -18,18 +18,18 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
-
22
21
  from fastapi import Header, Request, Response
23
22
  from fastapi_versioning import version
24
23
 
24
+ from nucliadb.common.models_utils import to_proto
25
25
  from nucliadb.models.responses import HTTPClientError
26
- from nucliadb.search import logger, predict
26
+ from nucliadb.search import logger
27
27
  from nucliadb.search.api.v1.router import KB_PREFIX, api
28
- from nucliadb.search.utilities import get_predict
29
28
  from nucliadb_models.resource import NucliaDBRoles
30
29
  from nucliadb_models.search import FeedbackRequest, NucliaDBClientType
31
30
  from nucliadb_telemetry import errors
32
31
  from nucliadb_utils.authentication import requires
32
+ from nucliadb_utils.utilities import get_audit
33
33
 
34
34
 
35
35
  @api.post(
@@ -51,28 +51,20 @@ async def send_feedback_endpoint(
51
51
  x_forwarded_for: str = Header(""),
52
52
  ):
53
53
  try:
54
- return await send_feedback(
55
- kbid, item, x_nucliadb_user, x_ndb_client, x_forwarded_for
56
- )
57
- except predict.ProxiedPredictAPIError as err:
58
- return HTTPClientError(
59
- status_code=err.status,
60
- detail=err.detail,
61
- )
54
+ audit = get_audit()
55
+ if audit is not None:
56
+ audit.feedback(
57
+ kbid=kbid,
58
+ user=x_nucliadb_user,
59
+ client_type=to_proto.client_type(x_ndb_client),
60
+ origin=x_forwarded_for,
61
+ learning_id=item.ident,
62
+ good=item.good,
63
+ task=to_proto.feedback_task(item.task),
64
+ feedback=item.feedback,
65
+ text_block_id=item.text_block_id,
66
+ )
62
67
  except Exception as ex:
63
68
  errors.capture_exception(ex)
64
69
  logger.exception("Unexpected error sending feedback", extra={"kbid": kbid})
65
70
  return HTTPClientError(status_code=500, detail=f"Internal server error")
66
-
67
-
68
- async def send_feedback(
69
- kbid: str,
70
- item: FeedbackRequest,
71
- x_nucliadb_user: str,
72
- x_ndb_client: NucliaDBClientType,
73
- x_forwarded_for: str,
74
- ):
75
- predict = get_predict()
76
- await predict.send_feedback(
77
- kbid, item, x_nucliadb_user, x_ndb_client, x_forwarded_for
78
- )
@@ -18,7 +18,6 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import json
21
- from datetime import datetime
22
21
  from typing import Optional, Union
23
22
 
24
23
  from fastapi import Body, Header, Query, Request, Response
@@ -31,20 +30,25 @@ from nucliadb.models.responses import HTTPClientError
31
30
  from nucliadb.search import predict
32
31
  from nucliadb.search.api.v1.router import KB_PREFIX, api
33
32
  from nucliadb.search.api.v1.utils import fastapi_query
33
+ from nucliadb.search.search import cache
34
34
  from nucliadb.search.search.exceptions import InvalidQueryError
35
35
  from nucliadb.search.search.find import find
36
- from nucliadb.search.search.utils import min_score_from_query_params
36
+ from nucliadb.search.search.utils import maybe_log_request_payload, min_score_from_query_params
37
37
  from nucliadb_models.common import FieldTypeName
38
38
  from nucliadb_models.resource import ExtractedDataTypeName, NucliaDBRoles
39
39
  from nucliadb_models.search import (
40
40
  FindRequest,
41
41
  KnowledgeboxFindResults,
42
42
  NucliaDBClientType,
43
+ RankFusionName,
44
+ Reranker,
45
+ RerankerName,
43
46
  ResourceProperties,
44
47
  SearchOptions,
45
48
  SearchParamDefaults,
46
49
  )
47
50
  from nucliadb_models.security import RequestSecurity
51
+ from nucliadb_models.utils import DateTime
48
52
  from nucliadb_utils.authentication import requires
49
53
  from nucliadb_utils.exceptions import LimitsExceededError
50
54
 
@@ -54,7 +58,7 @@ FIND_EXAMPLES = {
54
58
  description="Perform a hybrid search that will return text and semantic results matching the query",
55
59
  value={
56
60
  "query": "How can I be an effective product manager?",
57
- "features": [SearchOptions.PARAGRAPH, SearchOptions.VECTOR],
61
+ "features": [SearchOptions.KEYWORD, SearchOptions.SEMANTIC],
58
62
  },
59
63
  )
60
64
  }
@@ -78,39 +82,35 @@ async def find_knowledgebox(
78
82
  query: str = fastapi_query(SearchParamDefaults.query),
79
83
  fields: list[str] = fastapi_query(SearchParamDefaults.fields),
80
84
  filters: list[str] = fastapi_query(SearchParamDefaults.filters),
81
- page_number: int = fastapi_query(SearchParamDefaults.page_number),
82
- page_size: int = fastapi_query(SearchParamDefaults.page_size),
85
+ top_k: Optional[int] = fastapi_query(SearchParamDefaults.top_k),
83
86
  min_score: Optional[float] = Query(
84
87
  default=None,
85
- description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/docs/using/search/#minimum-score", # noqa: E501
88
+ description="Minimum similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
86
89
  deprecated=True,
87
90
  ),
88
91
  min_score_semantic: Optional[float] = Query(
89
92
  default=None,
90
- description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/docs/using/search/#minimum-score", # noqa: E501
93
+ description="Minimum semantic similarity score to filter vector index results. If not specified, the default minimum score of the semantic model associated to the Knowledge Box will be used. Check out the documentation for more information on how to use this parameter: https://docs.nuclia.dev/docs/rag/advanced/search#minimum-score", # noqa: E501
91
94
  ),
92
95
  min_score_bm25: float = Query(
93
96
  default=0,
94
97
  description="Minimum bm25 score to filter paragraph and document index results",
95
98
  ge=0,
96
99
  ),
97
- range_creation_start: Optional[datetime] = fastapi_query(
98
- SearchParamDefaults.range_creation_start
99
- ),
100
- range_creation_end: Optional[datetime] = fastapi_query(
101
- SearchParamDefaults.range_creation_end
102
- ),
103
- range_modification_start: Optional[datetime] = fastapi_query(
100
+ vectorset: Optional[str] = fastapi_query(SearchParamDefaults.vectorset),
101
+ range_creation_start: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_start),
102
+ range_creation_end: Optional[DateTime] = fastapi_query(SearchParamDefaults.range_creation_end),
103
+ range_modification_start: Optional[DateTime] = fastapi_query(
104
104
  SearchParamDefaults.range_modification_start
105
105
  ),
106
- range_modification_end: Optional[datetime] = fastapi_query(
106
+ range_modification_end: Optional[DateTime] = fastapi_query(
107
107
  SearchParamDefaults.range_modification_end
108
108
  ),
109
109
  features: list[SearchOptions] = fastapi_query(
110
110
  SearchParamDefaults.search_features,
111
111
  default=[
112
- SearchOptions.PARAGRAPH,
113
- SearchOptions.VECTOR,
112
+ SearchOptions.KEYWORD,
113
+ SearchOptions.SEMANTIC,
114
114
  ],
115
115
  ),
116
116
  debug: bool = fastapi_query(SearchParamDefaults.debug),
@@ -119,13 +119,14 @@ async def find_knowledgebox(
119
119
  field_type_filter: list[FieldTypeName] = fastapi_query(
120
120
  SearchParamDefaults.field_type_filter, alias="field_type"
121
121
  ),
122
- extracted: list[ExtractedDataTypeName] = fastapi_query(
123
- SearchParamDefaults.extracted
124
- ),
122
+ extracted: list[ExtractedDataTypeName] = fastapi_query(SearchParamDefaults.extracted),
125
123
  with_duplicates: bool = fastapi_query(SearchParamDefaults.with_duplicates),
126
124
  with_synonyms: bool = fastapi_query(SearchParamDefaults.with_synonyms),
127
125
  autofilter: bool = fastapi_query(SearchParamDefaults.autofilter),
128
126
  security_groups: list[str] = fastapi_query(SearchParamDefaults.security_groups),
127
+ show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden),
128
+ rank_fusion: RankFusionName = fastapi_query(SearchParamDefaults.rank_fusion),
129
+ reranker: Union[RerankerName, Reranker] = fastapi_query(SearchParamDefaults.reranker),
129
130
  x_ndb_client: NucliaDBClientType = Header(NucliaDBClientType.API),
130
131
  x_nucliadb_user: str = Header(""),
131
132
  x_forwarded_for: str = Header(""),
@@ -138,11 +139,9 @@ async def find_knowledgebox(
138
139
  query=query,
139
140
  fields=fields,
140
141
  filters=filters,
141
- page_number=page_number,
142
- page_size=page_size,
143
- min_score=min_score_from_query_params(
144
- min_score_bm25, min_score_semantic, min_score
145
- ),
142
+ top_k=top_k, # type: ignore
143
+ min_score=min_score_from_query_params(min_score_bm25, min_score_semantic, min_score),
144
+ vectorset=vectorset,
146
145
  range_creation_end=range_creation_end,
147
146
  range_creation_start=range_creation_start,
148
147
  range_modification_end=range_modification_end,
@@ -157,14 +156,15 @@ async def find_knowledgebox(
157
156
  with_synonyms=with_synonyms,
158
157
  autofilter=autofilter,
159
158
  security=security,
159
+ show_hidden=show_hidden,
160
+ rank_fusion=rank_fusion,
161
+ reranker=reranker,
160
162
  )
161
163
  except ValidationError as exc:
162
164
  detail = json.loads(exc.json())
163
165
  return HTTPClientError(status_code=422, detail=detail)
164
166
 
165
- return await _find_endpoint(
166
- response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
167
- )
167
+ return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
168
168
 
169
169
 
170
170
  @api.post(
@@ -187,9 +187,7 @@ async def find_post_knowledgebox(
187
187
  x_nucliadb_user: str = Header(""),
188
188
  x_forwarded_for: str = Header(""),
189
189
  ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
190
- return await _find_endpoint(
191
- response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
192
- )
190
+ return await _find_endpoint(response, kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for)
193
191
 
194
192
 
195
193
  async def _find_endpoint(
@@ -201,11 +199,13 @@ async def _find_endpoint(
201
199
  x_forwarded_for: str,
202
200
  ) -> Union[KnowledgeboxFindResults, HTTPClientError]:
203
201
  try:
204
- results, incomplete, _ = await find(
205
- kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
206
- )
207
- response.status_code = 206 if incomplete else 200
208
- return results
202
+ maybe_log_request_payload(kbid, "/find", item)
203
+ with cache.request_caches():
204
+ results, incomplete, _ = await find(
205
+ kbid, item, x_ndb_client, x_nucliadb_user, x_forwarded_for
206
+ )
207
+ response.status_code = 206 if incomplete else 200
208
+ return results
209
209
  except KnowledgeBoxNotFound:
210
210
  return HTTPClientError(status_code=404, detail="Knowledge Box not found")
211
211
  except LimitsExceededError as exc:
@@ -24,29 +24,35 @@ from fastapi import HTTPException, Request
24
24
  from fastapi_versioning import version
25
25
  from grpc import StatusCode as GrpcStatusCode
26
26
  from grpc.aio import AioRpcError
27
- from nucliadb_protos.noderesources_pb2 import Shard
28
- from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
29
- from nucliadb_protos.writer_pb2 import Shards
30
27
 
31
28
  from nucliadb.common import datamanagers
32
29
  from nucliadb.common.cluster.exceptions import ShardsNotFound
33
30
  from nucliadb.common.cluster.manager import choose_node
34
31
  from nucliadb.common.cluster.utils import get_shard_manager
32
+ from nucliadb.common.constants import AVG_PARAGRAPH_SIZE_BYTES
33
+ from nucliadb.common.counters import IndexCounts
34
+ from nucliadb.common.external_index_providers.manager import get_external_index_manager
35
+ from nucliadb.common.models_utils import from_proto
35
36
  from nucliadb.search import logger
36
37
  from nucliadb.search.api.v1.router import KB_PREFIX, api
37
38
  from nucliadb.search.api.v1.utils import fastapi_query
38
39
  from nucliadb.search.search.shards import get_shard
39
40
  from nucliadb.search.settings import settings
41
+ from nucliadb_models.internal.shards import KnowledgeboxShards
40
42
  from nucliadb_models.resource import NucliaDBRoles
41
43
  from nucliadb_models.search import (
42
44
  KnowledgeboxCounters,
43
- KnowledgeboxShards,
44
45
  SearchParamDefaults,
45
46
  )
47
+ from nucliadb_protos.noderesources_pb2 import Shard
48
+ from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
49
+ from nucliadb_protos.writer_pb2 import Shards
46
50
  from nucliadb_telemetry import errors
51
+ from nucliadb_utils import const
47
52
  from nucliadb_utils.authentication import requires, requires_one
53
+ from nucliadb_utils.utilities import has_feature
48
54
 
49
- AVG_PARAGRAPH_SIZE_BYTES = 10_000
55
+ MAX_PARAGRAPHS_FOR_SMALL_KB = 250_000
50
56
 
51
57
 
52
58
  @api.get(
@@ -68,7 +74,7 @@ async def knowledgebox_shards(request: Request, kbid: str) -> KnowledgeboxShards
68
74
  status_code=404,
69
75
  detail="The knowledgebox or its shards configuration is missing",
70
76
  )
71
- return KnowledgeboxShards.from_message(shards)
77
+ return from_proto.kb_shards(shards)
72
78
 
73
79
 
74
80
  @api.get(
@@ -86,21 +92,83 @@ async def knowledgebox_counters(
86
92
  kbid: str,
87
93
  debug: bool = fastapi_query(SearchParamDefaults.debug),
88
94
  ) -> KnowledgeboxCounters:
89
- shard_manager = get_shard_manager()
90
-
91
95
  try:
92
- shard_groups: list[PBShardObject] = await shard_manager.get_shards_by_kbid(kbid)
96
+ return await _kb_counters(kbid, debug=debug)
93
97
  except ShardsNotFound:
94
98
  raise HTTPException(
95
99
  status_code=404,
96
100
  detail="The knowledgebox or its shards configuration is missing",
97
101
  )
98
102
 
103
+
104
+ async def _kb_counters(
105
+ kbid: str,
106
+ debug: bool = False,
107
+ ) -> KnowledgeboxCounters:
108
+ """
109
+ Resources count is calculated from maindb and cached
110
+ Field count is calculated from the index node cluster
111
+ Paragraphs and Sentences count is calculated from the index node cluster or the external index provider.
112
+ Index size is estimated from the paragraphs count.
113
+ """
114
+ counters = KnowledgeboxCounters(
115
+ resources=0,
116
+ paragraphs=0,
117
+ fields=0,
118
+ sentences=0,
119
+ index_size=0,
120
+ )
121
+ external_index_manager = await get_external_index_manager(kbid)
122
+ if external_index_manager is not None:
123
+ index_counts = await external_index_manager.get_index_counts()
124
+ counters.paragraphs = index_counts.paragraphs
125
+ counters.sentences = index_counts.sentences
126
+ is_small_kb = index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
127
+ resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
128
+ # TODO: Find a way to query the fields count from the external index provider or use the catalog
129
+ counters.resources = counters.fields = resource_count
130
+ else:
131
+ node_index_counts, queried_shards = await get_node_index_counts(kbid)
132
+ counters.fields = node_index_counts.fields
133
+ counters.paragraphs = node_index_counts.paragraphs
134
+ counters.sentences = node_index_counts.sentences
135
+ is_small_kb = node_index_counts.paragraphs < MAX_PARAGRAPHS_FOR_SMALL_KB
136
+ resource_count = await get_resources_count(kbid, force_calculate=is_small_kb)
137
+ counters.resources = resource_count
138
+ counters.index_size = counters.paragraphs * AVG_PARAGRAPH_SIZE_BYTES
139
+ if debug and queried_shards is not None:
140
+ counters.shards = queried_shards
141
+ return counters
142
+
143
+
144
+ async def get_resources_count(kbid: str, force_calculate: bool = False) -> int:
145
+ async with datamanagers.with_ro_transaction() as txn:
146
+ if force_calculate:
147
+ # For small kbs, this is faster and more up to date
148
+ resource_count = await datamanagers.resources.calculate_number_of_resources(txn, kbid=kbid)
149
+ else:
150
+ resource_count = await datamanagers.resources.get_number_of_resources(txn, kbid=kbid)
151
+ if resource_count == -1:
152
+ # WARNING: standalone, this value will never be cached
153
+ resource_count = await datamanagers.resources.calculate_number_of_resources(
154
+ txn, kbid=kbid
155
+ )
156
+ return resource_count
157
+
158
+
159
+ async def get_node_index_counts(kbid: str) -> tuple[IndexCounts, list[str]]:
160
+ """
161
+ Get the index counts for a knowledgebox that has an index in the index node cluster.
162
+ """
163
+ shard_manager = get_shard_manager()
164
+ shard_groups: list[PBShardObject] = await shard_manager.get_shards_by_kbid(kbid)
99
165
  ops = []
100
166
  queried_shards = []
101
167
  for shard_object in shard_groups:
102
168
  try:
103
- node, shard_id = choose_node(shard_object)
169
+ node, shard_id = choose_node(
170
+ shard_object, use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid})
171
+ )
104
172
  except KeyError:
105
173
  raise HTTPException(
106
174
  status_code=500,
@@ -121,7 +189,7 @@ async def knowledgebox_counters(
121
189
  )
122
190
 
123
191
  try:
124
- results: Optional[list[Shard]] = await asyncio.wait_for( # type: ignore
192
+ results: Optional[list[Shard]] = await asyncio.wait_for(
125
193
  asyncio.gather(*ops, return_exceptions=True), # type: ignore
126
194
  timeout=settings.search_timeout,
127
195
  )
@@ -138,56 +206,17 @@ async def knowledgebox_counters(
138
206
  if results is None:
139
207
  raise HTTPException(status_code=503, detail=f"No shards found")
140
208
 
141
- field_count = 0
142
- paragraph_count = 0
143
- sentence_count = 0
144
-
209
+ counts = IndexCounts(
210
+ fields=0,
211
+ paragraphs=0,
212
+ sentences=0,
213
+ )
145
214
  for shard in results:
146
215
  if isinstance(shard, Exception):
147
216
  logger.error("Error getting shard info", exc_info=shard)
148
217
  errors.capture_exception(shard)
149
- raise HTTPException(
150
- status_code=500, detail=f"Error while geting shard data"
151
- )
152
-
153
- field_count += shard.fields
154
- paragraph_count += shard.paragraphs
155
- sentence_count += shard.sentences
156
-
157
- async with datamanagers.with_transaction() as txn:
158
- try:
159
- if len(shard_groups) <= 1:
160
- # for smaller kbs, this is faster and more up to date
161
- resource_count = (
162
- await datamanagers.resources.calculate_number_of_resources(
163
- txn, kbid=kbid
164
- )
165
- )
166
- else:
167
- resource_count = await datamanagers.resources.get_number_of_resources(
168
- txn, kbid=kbid
169
- )
170
- if resource_count == -1:
171
- # WARNING: standalone, this value will never be cached
172
- resource_count = (
173
- await datamanagers.resources.calculate_number_of_resources(
174
- txn, kbid=kbid
175
- )
176
- )
177
- except Exception as exc:
178
- errors.capture_exception(exc)
179
- raise HTTPException(
180
- status_code=500, detail="Couldn't retrieve counters right now"
181
- )
182
-
183
- counters = KnowledgeboxCounters(
184
- resources=resource_count,
185
- paragraphs=paragraph_count,
186
- fields=field_count,
187
- sentences=sentence_count,
188
- index_size=paragraph_count * AVG_PARAGRAPH_SIZE_BYTES,
189
- )
190
-
191
- if debug:
192
- counters.shards = queried_shards
193
- return counters
218
+ raise HTTPException(status_code=500, detail=f"Error while geting shard data")
219
+ counts.fields += shard.fields
220
+ counts.paragraphs += shard.paragraphs
221
+ counts.sentences += shard.sentences
222
+ return counts, queried_shards
@@ -40,8 +40,6 @@ from ..ask import create_ask_response
40
40
  description="Ask questions to a resource",
41
41
  tags=["Search"],
42
42
  response_model=SyncAskResponse,
43
- # Add this to OpenAPI schema when endpoint is not in beta anymore
44
- include_in_schema=False,
45
43
  )
46
44
  @requires(NucliaDBRoles.READER)
47
45
  @version(1)
@@ -77,8 +75,6 @@ async def resource_ask_endpoint_by_uuid(
77
75
  description="Ask questions to a resource",
78
76
  tags=["Search"],
79
77
  response_model=SyncAskResponse,
80
- # Add this to OpenAPI schema when endpoint is not in beta anymore
81
- include_in_schema=False,
82
78
  )
83
79
  @requires(NucliaDBRoles.READER)
84
80
  @version(1)
@@ -111,7 +107,5 @@ async def resource_ask_endpoint_by_slug(
111
107
 
112
108
 
113
109
  async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
114
- async with datamanagers.with_transaction() as txn:
115
- return await datamanagers.resources.get_resource_uuid_from_slug(
116
- txn, kbid=kbid, slug=slug
117
- )
110
+ async with datamanagers.with_ro_transaction() as txn:
111
+ return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)