nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -22,19 +22,10 @@ import datetime
22
22
  import math
23
23
  from typing import Any, Optional, Set, Union
24
24
 
25
- from nucliadb_protos.nodereader_pb2 import (
26
- DocumentResult,
27
- DocumentScored,
28
- DocumentSearchResponse,
29
- EntitiesSubgraphRequest,
30
- ParagraphResult,
31
- ParagraphSearchResponse,
32
- RelationSearchResponse,
33
- SearchResponse,
34
- SuggestResponse,
35
- VectorSearchResponse,
36
- )
37
-
25
+ from nucliadb.common.ids import FieldId, ParagraphId
26
+ from nucliadb.common.models_utils.from_proto import RelationTypePbMap
27
+ from nucliadb.search.search import cache
28
+ from nucliadb.search.search.cut import cut_page
38
29
  from nucliadb.search.search.fetch import (
39
30
  fetch_resources,
40
31
  get_labels_paragraph,
@@ -43,11 +34,11 @@ from nucliadb.search.search.fetch import (
43
34
  )
44
35
  from nucliadb_models.common import FieldTypeName
45
36
  from nucliadb_models.labels import translate_system_to_alias_label
46
- from nucliadb_models.metadata import RelationTypePbMap
47
37
  from nucliadb_models.resource import ExtractedDataTypeName
48
38
  from nucliadb_models.search import (
49
39
  DirectionalRelation,
50
40
  EntitySubgraph,
41
+ EntityType,
51
42
  KnowledgeboxSearchResults,
52
43
  KnowledgeboxSuggestResults,
53
44
  MinScore,
@@ -56,7 +47,6 @@ from nucliadb_models.search import (
56
47
  RelatedEntities,
57
48
  RelatedEntity,
58
49
  RelationDirection,
59
- RelationNodeTypeMap,
60
50
  Relations,
61
51
  ResourceProperties,
62
52
  ResourceResult,
@@ -69,10 +59,22 @@ from nucliadb_models.search import (
69
59
  SortOrder,
70
60
  TextPosition,
71
61
  )
62
+ from nucliadb_protos.nodereader_pb2 import (
63
+ DocumentResult,
64
+ DocumentScored,
65
+ DocumentSearchResponse,
66
+ EntitiesSubgraphRequest,
67
+ ParagraphResult,
68
+ ParagraphSearchResponse,
69
+ RelationSearchResponse,
70
+ SearchResponse,
71
+ SuggestResponse,
72
+ VectorSearchResponse,
73
+ )
74
+ from nucliadb_protos.utils_pb2 import RelationNode
72
75
 
73
- from .cache import get_resource_cache, get_resource_from_cache
74
76
  from .metrics import merge_observer
75
- from .paragraphs import ExtractedTextCache, get_paragraph_text, get_text_sentence
77
+ from .paragraphs import get_paragraph_text, get_text_sentence
76
78
 
77
79
  Bm25Score = tuple[float, float]
78
80
  TimestampScore = datetime.datetime
@@ -80,6 +82,15 @@ TitleScore = str
80
82
  SortValue = Union[Bm25Score, TimestampScore, TitleScore]
81
83
 
82
84
 
85
+ def relation_node_type_to_entity_type(node_type: RelationNode.NodeType.ValueType) -> EntityType:
86
+ return {
87
+ RelationNode.NodeType.ENTITY: EntityType.ENTITY,
88
+ RelationNode.NodeType.LABEL: EntityType.LABEL,
89
+ RelationNode.NodeType.RESOURCE: EntityType.RESOURCE,
90
+ RelationNode.NodeType.USER: EntityType.USER,
91
+ }[node_type]
92
+
93
+
83
94
  def sort_results_by_score(results: Union[list[ParagraphResult], list[DocumentResult]]):
84
95
  results.sort(key=lambda x: (x.score.bm25, x.score.booster), reverse=True)
85
96
 
@@ -97,7 +108,7 @@ async def get_sort_value(
97
108
  return (item.score.bm25, item.score.booster)
98
109
 
99
110
  score: Any = None
100
- resource = await get_resource_from_cache(kbid, item.uuid)
111
+ resource = await cache.get_resource(kbid, item.uuid)
101
112
  if resource is None:
102
113
  return score
103
114
 
@@ -118,8 +129,7 @@ async def get_sort_value(
118
129
  async def merge_documents_results(
119
130
  document_responses: list[DocumentSearchResponse],
120
131
  resources: list[str],
121
- count: int,
122
- page: int,
132
+ top_k: int,
123
133
  kbid: str,
124
134
  sort: SortOptions,
125
135
  min_score: float,
@@ -148,15 +158,9 @@ async def merge_documents_results(
148
158
  raw_resource_list.append((result, sort_value))
149
159
  total += document_response.total
150
160
 
151
- skip = page * count
152
- end = skip + count
153
- length = len(raw_resource_list)
154
-
155
- if length > end:
156
- next_page = True
157
-
158
- # We need to cut first and then sort, otherwise pagination will be wrong if the order is DESC
159
- raw_resource_list = raw_resource_list[min(skip, length) : min(end, length)]
161
+ # We need to cut first and then sort, otherwise the page will be wrong if the order is DESC
162
+ raw_resource_list, has_more = cut_page(raw_resource_list, top_k)
163
+ next_page = next_page or has_more
160
164
  raw_resource_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
161
165
 
162
166
  result_resource_list: list[ResourceResult] = []
@@ -181,8 +185,8 @@ async def merge_documents_results(
181
185
  results=result_resource_list,
182
186
  query=query,
183
187
  total=total,
184
- page_number=page,
185
- page_size=count,
188
+ page_number=0, # Bw/c with pagination
189
+ page_size=top_k,
186
190
  next_page=next_page,
187
191
  min_score=min_score,
188
192
  )
@@ -207,65 +211,58 @@ async def merge_suggest_paragraph_results(
207
211
  if len(suggest_responses) > 1:
208
212
  sort_results_by_score(raw_paragraph_list)
209
213
 
210
- rcache = get_resource_cache(clear=True)
211
- etcache = ExtractedTextCache()
212
- try:
213
- result_paragraph_list: list[Paragraph] = []
214
- for result in raw_paragraph_list[:10]:
215
- _, field_type, field = result.field.split("/")
216
- text = await get_paragraph_text(
217
- kbid=kbid,
218
- rid=result.uuid,
219
- field=result.field,
220
- start=result.start,
221
- end=result.end,
222
- split=result.split,
223
- highlight=highlight,
224
- ematches=ematches, # type: ignore
225
- matches=result.matches, # type: ignore
226
- extracted_text_cache=etcache,
227
- )
228
- labels = await get_labels_paragraph(result, kbid)
229
- new_paragraph = Paragraph(
230
- score=result.score.bm25,
231
- rid=result.uuid,
232
- field_type=field_type,
233
- field=field,
234
- text=text,
235
- labels=labels,
236
- position=TextPosition(
237
- index=result.metadata.position.index,
238
- start=result.metadata.position.start,
239
- end=result.metadata.position.end,
240
- page_number=result.metadata.position.page_number,
214
+ result_paragraph_list: list[Paragraph] = []
215
+ for result in raw_paragraph_list[:10]:
216
+ _, field_type, field = result.field.split("/")
217
+ text = await get_paragraph_text(
218
+ kbid=kbid,
219
+ paragraph_id=ParagraphId(
220
+ field_id=FieldId(
221
+ rid=result.uuid,
222
+ type=field_type,
223
+ key=field,
224
+ subfield_id=result.split,
241
225
  ),
242
- )
243
- if len(result.metadata.position.start_seconds) or len(
244
- result.metadata.position.end_seconds
245
- ):
246
- new_paragraph.start_seconds = list(
247
- result.metadata.position.start_seconds
248
- )
249
- new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
250
- else:
251
- # TODO: Remove once we are sure all data has been migrated!
252
- seconds_positions = await get_seconds_paragraph(result, kbid)
253
- if seconds_positions is not None:
254
- new_paragraph.start_seconds = seconds_positions[0]
255
- new_paragraph.end_seconds = seconds_positions[1]
256
- result_paragraph_list.append(new_paragraph)
257
- return Paragraphs(results=result_paragraph_list, query=query, min_score=0)
258
- finally:
259
- etcache.clear()
260
- rcache.clear()
226
+ paragraph_start=result.start,
227
+ paragraph_end=result.end,
228
+ ),
229
+ highlight=highlight,
230
+ ematches=ematches, # type: ignore
231
+ matches=result.matches, # type: ignore
232
+ )
233
+ labels = await get_labels_paragraph(result, kbid)
234
+ new_paragraph = Paragraph(
235
+ score=result.score.bm25,
236
+ rid=result.uuid,
237
+ field_type=field_type,
238
+ field=field,
239
+ text=text,
240
+ labels=labels,
241
+ position=TextPosition(
242
+ index=result.metadata.position.index,
243
+ start=result.metadata.position.start,
244
+ end=result.metadata.position.end,
245
+ page_number=result.metadata.position.page_number,
246
+ ),
247
+ )
248
+ if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
249
+ new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
250
+ new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
251
+ else:
252
+ # TODO: Remove once we are sure all data has been migrated!
253
+ seconds_positions = await get_seconds_paragraph(result, kbid)
254
+ if seconds_positions is not None:
255
+ new_paragraph.start_seconds = seconds_positions[0]
256
+ new_paragraph.end_seconds = seconds_positions[1]
257
+ result_paragraph_list.append(new_paragraph)
258
+ return Paragraphs(results=result_paragraph_list, query=query, min_score=0)
261
259
 
262
260
 
263
261
  async def merge_vectors_results(
264
262
  vector_responses: list[VectorSearchResponse],
265
263
  resources: list[str],
266
264
  kbid: str,
267
- count: int,
268
- page: int,
265
+ top_k: int,
269
266
  min_score: Optional[float] = None,
270
267
  ):
271
268
  facets: dict[str, Any] = {}
@@ -282,12 +279,10 @@ async def merge_vectors_results(
282
279
  if len(vector_responses) > 1:
283
280
  raw_vectors_list.sort(key=lambda x: x.score, reverse=True)
284
281
 
285
- skip = page * count
286
- end_element = skip + count
287
- length = len(raw_vectors_list)
282
+ raw_vectors_list, _ = cut_page(raw_vectors_list, top_k)
288
283
 
289
284
  result_sentence_list: list[Sentence] = []
290
- for result in raw_vectors_list[min(skip, length) : min(end_element, length)]:
285
+ for result in raw_vectors_list:
291
286
  id_count = result.doc_id.id.count("/")
292
287
  if id_count == 4:
293
288
  rid, field_type, field, index, position = result.doc_id.id.split("/")
@@ -335,8 +330,8 @@ async def merge_vectors_results(
335
330
  return Sentences(
336
331
  results=result_sentence_list,
337
332
  facets=facets,
338
- page_number=page,
339
- page_size=count,
333
+ page_number=0, # Bw/c with pagination
334
+ page_size=top_k,
340
335
  min_score=round(min_score or 0, ndigits=3),
341
336
  )
342
337
 
@@ -345,12 +340,11 @@ async def merge_paragraph_results(
345
340
  paragraph_responses: list[ParagraphSearchResponse],
346
341
  resources: list[str],
347
342
  kbid: str,
348
- count: int,
349
- page: int,
343
+ top_k: int,
350
344
  highlight: bool,
351
345
  sort: SortOptions,
352
346
  min_score: float,
353
- ):
347
+ ) -> Paragraphs:
354
348
  raw_paragraph_list: list[tuple[ParagraphResult, SortValue]] = []
355
349
  facets: dict[str, Any] = {}
356
350
  query = None
@@ -380,76 +374,68 @@ async def merge_paragraph_results(
380
374
 
381
375
  raw_paragraph_list.sort(key=lambda x: x[1], reverse=(sort.order == SortOrder.DESC))
382
376
 
383
- skip = page * count
384
- end = skip + count
385
- length = len(raw_paragraph_list)
386
-
387
- if length > end:
388
- next_page = True
377
+ raw_paragraph_list, has_more = cut_page(raw_paragraph_list, top_k)
378
+ next_page = next_page or has_more
389
379
 
390
380
  result_paragraph_list: list[Paragraph] = []
391
- etcache = ExtractedTextCache()
392
- try:
393
- for result, _ in raw_paragraph_list[min(skip, length) : min(end, length)]:
394
- _, field_type, field = result.field.split("/")
395
- text = await get_paragraph_text(
396
- kbid=kbid,
397
- rid=result.uuid,
398
- field=result.field,
399
- start=result.start,
400
- end=result.end,
401
- split=result.split,
402
- highlight=highlight,
403
- ematches=ematches,
404
- matches=result.matches, # type: ignore
405
- extracted_text_cache=etcache,
406
- )
407
- labels = await get_labels_paragraph(result, kbid)
408
- fuzzy_result = len(result.matches) > 0
409
- new_paragraph = Paragraph(
410
- score=result.score.bm25,
411
- rid=result.uuid,
412
- field_type=field_type,
413
- field=field,
414
- text=text,
415
- labels=labels,
416
- position=TextPosition(
417
- index=result.metadata.position.index,
418
- start=result.metadata.position.start,
419
- end=result.metadata.position.end,
420
- page_number=result.metadata.position.page_number,
381
+ for result, _ in raw_paragraph_list:
382
+ _, field_type, field = result.field.split("/")
383
+ text = await get_paragraph_text(
384
+ kbid=kbid,
385
+ paragraph_id=ParagraphId(
386
+ field_id=FieldId(
387
+ rid=result.uuid,
388
+ type=field_type,
389
+ key=field,
390
+ subfield_id=result.split,
421
391
  ),
422
- fuzzy_result=fuzzy_result,
423
- )
424
- if len(result.metadata.position.start_seconds) or len(
425
- result.metadata.position.end_seconds
426
- ):
427
- new_paragraph.start_seconds = list(
428
- result.metadata.position.start_seconds
429
- )
430
- new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
431
- else:
432
- # TODO: Remove once we are sure all data has been migrated!
433
- seconds_positions = await get_seconds_paragraph(result, kbid)
434
- if seconds_positions is not None:
435
- new_paragraph.start_seconds = seconds_positions[0]
436
- new_paragraph.end_seconds = seconds_positions[1]
437
-
438
- result_paragraph_list.append(new_paragraph)
439
- if new_paragraph.rid not in resources:
440
- resources.append(new_paragraph.rid)
441
- return Paragraphs(
442
- results=result_paragraph_list,
443
- facets=facets,
444
- query=query,
445
- total=total,
446
- page_number=page,
447
- page_size=count,
448
- next_page=next_page,
449
- min_score=min_score,
392
+ paragraph_start=result.start,
393
+ paragraph_end=result.end,
394
+ ),
395
+ highlight=highlight,
396
+ ematches=ematches,
397
+ matches=result.matches, # type: ignore
450
398
  )
451
- finally:
452
- etcache.clear()
399
+ labels = await get_labels_paragraph(result, kbid)
400
+ fuzzy_result = len(result.matches) > 0
401
+ new_paragraph = Paragraph(
402
+ score=result.score.bm25,
403
+ rid=result.uuid,
404
+ field_type=field_type,
405
+ field=field,
406
+ text=text,
407
+ labels=labels,
408
+ position=TextPosition(
409
+ index=result.metadata.position.index,
410
+ start=result.metadata.position.start,
411
+ end=result.metadata.position.end,
412
+ page_number=result.metadata.position.page_number,
413
+ ),
414
+ fuzzy_result=fuzzy_result,
415
+ )
416
+ if len(result.metadata.position.start_seconds) or len(result.metadata.position.end_seconds):
417
+ new_paragraph.start_seconds = list(result.metadata.position.start_seconds)
418
+ new_paragraph.end_seconds = list(result.metadata.position.end_seconds)
419
+ else:
420
+ # TODO: Remove once we are sure all data has been migrated!
421
+ seconds_positions = await get_seconds_paragraph(result, kbid)
422
+ if seconds_positions is not None:
423
+ new_paragraph.start_seconds = seconds_positions[0]
424
+ new_paragraph.end_seconds = seconds_positions[1]
425
+
426
+ result_paragraph_list.append(new_paragraph)
427
+ if new_paragraph.rid not in resources:
428
+ resources.append(new_paragraph.rid)
429
+ return Paragraphs(
430
+ results=result_paragraph_list,
431
+ facets=facets,
432
+ query=query,
433
+ total=total,
434
+ page_number=0, # Bw/c with pagination
435
+ page_size=top_k,
436
+ next_page=next_page,
437
+ min_score=min_score,
438
+ )
453
439
 
454
440
 
455
441
  @merge_observer.wrap({"type": "merge_relations"})
@@ -458,9 +444,7 @@ async def merge_relations_results(
458
444
  query: EntitiesSubgraphRequest,
459
445
  ) -> Relations:
460
446
  loop = asyncio.get_event_loop()
461
- return await loop.run_in_executor(
462
- None, _merge_relations_results, relations_responses, query
463
- )
447
+ return await loop.run_in_executor(None, _merge_relations_results, relations_responses, query)
464
448
 
465
449
 
466
450
  def _merge_relations_results(
@@ -483,7 +467,7 @@ def _merge_relations_results(
483
467
  relations.entities[origin.value].related_to.append(
484
468
  DirectionalRelation(
485
469
  entity=destination.value,
486
- entity_type=RelationNodeTypeMap[destination.ntype],
470
+ entity_type=relation_node_type_to_entity_type(destination.ntype),
487
471
  relation=relation_type,
488
472
  relation_label=relation_label,
489
473
  direction=RelationDirection.OUT,
@@ -493,7 +477,7 @@ def _merge_relations_results(
493
477
  relations.entities[destination.value].related_to.append(
494
478
  DirectionalRelation(
495
479
  entity=origin.value,
496
- entity_type=RelationNodeTypeMap[origin.ntype],
480
+ entity_type=relation_node_type_to_entity_type(origin.ntype),
497
481
  relation=relation_type,
498
482
  relation_label=relation_label,
499
483
  direction=RelationDirection.IN,
@@ -506,8 +490,7 @@ def _merge_relations_results(
506
490
  @merge_observer.wrap({"type": "merge"})
507
491
  async def merge_results(
508
492
  search_responses: list[SearchResponse],
509
- count: int,
510
- page: int,
493
+ top_k: int,
511
494
  kbid: str,
512
495
  show: list[ResourceProperties],
513
496
  field_type_filter: list[FieldTypeName],
@@ -530,77 +513,59 @@ async def merge_results(
530
513
 
531
514
  api_results = KnowledgeboxSearchResults()
532
515
 
533
- rcache = get_resource_cache(clear=True)
534
- try:
535
- resources: list[str] = list()
536
- api_results.fulltext = await merge_documents_results(
537
- documents, resources, count, page, kbid, sort, min_score=min_score.bm25
538
- )
516
+ resources: list[str] = list()
517
+ api_results.fulltext = await merge_documents_results(
518
+ documents, resources, top_k, kbid, sort, min_score=min_score.bm25
519
+ )
539
520
 
540
- api_results.paragraphs = await merge_paragraph_results(
541
- paragraphs,
542
- resources,
543
- kbid,
544
- count,
545
- page,
546
- highlight,
547
- sort,
548
- min_score=min_score.bm25,
549
- )
521
+ api_results.paragraphs = await merge_paragraph_results(
522
+ paragraphs,
523
+ resources,
524
+ kbid,
525
+ top_k,
526
+ highlight,
527
+ sort,
528
+ min_score=min_score.bm25,
529
+ )
550
530
 
551
- api_results.sentences = await merge_vectors_results(
552
- vectors, resources, kbid, count, page, min_score=min_score.semantic
553
- )
531
+ api_results.sentences = await merge_vectors_results(
532
+ vectors, resources, kbid, top_k, min_score=min_score.semantic
533
+ )
554
534
 
555
- api_results.relations = await merge_relations_results(
556
- relations, requested_relations
557
- )
535
+ api_results.relations = await merge_relations_results(relations, requested_relations)
558
536
 
559
- api_results.resources = await fetch_resources(
560
- resources, kbid, show, field_type_filter, extracted
561
- )
562
- return api_results
563
- finally:
564
- rcache.clear()
537
+ api_results.resources = await fetch_resources(resources, kbid, show, field_type_filter, extracted)
538
+ return api_results
565
539
 
566
540
 
async def merge_paragraphs_results(
    responses: list[SearchResponse],
    top_k: int,
    kbid: str,
    highlight_split: bool,
    min_score: float,
) -> ResourceSearchResults:
    """Merge the paragraph part of several search responses for one resource.

    Only the ``paragraph`` member of each response is used. The merge itself is
    delegated to ``merge_paragraph_results``, always sorted by score in
    descending order and without a sort limit.

    Args:
        responses: search responses whose ``paragraph`` results are merged.
        top_k: forwarded to ``merge_paragraph_results`` as the result cut-off.
        kbid: knowledge box id, forwarded to ``merge_paragraph_results``.
        highlight_split: forwarded as the ``highlight`` flag.
        min_score: forwarded as the minimum score of the merged results.

    Returns:
        A ``ResourceSearchResults`` whose ``paragraphs`` attribute holds the
        merged paragraphs.
    """
    paragraph_responses = [response.paragraph for response in responses]

    # merge_paragraph_results appends the rid of every merged paragraph to this
    # list; it is required by that call but not used afterwards here.
    found_rids: list[str] = []

    by_score_desc = SortOptions(
        field=SortField.SCORE,
        order=SortOrder.DESC,
        limit=None,
    )

    api_results = ResourceSearchResults()
    api_results.paragraphs = await merge_paragraph_results(
        paragraph_responses,
        found_rids,
        kbid,
        top_k,
        highlight=highlight_split,
        sort=by_score_desc,
        min_score=min_score,
    )
    return api_results
604
569
 
605
570
 
606
571
  async def merge_suggest_entities_results(
@@ -609,8 +574,7 @@ async def merge_suggest_entities_results(
609
574
  unique_entities: Set[RelatedEntity] = set()
610
575
  for response in suggest_responses:
611
576
  response_entities = (
612
- RelatedEntity(family=e.subtype, value=e.value)
613
- for e in response.entity_results.nodes
577
+ RelatedEntity(family=e.subtype, value=e.value) for e in response.entity_results.nodes
614
578
  )
615
579
  unique_entities.update(response_entities)
616
580
 
@@ -17,10 +17,81 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import contextlib
21
+ import time
22
+ from typing import Optional
23
+
20
24
  from nucliadb_telemetry import metrics
21
25
 
22
26
  merge_observer = metrics.Observer("merge_results", labels={"type": ""})
23
27
  node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
24
- query_parse_dependency_observer = metrics.Observer(
25
- "query_parse_dependency", labels={"type": ""}
28
+ query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
29
+
30
# Bucket boundaries shared by the two histograms below. The values observed
# into them are differences of time.monotonic() readings, i.e. seconds.
# fmt: off
buckets = [
    0.005, 0.01, 0.025, 0.05, 0.075,  # < 100 ms
    0.1, 0.25, 0.5, 0.75,             # < 1 s
    1.0, 2.5, 5.0, 7.5,               # < 10 s
    10.0, 30.0, 60.0,                 # up to one minute
    metrics.INF,                      # catch-all
]
# fmt: on

# Time elapsed until the first generative chunk is yielded.
generative_first_chunk_histogram = metrics.Histogram(name="generative_first_chunk", buckets=buckets)

# Duration of each individual RAG step, labelled with the step name.
rag_histogram = metrics.Histogram(name="rag", labels={"step": ""}, buckets=buckets)
59
+
60
+
61
class RAGMetrics:
    """Collect timings for the steps of a single RAG request.

    All timestamps come from ``time.monotonic()``, so every duration exposed
    by this class is a number of seconds. Step durations are also reported to
    ``rag_histogram`` (labelled with the step name) and the time to the first
    generated chunk to ``generative_first_chunk_histogram``.
    """

    def __init__(self) -> None:
        # Reference point for the time-to-first-chunk measurement.
        self.global_start = time.monotonic()
        self._start_times: dict[str, float] = {}
        self._end_times: dict[str, float] = {}
        # Stays None until record_first_chunk_yielded() is called.
        self.first_chunk_yielded_at: Optional[float] = None

    @contextlib.contextmanager
    def time(self, step: str):
        """Context manager that times the enclosed block as ``step``.

        The end of the step is recorded (and reported to ``rag_histogram``)
        even when the block raises. Timing the same step name again
        overwrites the previous measurement.
        """
        self._start(step)
        try:
            yield
        finally:
            self._end(step)

    def steps(self) -> dict[str, float]:
        """Return the elapsed seconds of every finished step, keyed by name."""
        return {step: self.elapsed(step) for step in self._end_times}

    def elapsed(self, step: str) -> float:
        """Return the elapsed seconds of ``step``.

        Raises:
            KeyError: if ``step`` has not been both started and finished.
        """
        return self._end_times[step] - self._start_times[step]

    def record_first_chunk_yielded(self) -> None:
        """Record the moment the first chunk was yielded and report it.

        Only the first call has any effect: later calls neither move the
        recorded timestamp to a later chunk nor add further observations to
        ``generative_first_chunk_histogram``.
        """
        if self.first_chunk_yielded_at is not None:
            return
        self.first_chunk_yielded_at = time.monotonic()
        generative_first_chunk_histogram.observe(self.get_first_chunk_time())

    def get_first_chunk_time(self) -> Optional[float]:
        """Return seconds from creation to the first chunk, or None if unset."""
        if self.first_chunk_yielded_at is None:
            return None
        return self.first_chunk_yielded_at - self.global_start

    def _start(self, step: str) -> None:
        """Store the start timestamp of ``step``."""
        self._start_times[step] = time.monotonic()

    def _end(self, step: str) -> None:
        """Store the end timestamp of ``step`` and report its duration."""
        self._end_times[step] = time.monotonic()
        elapsed = self.elapsed(step)
        rag_histogram.observe(elapsed, labels={"step": step})