nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -17,10 +17,81 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import contextlib
21
+ import time
22
+ from typing import Optional
23
+
20
24
  from nucliadb_telemetry import metrics
21
25
 
22
26
  merge_observer = metrics.Observer("merge_results", labels={"type": ""})
23
27
  node_features = metrics.Counter("nucliadb_node_features", labels={"type": ""})
24
- query_parse_dependency_observer = metrics.Observer(
25
- "query_parse_dependency", labels={"type": ""}
28
+ query_parse_dependency_observer = metrics.Observer("query_parse_dependency", labels={"type": ""})
29
+
30
+ buckets = [
31
+ 0.005,
32
+ 0.01,
33
+ 0.025,
34
+ 0.05,
35
+ 0.075,
36
+ 0.1,
37
+ 0.25,
38
+ 0.5,
39
+ 0.75,
40
+ 1.0,
41
+ 2.5,
42
+ 5.0,
43
+ 7.5,
44
+ 10.0,
45
+ 30.0,
46
+ 60.0,
47
+ metrics.INF,
48
+ ]
49
+
50
+ generative_first_chunk_histogram = metrics.Histogram(
51
+ name="generative_first_chunk",
52
+ buckets=buckets,
53
+ )
54
+ rag_histogram = metrics.Histogram(
55
+ name="rag",
56
+ labels={"step": ""},
57
+ buckets=buckets,
26
58
  )
59
+
60
+
61
+ class RAGMetrics:
62
+ def __init__(self):
63
+ self.global_start = time.monotonic()
64
+ self._start_times: dict[str, float] = {}
65
+ self._end_times: dict[str, float] = {}
66
+ self.first_chunk_yielded_at: Optional[float] = None
67
+
68
+ @contextlib.contextmanager
69
+ def time(self, step: str):
70
+ self._start(step)
71
+ try:
72
+ yield
73
+ finally:
74
+ self._end(step)
75
+
76
+ def steps(self) -> dict[str, float]:
77
+ return {step: self.elapsed(step) for step in self._end_times.keys()}
78
+
79
+ def elapsed(self, step: str) -> float:
80
+ return self._end_times[step] - self._start_times[step]
81
+
82
+ def record_first_chunk_yielded(self):
83
+ self.first_chunk_yielded_at = time.monotonic()
84
+ generative_first_chunk_histogram.observe(self.first_chunk_yielded_at - self.global_start)
85
+
86
+ def get_first_chunk_time(self) -> Optional[float]:
87
+ if self.first_chunk_yielded_at is None:
88
+ return None
89
+ return self.first_chunk_yielded_at - self.global_start
90
+
91
+ def _start(self, step: str):
92
+ self._start_times[step] = time.monotonic()
93
+
94
+ def _end(self, step: str):
95
+ self._end_times[step] = time.monotonic()
96
+ elapsed = self.elapsed(step)
97
+ rag_histogram.observe(elapsed, labels={"step": step})
@@ -17,20 +17,16 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
- import asyncio
21
20
  import logging
22
21
  import re
23
22
  import string
24
23
  from typing import Optional
25
24
 
26
- from nucliadb_protos.utils_pb2 import ExtractedText
27
-
25
+ from nucliadb.common.ids import FIELD_TYPE_STR_TO_PB, ParagraphId
28
26
  from nucliadb.ingest.fields.base import Field
29
- from nucliadb.ingest.orm.resource import KB_REVERSE
30
27
  from nucliadb.ingest.orm.resource import Resource as ResourceORM
31
- from nucliadb_telemetry import metrics
32
-
33
- from .cache import get_resource_from_cache
28
+ from nucliadb.search.search import cache
29
+ from nucliadb_telemetry import errors, metrics
34
30
 
35
31
  logger = logging.getLogger(__name__)
36
32
  PRE_WORD = string.punctuation + " "
@@ -56,62 +52,6 @@ GET_PARAGRAPH_LATENCY = metrics.Observer(
56
52
  )
57
53
 
58
54
 
59
- EXTRACTED_CACHE_OPS = metrics.Counter(
60
- "nucliadb_extracted_text_cache_ops", labels={"type": ""}
61
- )
62
-
63
-
64
- class ExtractedTextCache:
65
- """
66
- Used to cache extracted text from a resource in memory during
67
- the process of search results serialization.
68
- """
69
-
70
- def __init__(self):
71
- self.locks = {}
72
- self.values = {}
73
-
74
- def get_value(self, key: str) -> Optional[ExtractedText]:
75
- return self.values.get(key)
76
-
77
- def get_lock(self, key: str) -> asyncio.Lock:
78
- return self.locks.setdefault(key, asyncio.Lock())
79
-
80
- def set_value(self, key: str, value: ExtractedText) -> None:
81
- self.values[key] = value
82
-
83
- def clear(self):
84
- self.values.clear()
85
- self.locks.clear()
86
-
87
-
88
- async def get_field_extracted_text(
89
- field: Field, cache: Optional[ExtractedTextCache] = None
90
- ) -> Optional[ExtractedText]:
91
- if cache is None:
92
- return await field.get_extracted_text()
93
-
94
- key = f"{field.kbid}/{field.uuid}/{field.id}"
95
- extracted_text = cache.get_value(key)
96
- if extracted_text is not None:
97
- EXTRACTED_CACHE_OPS.inc({"type": "hit"})
98
- return extracted_text
99
-
100
- async with cache.get_lock(key):
101
- # Check again in case another task already fetched it
102
- extracted_text = cache.get_value(key)
103
- if extracted_text is not None:
104
- EXTRACTED_CACHE_OPS.inc({"type": "hit"})
105
- return extracted_text
106
-
107
- EXTRACTED_CACHE_OPS.inc({"type": "miss"})
108
- extracted_text = await field.get_extracted_text()
109
- if extracted_text is not None:
110
- # Only cache if we actually have extracted text
111
- cache.set_value(key, extracted_text)
112
- return extracted_text
113
-
114
-
115
55
  @GET_PARAGRAPH_LATENCY.wrap({"type": "full"})
116
56
  async def get_paragraph_from_full_text(
117
57
  *,
@@ -119,16 +59,23 @@ async def get_paragraph_from_full_text(
119
59
  start: int,
120
60
  end: int,
121
61
  split: Optional[str] = None,
122
- extracted_text_cache: Optional[ExtractedTextCache] = None,
62
+ log_on_missing_field: bool = True,
123
63
  ) -> str:
124
64
  """
125
65
  Pull paragraph from full text stored in database.
126
66
 
127
67
  This requires downloading the full text and then slicing it.
128
68
  """
129
- extracted_text = await get_field_extracted_text(field, cache=extracted_text_cache)
69
+ extracted_text = await cache.get_field_extracted_text(field)
130
70
  if extracted_text is None:
131
- logger.warning(f"{field} extracted_text does not exist on DB yet")
71
+ if log_on_missing_field:
72
+ logger.warning(
73
+ "Extracted_text for field does not exist on DB. This should not happen.",
74
+ extra={
75
+ "field_id": field.resource_unique_id,
76
+ "kbid": field.kbid,
77
+ },
78
+ )
132
79
  return ""
133
80
 
134
81
  if split not in (None, ""):
@@ -141,39 +88,46 @@ async def get_paragraph_from_full_text(
141
88
  async def get_paragraph_text(
142
89
  *,
143
90
  kbid: str,
144
- rid: str,
145
- field: str,
146
- start: int,
147
- end: int,
148
- split: Optional[str] = None,
91
+ paragraph_id: ParagraphId,
149
92
  highlight: bool = False,
150
93
  ematches: Optional[list[str]] = None,
151
94
  matches: Optional[list[str]] = None,
152
95
  orm_resource: Optional[
153
96
  ResourceORM
154
97
  ] = None, # allow passing in orm_resource to avoid extra DB calls or txn issues
155
- extracted_text_cache: Optional[ExtractedTextCache] = None,
98
+ log_on_missing_field: bool = True,
156
99
  ) -> str:
100
+ rid = paragraph_id.rid
101
+ field_type = paragraph_id.field_id.type
102
+ field_key = paragraph_id.field_id.key
103
+
157
104
  if orm_resource is None:
158
- orm_resource = await get_resource_from_cache(kbid, rid)
105
+ orm_resource = await cache.get_resource(kbid, rid)
159
106
  if orm_resource is None:
160
- logger.error(f"{kbid}/{rid}:{field} does not exist on DB")
107
+ if log_on_missing_field:
108
+ logger.warning(
109
+ "Resource does not exist on DB. This should not happen.",
110
+ extra={"resource_id": rid, "kbid": kbid, "field": f"{field_type}/{field_key}"},
111
+ )
161
112
  return ""
162
113
 
163
- _, field_type, field = field.split("/")
164
- field_type_int = KB_REVERSE[field_type]
165
- field_obj = await orm_resource.get_field(field, field_type_int, load=False)
114
+ field_type_int = FIELD_TYPE_STR_TO_PB[field_type]
115
+ field_obj = await orm_resource.get_field(field_key, field_type_int, load=False)
166
116
 
167
117
  text = await get_paragraph_from_full_text(
168
118
  field=field_obj,
169
- start=start,
170
- end=end,
171
- split=split,
172
- extracted_text_cache=extracted_text_cache,
119
+ start=paragraph_id.paragraph_start,
120
+ end=paragraph_id.paragraph_end,
121
+ split=paragraph_id.field_id.subfield_id,
122
+ log_on_missing_field=log_on_missing_field,
173
123
  )
174
124
 
175
125
  if highlight:
176
- text = highlight_paragraph(text, words=matches, ematches=ematches)
126
+ try:
127
+ text = highlight_paragraph(text, words=matches, ematches=ematches)
128
+ except Exception as ex:
129
+ errors.capture_exception(ex)
130
+ logger.exception("Error highlighting paragraph", extra={"kbid": kbid})
177
131
  return text
178
132
 
179
133
 
@@ -191,19 +145,17 @@ async def get_text_sentence(
191
145
  Leave separated from get paragraph for now until we understand the differences
192
146
  better.
193
147
  """
194
- orm_resource = await get_resource_from_cache(kbid, rid)
148
+ orm_resource = await cache.get_resource(kbid, rid)
195
149
 
196
150
  if orm_resource is None:
197
151
  logger.warning(f"{rid} does not exist on DB")
198
152
  return ""
199
153
 
200
- field_type_int = KB_REVERSE[field_type]
154
+ field_type_int = FIELD_TYPE_STR_TO_PB[field_type]
201
155
  field_obj = await orm_resource.get_field(field, field_type_int, load=False)
202
156
  extracted_text = await field_obj.get_extracted_text()
203
157
  if extracted_text is None:
204
- logger.info(
205
- f"{rid} {field} {field_type_int} extracted_text does not exist on DB"
206
- )
158
+ logger.info(f"{rid} {field} {field_type_int} extracted_text does not exist on DB")
207
159
  return ""
208
160
  start = start - 1
209
161
  if start < 0:
@@ -216,36 +168,42 @@ async def get_text_sentence(
216
168
  return splitted_text
217
169
 
218
170
 
219
- def get_regex(some_string: str) -> str:
220
- return r"\b" + some_string.lower() + r"\b"
221
-
222
-
223
171
  def highlight_paragraph(
224
172
  text: str, words: Optional[list[str]] = None, ematches: Optional[list[str]] = None
225
173
  ) -> str:
174
+ """
175
+ Highlight `text` with <mark></mark> tags around the words in `words` and `ematches`.
176
+
177
+ Parameters:
178
+ - text: The text to highlight.
179
+ - words: A list of words to highlight.
180
+ - ematches: A list of exact matches to highlight.
181
+
182
+ Returns:
183
+ - The highlighted text.
184
+ """
185
+ REGEX_TEMPLATE = r"(^|\s)({text})(\s|$)"
226
186
  text_lower = text.lower()
227
187
 
228
188
  marks = [0] * (len(text_lower) + 1)
229
- if ematches is not None:
230
- for quote in ematches:
231
- quote_regex = get_regex(quote.lower())
232
- try:
233
- for match in re.finditer(quote_regex, text_lower):
234
- start, end = match.span()
235
- marks[start] = 1
236
- marks[end] = 2
237
- except re.error:
238
- logger.warning(
239
- f"Regex errors while highlighting text. Regex: {quote_regex}"
240
- )
241
- continue
189
+ ematches = ematches or []
190
+ for quote in ematches:
191
+ quote_regex = REGEX_TEMPLATE.format(text=re.escape(quote.lower()))
192
+ try:
193
+ for match in re.finditer(quote_regex, text_lower):
194
+ start, end = match.span(2)
195
+ marks[start] = 1
196
+ marks[end] = 2
197
+ except re.error:
198
+ logger.warning(f"Regex errors while highlighting text. Regex: {quote_regex}")
199
+ continue
242
200
 
243
201
  words = words or []
244
202
  for word in words:
245
- word_regex = get_regex(word.lower())
203
+ word_regex = REGEX_TEMPLATE.format(text=re.escape(word.lower()))
246
204
  try:
247
205
  for match in re.finditer(word_regex, text_lower):
248
- start, end = match.span()
206
+ start, end = match.span(2)
249
207
  if marks[start] == 0 and marks[end] == 0:
250
208
  marks[start] = 1
251
209
  marks[end] = 2
@@ -0,0 +1,233 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ import logging
22
+ from collections import defaultdict
23
+ from typing import Any, cast
24
+
25
+ from psycopg.rows import dict_row
26
+
27
+ from nucliadb.common.maindb.pg import PGDriver
28
+ from nucliadb.common.maindb.utils import get_driver
29
+ from nucliadb.search.search.query_parser.models import CatalogQuery
30
+ from nucliadb_models.labels import translate_system_to_alias_label
31
+ from nucliadb_models.metadata import ResourceProcessingStatus
32
+ from nucliadb_models.search import (
33
+ ResourceResult,
34
+ Resources,
35
+ SortField,
36
+ SortOrder,
37
+ )
38
+ from nucliadb_telemetry import metrics
39
+
40
+ from .filters import translate_label
41
+
42
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Telemetry observer shared by the catalog search code below; the "op" label
# is filled in per measured step (e.g. "search", "facets", "totals", "query").
observer = metrics.Observer("pg_catalog_search", labels={"op": ""})
44
+
45
+
46
def _filter_operands(operands):
    """Partition filter operands into literal values and nested expressions.

    Each operand is expected to be a mapping whose first entry is
    ``operator -> params``. For ``"literal"`` operands only the params are
    kept; every other operand is passed through unchanged.

    Returns:
        A ``(literals, nonliterals)`` tuple of lists, each preserving the
        order in which the operands were given.
    """
    literal_values = []
    nested = []
    for entry in operands:
        operator, payload = next(iter(entry.items()))
        if operator != "literal":
            nested.append(entry)
            continue
        literal_values.append(payload)

    return literal_values, nested
57
+
58
+
59
def _convert_filter(filter, filter_params):
    """Translate a label filter expression into a SQL boolean fragment.

    ``filter`` is a mapping whose first entry is ``operator -> operands``.
    Supported operators are ``"literal"``, ``"and"``, ``"or"`` and ``"not"``.
    Label values are never inlined in the SQL: each one is stored in
    ``filter_params`` (mutated in place) under a fresh ``paramN`` key, where
    N is the current size of ``filter_params``, and referenced through a
    ``%(paramN)s`` placeholder.

    Returns:
        The SQL fragment as a string.

    Raises:
        ValueError: if the operator is not one of the supported ones.
    """
    operator, operands = next(iter(filter.items()))

    if operator == "literal":
        placeholder = f"param{len(filter_params)}"
        filter_params[placeholder] = [operands]
        return f"labels @> %({placeholder})s"

    if operator == "not":
        return f"(NOT {_convert_filter(operands, filter_params)})"

    if operator not in ("and", "or"):
        raise ValueError(f"Invalid operator {operator}")

    clauses = []
    literal_values, nested = _filter_operands(operands)
    if literal_values:
        # All literal operands collapse into one array comparison:
        # "contains all" (@>) for AND, "overlaps" (&&) for OR.
        placeholder = f"param{len(filter_params)}"
        filter_params[placeholder] = literal_values
        array_op = "@>" if operator == "and" else "&&"
        clauses.append(f"labels {array_op} %({placeholder})s")
    for sub_filter in nested:
        clauses.append(_convert_filter(sub_filter, filter_params))

    joiner = f" {operator.upper()} "
    return "(" + joiner.join(clauses) + ")"
80
+
81
+
82
def _prepare_query(catalog_query: CatalogQuery):
    """Build the parameterized catalog SELECT for ``catalog_query``.

    The WHERE clause always restricts by ``kbid`` and optionally adds a title
    word match, creation/modification date bounds, a label filter expression
    and a processing-status filter. An ORDER BY clause is appended only when
    the query carries a sort.

    Returns:
        A ``(sql, params)`` tuple, where ``params`` maps every named
        placeholder used in ``sql`` to its value.
    """
    filters = catalog_query.filters
    params: dict[str, Any] = {"kbid": catalog_query.kbid}
    conditions = ["kbid = %(kbid)s"]

    if catalog_query.query:
        # This is doing tokenization inside the SQL server (to keep the index updated). We could
        # move it to the python code at update/query time if it ever becomes a problem but for
        # now, a single regex executed per query is not a problem.
        conditions.append(
            "regexp_split_to_array(lower(title), '\\W') @> regexp_split_to_array(lower(%(query)s), '\\W')"
        )
        params["query"] = catalog_query.query

    # (bound value, SQL condition, parameter name) for each optional date bound
    date_bounds = (
        (filters.creation.after, "created_at > %(created_at_start)s", "created_at_start"),
        (filters.creation.before, "created_at < %(created_at_end)s", "created_at_end"),
        (filters.modification.after, "modified_at > %(modified_at_start)s", "modified_at_start"),
        (filters.modification.before, "modified_at < %(modified_at_end)s", "modified_at_end"),
    )
    for bound, condition, param_name in date_bounds:
        if bound:
            conditions.append(condition)
            params[param_name] = bound

    if filters.labels:
        conditions.append(_convert_filter(filters.labels, params))

    if filters.with_status:
        conditions.append("labels && %(status)s")
        # PROCESSED matches both processed and errored resources; any other
        # requested status is mapped to pending.
        params["status"] = (
            ["/n/s/PROCESSED", "/n/s/ERROR"]
            if filters.with_status == ResourceProcessingStatus.PROCESSED
            else ["/n/s/PENDING"]
        )

    order_clause = ""
    sort = catalog_query.sort
    if sort:
        if sort.field == SortField.MODIFIED:
            column = "modified_at"
        elif sort.field == SortField.TITLE:
            column = "title"
        else:
            # Sort by creation date; also the fallback for the deprecated
            # order by score.
            column = "created_at"
        direction = "ASC" if sort.order == SortOrder.ASC else "DESC"
        order_clause = f" ORDER BY {column} {direction}"

    where_clause = " AND ".join(conditions)
    return f"SELECT * FROM catalog WHERE {where_clause}{order_clause}", params
144
+
145
+
146
def _pg_driver() -> PGDriver:
    """Return the configured main DB driver, typed as a ``PGDriver``.

    ``cast`` only informs the type checker; no runtime check is made here
    that the driver really is a PG driver.
    """
    driver = get_driver()
    return cast(PGDriver, driver)
148
+
149
+
150
@observer.wrap({"op": "search"})
async def pgcatalog_search(catalog_query: CatalogQuery) -> Resources:
    """Run a catalog search against the PG ``catalog`` table.

    Up to three statements are executed on a single connection/cursor:

    1. (only if ``catalog_query.faceted`` is set) a label count over the
       filtered rows, restricted to labels under the requested facets;
    2. a total count of the filtered rows;
    3. the requested page of filtered rows (LIMIT/OFFSET).

    Returns:
        A ``Resources`` object with the page results, the facet counts, the
        total number of matches and pagination information.
    """
    # Prepare SQL query
    query, query_params = _prepare_query(catalog_query)

    async with _pg_driver()._get_connection() as conn, conn.cursor(row_factory=dict_row) as cur:
        facets = {}

        # Faceted search
        if catalog_query.faceted:
            with observer({"op": "facets"}):
                tmp_facets: dict[str, dict[str, int]] = {
                    translate_label(f): defaultdict(int) for f in catalog_query.faceted
                }

                # Facet prefixes come from the request, so they are bound as
                # query parameters instead of being interpolated into the SQL
                # text. LIKE wildcards inside a facet still act as wildcards,
                # but rows matched that way are discarded by the parent /
                # grandparent membership checks below.
                facet_params: dict[str, Any] = {}
                facet_conditions = []
                for index, facet in enumerate(tmp_facets):
                    if not facet.startswith(("/n/s", "/n/i", "/l")):
                        logger.warning(
                            f"Unexpected facet used at catalog: {facet}, kbid={catalog_query.kbid}"
                        )
                    param_name = f"facet_prefix{index}"
                    facet_params[param_name] = f"{facet}/%"
                    facet_conditions.append(f"label LIKE %({param_name})s")
                facet_filters = " OR ".join(facet_conditions)

                await cur.execute(
                    f"SELECT label, COUNT(*) FROM (SELECT unnest(labels) AS label FROM ({query}) fc) nl WHERE ({facet_filters}) GROUP BY 1 ORDER BY 1",
                    {**query_params, **facet_params},
                )

                for row in await cur.fetchall():
                    label = row["label"]
                    label_parts = label.split("/")
                    parent = "/".join(label_parts[:-1])
                    count = row["count"]
                    if parent in tmp_facets:
                        tmp_facets[parent][translate_system_to_alias_label(label)] = count

                    # No need to get recursive because our facets are at most 3 levels deep (e.g: /l/set/label)
                    if len(label_parts) >= 3:
                        grandparent = "/".join(label_parts[:-2])
                        if grandparent in tmp_facets:
                            # Aggregate the counts of all children under their parent label
                            tmp_facets[grandparent][translate_system_to_alias_label(parent)] += count

                facets = {translate_system_to_alias_label(k): v for k, v in tmp_facets.items()}

        # Totals
        with observer({"op": "totals"}):
            await cur.execute(
                f"SELECT COUNT(*) FROM ({query}) fc",
                query_params,
            )
            total = (await cur.fetchone())["count"]  # type: ignore

        # Query
        with observer({"op": "query"}):
            offset = catalog_query.page_size * catalog_query.page_number
            await cur.execute(
                f"{query} LIMIT %(page_size)s OFFSET %(offset)s",
                {
                    **query_params,
                    "page_size": catalog_query.page_size,
                    "offset": offset,
                },
            )
            data = await cur.fetchall()

    return Resources(
        facets=facets,
        results=[
            ResourceResult(
                # Resource ids are returned without dashes
                rid=str(r["rid"]).replace("-", ""),
                field="title",
                field_type="a",
                # Only labels under /l/ are exposed in the results
                labels=[label for label in r["labels"] if label.startswith("/l/")],
                score=0,
            )
            for r in data
        ],
        query=catalog_query.query,
        total=total,
        page_number=catalog_query.page_number,
        page_size=catalog_query.page_size,
        # There is a next page if rows remain after the ones fetched here
        next_page=(offset + len(data) < total),
        min_score=0,
    )
@@ -84,5 +84,5 @@ async def predict_proxy(
84
84
 
85
85
 
86
86
  async def exists_kb(kbid: str) -> bool:
87
- async with datamanagers.with_transaction(read_only=True) as txn:
87
+ async with datamanagers.with_ro_transaction() as txn:
88
88
  return await datamanagers.kb.exists_kb(txn, kbid=kbid)