nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -0,0 +1,270 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ import logging
22
+ from abc import ABC, abstractmethod, abstractproperty
23
+ from dataclasses import dataclass
24
+ from typing import Optional
25
+
26
+ from nucliadb.search.predict import ProxiedPredictAPIError, SendToPredictError
27
+ from nucliadb.search.search.query_parser import models as parser_models
28
+ from nucliadb.search.utilities import get_predict
29
+ from nucliadb_models.internal.predict import RerankModel
30
+ from nucliadb_models.search import (
31
+ SCORE_TYPE,
32
+ KnowledgeboxFindResults,
33
+ )
34
+ from nucliadb_telemetry.metrics import Observer
35
+
36
+ logger = logging.getLogger(__name__)
37
+
38
+ reranker_observer = Observer("reranker", labels={"type": ""})
39
+
40
+
41
@dataclass
class RerankableItem:
    """Candidate handed to a reranker.

    Bundles the identifier and current score of an item with the text a
    reranker may evaluate.
    """

    # Identifier of the item; rerankers copy it into the resulting RankedItem
    id: str
    # Score the item had before reranking
    score: float
    # Kind of score stored in `score`
    score_type: SCORE_TYPE
    # Text of the item (sent as context by the predict-based reranker)
    content: str
47
+
48
+
49
@dataclass
class RankedItem:
    """Result of reranking one item: its identifier and the score to report."""

    # Identifier of the reranked item
    id: str
    # Score after reranking (may be the untouched original score)
    score: float
    # Kind of score stored in `score`
    score_type: SCORE_TYPE
54
+
55
+
56
@dataclass
class RerankingOptions:
    """Request-level parameters passed to every reranker call."""

    # Identifier forwarded to the Predict API by the predict-based reranker
    kbid: str

    # Query that retrieved the results being reranked. Rerankers that score
    # items against the query text make use of it
    query: str
62
+
63
+
64
class Reranker(ABC):
    """Base class for reranking algorithms.

    Concrete rerankers implement `window` and `_rerank`. Callers go through
    `rerank`, which trims the input to the window before delegating.
    """

    # `abstractproperty` is deprecated since Python 3.3; stacking `property`
    # on top of `abstractmethod` is the supported equivalent.
    @property
    @abstractmethod
    def window(self) -> Optional[int]:
        """Number of elements the reranker requests. `None` means no specific
        window is enforced."""
        ...

    @property
    def needs_extra_results(self) -> bool:
        """Whether the reranker declares a window (`window` is not `None`)."""
        return self.window is not None

    async def rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
        """Rerank `items` and return whatever the concrete algorithm produces.

        Only the first `window` items are handed to the algorithm; the rest
        are dropped. When `window` is `None` every item is passed through.

        """
        # Enforce reranker window and drop the rest. Slicing with `None` as
        # the upper bound keeps the whole list.
        # XXX: other search engines allow a mix of reranked and not reranked
        # results, there's no technical reason we can't do it
        items = items[: self.window]
        return await self._rerank(items, options)

    @abstractmethod
    async def _rerank(
        self, items: list[RerankableItem], options: RerankingOptions
    ) -> list[RankedItem]:
        """Algorithm-specific reranking of the already windowed `items`."""
        ...
92
+
93
+
94
class NoopReranker(Reranker):
    """Reranker that changes nothing.

    Every input item is converted to a `RankedItem` carrying the same id,
    score and score type, in the same order. Use it to keep the previous
    ordering untouched.

    """

    @property
    def window(self) -> Optional[int]:
        # No window: every item is passed through
        return None

    @reranker_observer.wrap({"type": "noop"})
    async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
        unchanged: list[RankedItem] = []
        for candidate in items:
            unchanged.append(
                RankedItem(
                    id=candidate.id,
                    score=candidate.score,
                    score_type=candidate.score_type,
                )
            )
        return unchanged
115
+
116
+
117
class PredictReranker(Reranker):
    """Rerank using a reranking model.

    It uses Predict API to rerank elements using a model trained for this
    purpose. If the Predict call fails, the items keep their original scores
    and score types.

    """

    def __init__(self, window: int):
        # Maximum number of items sent to the model (see `Reranker.rerank`)
        self._window = window

    @property
    def window(self) -> int:
        return self._window

    @reranker_observer.wrap({"type": "predict"})
    async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
        """Score `items` against `options.query` through the Predict API.

        Returns the ranked items sorted by decreasing score. An empty input
        returns an empty list without calling Predict.
        """
        if not items:
            return []

        predict = get_predict()

        # Conversion to format expected by predict. At the same time,
        # deduplicates paragraphs found in different indices
        context = {item.id: item.content for item in items}
        request = RerankModel(
            question=options.query,
            user_id="",  # TODO
            context=context,
        )
        try:
            response = await predict.rerank(options.kbid, request)
        except (SendToPredictError, ProxiedPredictAPIError) as exc:
            # predict failed, we can't rerank. Leave a trace so the silent
            # fallback to the original scores can be diagnosed.
            logger.warning(
                "Predict reranking failed for kb %s, keeping original scores: %s",
                options.kbid,
                exc,
            )
            reranked = [
                RankedItem(
                    id=item.id,
                    score=item.score,
                    score_type=item.score_type,
                )
                for item in items
            ]
        else:
            reranked = [
                RankedItem(
                    id=item_id,
                    score=score,
                    score_type=SCORE_TYPE.RERANKER,
                )
                for item_id, score in response.context_scores.items()
            ]
        sort_by_score(reranked)
        return reranked
170
+
171
+
172
class MultiMatchBoosterReranker(Reranker):
    """Reranker that favours items appearing more than once in the input
    (i.e. coming from different indices)"""

    @property
    def window(self) -> Optional[int]:
        # No window: every item is considered
        return None

    @reranker_observer.wrap({"type": "multi_match_booster"})
    async def _rerank(self, items: list[RerankableItem], options: RerankingOptions) -> list[RankedItem]:
        """Collapse items sharing an id into one entry and boost it.

        The result is sorted by decreasing score and can be shorter than the
        input, because repeated ids are merged into a single ranked item.
        """
        merged: dict[str, RankedItem] = {}
        for candidate in items:
            seen = merged.get(candidate.id)
            if seen is None:
                # First time this id shows up: keep score and type as they are
                merged[candidate.id] = RankedItem(
                    id=candidate.id,
                    score=candidate.score,
                    score_type=candidate.score_type,
                )
                continue

            # Multiple match. The score is only boosted when the later
            # occurrence scores higher than the stored one. The previous
            # implementation noted this as "vector score x2", although the
            # same paragraph may show up at any position of the rank fused
            # result list, so that may not hold.
            if seen.score < candidate.score:
                seen.score = candidate.score * 2
            seen.score_type = SCORE_TYPE.BOTH

        boosted = list(merged.values())
        sort_by_score(boosted)
        return boosted
207
+
208
+
209
def get_reranker(reranker: parser_models.Reranker) -> Reranker:
    """Build the reranking algorithm matching a parsed reranker model.

    Unknown models are reported with a warning and fall back to the
    multi-match booster.
    """
    if isinstance(reranker, parser_models.NoopReranker):
        return NoopReranker()

    if isinstance(reranker, parser_models.MultiMatchBoosterReranker):
        return MultiMatchBoosterReranker()

    if isinstance(reranker, parser_models.PredictReranker):
        return PredictReranker(reranker.window)

    logger.warning(f"Unknown reranker requested: {reranker}. Using default instead")
    return MultiMatchBoosterReranker()
226
+
227
+
228
def sort_by_score(items: list[RankedItem]):
    """Reorder `items` in place so that the highest score comes first"""

    def score_of(ranked: RankedItem) -> float:
        return ranked.score

    items.sort(key=score_of, reverse=True)
231
+
232
+
233
def apply_reranking(results: KnowledgeboxFindResults, reranked: list[RankedItem]):
    """Rewrite the find results payload in place from a list of reranked items.

    *ATTENTION* `reranked` is assumed to be ordered by decreasing relevance
    and to contain *only* the items relevant for this response. Every
    paragraph missing from `reranked` is removed from `results`, together
    with any field or resource left empty by the removal. Every id in
    `reranked` must exist in `results`, otherwise a `KeyError` is raised.

    """
    # Index each paragraph by id along with its owning field and resource
    located = {
        paragraph_id: (paragraph, (field_id, field), (rid, resource))
        for rid, resource in results.resources.items()
        for field_id, field in resource.fields.items()
        for paragraph_id, paragraph in field.paragraphs.items()
    }

    # Rebuild best matches and refresh score, score type and order
    results.best_matches.clear()
    for position, ranked in enumerate(reranked):
        paragraph, _, _ = located[ranked.id]
        paragraph.score = ranked.score
        paragraph.score_type = ranked.score_type
        paragraph.order = position
        results.best_matches.append(ranked.id)

    # Drop paragraphs that are not part of `reranked`, cleaning up emptied
    # fields and resources on the way
    kept = set(results.best_matches)
    for paragraph_id, (_, (field_id, field), (rid, resource)) in located.items():
        if paragraph_id in kept:
            continue
        field.paragraphs.pop(paragraph_id)
        if not field.paragraphs:
            resource.fields.pop(field_id)

        if not resource.fields:
            results.resources.pop(rid)
@@ -18,22 +18,16 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import asyncio
21
- from typing import Optional
22
21
 
22
+ from nucliadb.common.cluster.base import AbstractIndexNode
23
23
  from nucliadb_protos.nodereader_pb2 import (
24
24
  GetShardRequest,
25
- ParagraphSearchRequest,
26
- ParagraphSearchResponse,
27
- RelationSearchRequest,
28
- RelationSearchResponse,
29
25
  SearchRequest,
30
26
  SearchResponse,
31
27
  SuggestRequest,
32
28
  SuggestResponse,
33
29
  )
34
30
  from nucliadb_protos.noderesources_pb2 import Shard
35
-
36
- from nucliadb.common.cluster.base import AbstractIndexNode
37
31
  from nucliadb_telemetry import metrics
38
32
 
39
33
  node_observer = metrics.Observer(
@@ -45,9 +39,7 @@ node_observer = metrics.Observer(
45
39
  )
46
40
 
47
41
 
48
- async def query_shard(
49
- node: AbstractIndexNode, shard: str, query: SearchRequest
50
- ) -> SearchResponse:
42
+ async def query_shard(node: AbstractIndexNode, shard: str, query: SearchRequest) -> SearchResponse:
51
43
  req = SearchRequest()
52
44
  req.CopyFrom(query)
53
45
  req.shard = shard
@@ -55,42 +47,16 @@ async def query_shard(
55
47
  return await node.reader.Search(req) # type: ignore
56
48
 
57
49
 
58
- async def get_shard(
59
- node: AbstractIndexNode, shard_id: str, vectorset: Optional[str] = None
60
- ) -> Shard:
50
+ async def get_shard(node: AbstractIndexNode, shard_id: str) -> Shard:
61
51
  req = GetShardRequest()
62
52
  req.shard_id.id = shard_id
63
- if vectorset is not None:
64
- req.vectorset = vectorset
65
53
  with node_observer({"type": "get_shard", "node_id": node.id}):
66
54
  return await node.reader.GetShard(req) # type: ignore
67
55
 
68
56
 
69
- async def query_paragraph_shard(
70
- node: AbstractIndexNode, shard: str, query: ParagraphSearchRequest
71
- ) -> ParagraphSearchResponse:
72
- req = ParagraphSearchRequest()
73
- req.CopyFrom(query)
74
- req.id = shard
75
- with node_observer({"type": "paragraph_search", "node_id": node.id}):
76
- return await node.reader.ParagraphSearch(req) # type: ignore
77
-
78
-
79
- async def suggest_shard(
80
- node: AbstractIndexNode, shard: str, query: SuggestRequest
81
- ) -> SuggestResponse:
57
+ async def suggest_shard(node: AbstractIndexNode, shard: str, query: SuggestRequest) -> SuggestResponse:
82
58
  req = SuggestRequest()
83
59
  req.CopyFrom(query)
84
60
  req.shard = shard
85
61
  with node_observer({"type": "suggest", "node_id": node.id}):
86
62
  return await node.reader.Suggest(req) # type: ignore
87
-
88
-
89
- async def relations_shard(
90
- node: AbstractIndexNode, shard: str, query: RelationSearchRequest
91
- ) -> RelationSearchResponse:
92
- req = RelationSearchRequest()
93
- req.CopyFrom(query)
94
- req.shard_id = shard
95
- with node_observer({"type": "relation_search", "node_id": node.id}):
96
- return await node.reader.RelationSearch(req) # type: ignore
@@ -20,8 +20,6 @@
20
20
  import asyncio
21
21
  from typing import Optional
22
22
 
23
- from nucliadb_protos.utils_pb2 import ExtractedText
24
-
25
23
  from nucliadb.common import datamanagers
26
24
  from nucliadb.common.maindb.utils import get_driver
27
25
  from nucliadb.ingest.fields.base import Field
@@ -35,6 +33,7 @@ from nucliadb_models.search import (
35
33
  SummarizeRequest,
36
34
  SummarizeResourceModel,
37
35
  )
36
+ from nucliadb_protos.utils_pb2 import ExtractedText
38
37
  from nucliadb_utils.utilities import get_storage
39
38
 
40
39
  ExtractedTexts = list[tuple[str, str, Optional[ExtractedText]]]
@@ -42,30 +41,31 @@ ExtractedTexts = list[tuple[str, str, Optional[ExtractedText]]]
42
41
  MAX_GET_EXTRACTED_TEXT_OPS = 20
43
42
 
44
43
 
44
class NoResourcesToSummarize(Exception):
    """Raised by `summarize` when the request built for predict has no resources.

    That is, none of the requested resources contributed any extracted text.
    """
46
+
47
+
45
48
  async def summarize(kbid: str, request: SummarizeRequest) -> SummarizedResponse:
46
49
  predict_request = SummarizeModel()
47
50
  predict_request.generative_model = request.generative_model
48
51
  predict_request.user_prompt = request.user_prompt
49
52
  predict_request.summary_kind = request.summary_kind
50
53
 
51
- for uuid_or_slug, field_id, extracted_text in await get_extracted_texts(
52
- kbid, request.resources
53
- ):
54
+ for uuid_or_slug, field_id, extracted_text in await get_extracted_texts(kbid, request.resources):
54
55
  if extracted_text is None:
55
56
  continue
56
57
 
57
- fields = predict_request.resources.setdefault(
58
- uuid_or_slug, SummarizeResourceModel()
59
- ).fields
58
+ fields = predict_request.resources.setdefault(uuid_or_slug, SummarizeResourceModel()).fields
60
59
  fields[field_id] = extracted_text.text
61
60
 
61
+ if len(predict_request.resources) == 0:
62
+ raise NoResourcesToSummarize()
63
+
62
64
  predict = get_predict()
63
65
  return await predict.summarize(kbid, predict_request)
64
66
 
65
67
 
66
- async def get_extracted_texts(
67
- kbid: str, resource_uuids_or_slugs: list[str]
68
- ) -> ExtractedTexts:
68
+ async def get_extracted_texts(kbid: str, resource_uuids_or_slugs: list[str]) -> ExtractedTexts:
69
69
  results: ExtractedTexts = []
70
70
 
71
71
  driver = get_driver()
@@ -75,7 +75,7 @@ async def get_extracted_texts(
75
75
  tasks = []
76
76
 
77
77
  # Schedule getting extracted text for each field of each resource
78
- async with driver.transaction() as txn:
78
+ async with driver.transaction(read_only=True) as txn:
79
79
  if not await datamanagers.kb.exists_kb(txn, kbid=kbid):
80
80
  raise datamanagers.exceptions.KnowledgeBoxNotFound(kbid)
81
81
 
@@ -83,16 +83,12 @@ async def get_extracted_texts(
83
83
  for uuid_or_slug in set(resource_uuids_or_slugs):
84
84
  uuid = await get_resource_uuid(kb_orm, uuid_or_slug)
85
85
  if uuid is None:
86
- logger.warning(
87
- f"Resource {uuid_or_slug} not found in KB", extra={"kbid": kbid}
88
- )
86
+ logger.warning(f"Resource {uuid_or_slug} not found in KB", extra={"kbid": kbid})
89
87
  continue
90
88
  resource_orm = Resource(txn=txn, storage=storage, kb=kb_orm, uuid=uuid)
91
89
  fields = await resource_orm.get_fields(force=True)
92
90
  for _, field in fields.items():
93
- task = asyncio.create_task(
94
- get_extracted_text(uuid_or_slug, field, max_tasks)
95
- )
91
+ task = asyncio.create_task(get_extracted_text(uuid_or_slug, field, max_tasks))
96
92
  tasks.append(task)
97
93
 
98
94
  if len(tasks) == 0:
@@ -17,9 +17,26 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import logging
20
21
  from typing import Optional, Union
21
22
 
23
+ from pydantic import BaseModel
24
+
25
+ from nucliadb.common.datamanagers.atomic import kb
22
26
  from nucliadb_models.search import BaseSearchRequest, MinScore
27
+ from nucliadb_utils import const
28
+ from nucliadb_utils.utilities import has_feature
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
async def filter_hidden_resources(kbid: str, show_hidden: bool) -> Optional[bool]:
    """Compute the hidden-resources filter value for a request on a KB.

    Reads the KB configuration through `kb.get_config` and combines its
    `hidden_resources_enabled` flag with the caller's `show_hidden` choice.

    Args:
        kbid: id of the knowledge box whose configuration is read.
        show_hidden: whether the caller asked to see hidden resources.

    Returns:
        `False` when the KB has hidden resources enabled and `show_hidden` is
        false; `None` otherwise, meaning no filtering (show all resources).
        NOTE(review): `False` is presumably consumed as a "hidden == False"
        filter by the callers; confirm against the call sites.
    """
    kb_config = await kb.get_config(kbid=kbid)
    # A missing config is falsy, so it is treated as "hidden resources disabled"
    hidden_enabled = kb_config and kb_config.hidden_resources_enabled
    if hidden_enabled and not show_hidden:
        return False
    return None  # None = No filtering, show all resources
23
40
 
24
41
 
25
42
  def is_empty_query(request: BaseSearchRequest) -> bool:
@@ -36,7 +53,7 @@ def is_exact_match_only_query(request: BaseSearchRequest) -> bool:
36
53
  'foo "something" else' -> False
37
54
  """
38
55
  query = request.query.strip()
39
- return len(query) > 0 and query[0] == '"' and query[-1] == '"'
56
+ return len(query) > 0 and query.startswith('"') and query.endswith('"')
40
57
 
41
58
 
42
59
  def should_disable_vector_search(request: BaseSearchRequest) -> bool:
@@ -58,9 +75,7 @@ def min_score_from_query_params(
58
75
  deprecated_min_score: Optional[float],
59
76
  ) -> MinScore:
60
77
  # Keep backward compatibility with the deprecated min_score parameter
61
- semantic = (
62
- deprecated_min_score if min_score_semantic is None else min_score_semantic
63
- )
78
+ semantic = deprecated_min_score if min_score_semantic is None else min_score_semantic
64
79
  return MinScore(bm25=min_score_bm25, semantic=semantic)
65
80
 
66
81
 
@@ -72,3 +87,11 @@ def min_score_from_payload(min_score: Optional[Union[float, MinScore]]) -> MinSc
72
87
  elif isinstance(min_score, float):
73
88
  return MinScore(bm25=0, semantic=min_score)
74
89
  return min_score
90
+
91
+
92
def maybe_log_request_payload(kbid: str, endpoint: str, item: BaseModel):
    """Log a request payload if the LOG_REQUEST_PAYLOADS feature is on for the KB.

    The feature flag is evaluated with the `kbid` as context and defaults to
    disabled. When enabled, a single INFO record with the message
    "Request payload" is emitted; the KB id, the endpoint and the payload
    (serialized with `model_dump_json()`) travel in the record's `extra`.

    Args:
        kbid: id of the knowledge box the request targets.
        endpoint: endpoint label to attach to the log record.
        item: pydantic model holding the request payload.
    """
    if not has_feature(const.Features.LOG_REQUEST_PAYLOADS, context={"kbid": kbid}, default=False):
        return
    logger.info(
        "Request payload",
        extra={"kbid": kbid, "endpoint": endpoint, "payload": item.model_dump_json()},
    )
@@ -18,6 +18,8 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
+ from typing import Optional
22
+
21
23
  from pydantic import Field
22
24
 
23
25
  from nucliadb.ingest.settings import DriverSettings
@@ -28,8 +30,20 @@ class Settings(DriverSettings):
28
30
  slow_find_log_threshold: float = Field(
29
31
  default=3.0,
30
32
  title="Slow query log threshold",
31
- description="The threshold in seconds for logging slow queries",
33
+ description="The threshold in seconds for logging slow find queries",
34
+ )
35
+
36
+ slow_node_query_log_threshold: float = Field(
37
+ default=2.0,
38
+ title="Slow node query log threshold",
39
+ description="The threshold in seconds for logging slow node queries",
40
+ )
41
+ prequeries_max_parallel: int = Field(
42
+ default=2,
43
+ title="Prequeries max parallel",
44
+ description="The maximum number of prequeries to run in parallel per /ask request",
32
45
  )
46
+ nidx_address: Optional[str] = Field(default=None)
33
47
 
34
48
 
35
49
  settings = Settings()
@@ -98,9 +98,7 @@ def get_temp_access_token(request: Request):
98
98
  logger.warning(
99
99
  "Dynamically generating JWK key. Please set JWK_KEY env variable to avoid this message."
100
100
  )
101
- settings.jwk_key = orjson.dumps(
102
- jwk.JWK.generate(kty="oct", size=256, kid="dyn")
103
- ).decode("utf-8")
101
+ settings.jwk_key = orjson.dumps(jwk.JWK.generate(kty="oct", size=256, kid="dyn")).decode("utf-8")
104
102
  jwetoken.add_recipient(jwk.JWK(**orjson.loads(settings.jwk_key)))
105
103
  token = jwetoken.serialize(compact=True)
106
104
  return JSONResponse({"token": token})
@@ -154,16 +152,14 @@ def introspect_endpoint(request: Request) -> StreamingResponse:
154
152
  return StreamingResponse(
155
153
  content=introspect.stream_tar(request.app),
156
154
  status_code=200,
157
- headers={
158
- "Content-Disposition": f"attachment; filename=introspect_{introspect_id}.tar.gz"
159
- },
155
+ headers={"Content-Disposition": f"attachment; filename=introspect_{introspect_id}.tar.gz"},
160
156
  media_type="application/octet-stream",
161
157
  )
162
158
 
163
159
 
164
160
  @standalone_api_router.get("/pull/position")
165
161
  async def pull_status(request: Request) -> JSONResponse:
166
- async with datamanagers.with_transaction() as txn:
162
+ async with datamanagers.with_ro_transaction() as txn:
167
163
  # standalone assumes 1 partition
168
164
  current_offset = await datamanagers.processing.get_pull_offset(
169
165
  txn, pull_type_id=processing.get_nua_api_id(), partition="1"
@@ -180,9 +176,7 @@ class UpdatePullPosition(pydantic.BaseModel):
180
176
 
181
177
 
182
178
  @standalone_api_router.patch("/pull/position")
183
- async def update_pull_position(
184
- request: Request, item: UpdatePullPosition
185
- ) -> JSONResponse:
179
+ async def update_pull_position(request: Request, item: UpdatePullPosition) -> JSONResponse:
186
180
  async with datamanagers.with_transaction() as txn:
187
181
  # standalone assumes 1 partition
188
182
  await datamanagers.processing.set_pull_offset(
@@ -20,29 +20,34 @@
20
20
  import logging
21
21
  import os
22
22
 
23
- import nucliadb_admin_assets # type: ignore
24
23
  from fastapi import FastAPI
25
24
  from fastapi.responses import RedirectResponse
26
25
  from fastapi.staticfiles import StaticFiles
27
26
  from starlette.middleware import Middleware
28
27
  from starlette.middleware.authentication import AuthenticationMiddleware
29
28
  from starlette.middleware.cors import CORSMiddleware
29
+ from starlette.requests import ClientDisconnect
30
30
  from starlette.responses import HTMLResponse
31
31
  from starlette.routing import Mount
32
32
 
33
- from nucliadb.common.context.fastapi import set_app_context
33
+ import nucliadb_admin_assets # type: ignore
34
34
  from nucliadb.middleware import ProcessTimeHeaderMiddleware
35
- from nucliadb.middleware.transaction import ReadOnlyTransactionMiddleware
36
35
  from nucliadb.reader import API_PREFIX
37
36
  from nucliadb.reader.api.v1.router import api as api_reader_v1
38
37
  from nucliadb.search.api.v1.router import api as api_search_v1
39
- from nucliadb.standalone.lifecycle import finalize, initialize
38
+ from nucliadb.standalone.lifecycle import lifespan
40
39
  from nucliadb.train.api.v1.router import api as api_train_v1
41
40
  from nucliadb.writer.api.v1.router import api as api_writer_v1
42
41
  from nucliadb_telemetry.fastapi import metrics_endpoint
42
+ from nucliadb_telemetry.fastapi.utils import (
43
+ client_disconnect_handler,
44
+ global_exception_handler,
45
+ )
46
+ from nucliadb_utils.audit.stream import AuditMiddleware
43
47
  from nucliadb_utils.fastapi.openapi import extend_openapi
44
48
  from nucliadb_utils.fastapi.versioning import VersionedFastAPI
45
49
  from nucliadb_utils.settings import http_settings, running_settings
50
+ from nucliadb_utils.utilities import get_audit
46
51
 
47
52
  from .api_router import standalone_api_router
48
53
  from .auth import get_auth_backend
@@ -66,7 +71,7 @@ HOMEPAGE_HTML = """
66
71
  <h2>Quick Links</h2>
67
72
  <ul>
68
73
  <li><a href="/admin">Admin UI</a></li>
69
- <li><a href="https://docs.nuclia.dev/docs/guides/nucliadb/deploy/basics">NucliaDB Deployment Documentation</a></li>
74
+ <li><a href="https://docs.nuclia.dev/docs/management/nucliadb/deploy/basics">NucliaDB Deployment Documentation</a></li>
70
75
  <li><a href="https://docs.nuclia.dev/docs/api">API Reference</a></li>
71
76
  <li><a href="/api/v1/docs">API Explorer</a></li>
72
77
  <li><a href="/metrics">Metrics</a></li>
@@ -89,7 +94,7 @@ def application_factory(settings: Settings) -> FastAPI:
89
94
  AuthenticationMiddleware,
90
95
  backend=get_auth_backend(settings),
91
96
  ),
92
- Middleware(ReadOnlyTransactionMiddleware),
97
+ Middleware(AuditMiddleware, audit_utility_getter=get_audit),
93
98
  ]
94
99
  if running_settings.debug:
95
100
  middleware.append(Middleware(ProcessTimeHeaderMiddleware))
@@ -97,8 +102,11 @@ def application_factory(settings: Settings) -> FastAPI:
97
102
  fastapi_settings = dict(
98
103
  debug=running_settings.debug,
99
104
  middleware=middleware,
100
- on_startup=[initialize],
101
- on_shutdown=[finalize],
105
+ lifespan=lifespan,
106
+ exception_handlers={
107
+ Exception: global_exception_handler,
108
+ ClientDisconnect: client_disconnect_handler,
109
+ },
102
110
  )
103
111
 
104
112
  base_app = FastAPI(title="NucliaDB API", **fastapi_settings) # type: ignore
@@ -131,9 +139,7 @@ def application_factory(settings: Settings) -> FastAPI:
131
139
  # mount admin app assets
132
140
  application.mount(
133
141
  "/admin",
134
- StaticFiles(
135
- directory=os.path.dirname(nucliadb_admin_assets.__file__), html=True
136
- ),
142
+ StaticFiles(directory=os.path.dirname(nucliadb_admin_assets.__file__), html=True),
137
143
  name="static",
138
144
  )
139
145
  # redirect /contributor -> /admin
@@ -149,7 +155,4 @@ def application_factory(settings: Settings) -> FastAPI:
149
155
  if isinstance(route, Mount):
150
156
  route.app.settings = settings # type: ignore
151
157
 
152
- # Inject application context into the fastapi app's state
153
- set_app_context(application)
154
-
155
158
  return application