nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431) hide show
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,23 +18,14 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import asyncio
21
- from enum import Enum
22
- from typing import Any, Optional, TypeVar, Union, overload
21
+ import json
22
+ from enum import Enum, auto
23
+ from typing import Any, Optional, Sequence, TypeVar, Union, overload
23
24
 
24
25
  from fastapi import HTTPException
26
+ from google.protobuf.json_format import MessageToDict
25
27
  from grpc import StatusCode as GrpcStatusCode
26
28
  from grpc.aio import AioRpcError
27
- from nucliadb_protos.nodereader_pb2 import (
28
- ParagraphSearchRequest,
29
- ParagraphSearchResponse,
30
- RelationSearchRequest,
31
- RelationSearchResponse,
32
- SearchRequest,
33
- SearchResponse,
34
- SuggestRequest,
35
- SuggestResponse,
36
- )
37
- from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
38
29
 
39
30
  from nucliadb.common.cluster import manager as cluster_manager
40
31
  from nucliadb.common.cluster.base import AbstractIndexNode
@@ -42,64 +33,51 @@ from nucliadb.common.cluster.exceptions import ShardsNotFound
42
33
  from nucliadb.common.cluster.utils import get_shard_manager
43
34
  from nucliadb.search import logger
44
35
  from nucliadb.search.search.shards import (
45
- query_paragraph_shard,
46
36
  query_shard,
47
- relations_shard,
48
37
  suggest_shard,
49
38
  )
50
39
  from nucliadb.search.settings import settings
40
+ from nucliadb_protos.nodereader_pb2 import (
41
+ SearchRequest,
42
+ SearchResponse,
43
+ SuggestRequest,
44
+ SuggestResponse,
45
+ )
46
+ from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
51
47
  from nucliadb_telemetry import errors
52
48
  from nucliadb_utils import const
53
49
  from nucliadb_utils.utilities import has_feature
54
50
 
55
51
 
56
52
  class Method(Enum):
57
- SEARCH = 1
58
- PARAGRAPH = 2
59
- SUGGEST = 3
60
- RELATIONS = 4
53
+ SEARCH = auto()
54
+ SUGGEST = auto()
61
55
 
62
56
 
63
57
  METHODS = {
64
58
  Method.SEARCH: query_shard,
65
- Method.PARAGRAPH: query_paragraph_shard,
66
59
  Method.SUGGEST: suggest_shard,
67
- Method.RELATIONS: relations_shard,
68
60
  }
69
61
 
70
- REQUEST_TYPE = Union[
71
- SuggestRequest, ParagraphSearchRequest, SearchRequest, RelationSearchRequest
72
- ]
62
+ REQUEST_TYPE = Union[SuggestRequest, SearchRequest]
73
63
 
74
64
  T = TypeVar(
75
65
  "T",
76
66
  SuggestResponse,
77
- ParagraphSearchResponse,
78
67
  SearchResponse,
79
- RelationSearchResponse,
80
68
  )
81
69
 
82
70
 
83
- @overload # type: ignore
84
- async def node_query(
85
- kbid: str,
86
- method: Method,
87
- pb_query: SuggestRequest,
88
- target_shard_replicas: Optional[list[str]] = None,
89
- use_read_replica_nodes: bool = True,
90
- ) -> tuple[list[SuggestResponse], bool, list[tuple[AbstractIndexNode, str]]]:
91
- ...
92
-
93
-
94
71
  @overload
95
72
  async def node_query(
96
73
  kbid: str,
97
74
  method: Method,
98
- pb_query: ParagraphSearchRequest,
75
+ pb_query: SuggestRequest,
99
76
  target_shard_replicas: Optional[list[str]] = None,
100
77
  use_read_replica_nodes: bool = True,
101
- ) -> tuple[list[ParagraphSearchResponse], bool, list[tuple[AbstractIndexNode, str]]]:
102
- ...
78
+ timeout: Optional[float] = None,
79
+ retry_on_primary: bool = True,
80
+ ) -> tuple[list[SuggestResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
103
81
 
104
82
 
105
83
  @overload
@@ -109,19 +87,9 @@ async def node_query(
109
87
  pb_query: SearchRequest,
110
88
  target_shard_replicas: Optional[list[str]] = None,
111
89
  use_read_replica_nodes: bool = True,
112
- ) -> tuple[list[SearchResponse], bool, list[tuple[AbstractIndexNode, str]]]:
113
- ...
114
-
115
-
116
- @overload
117
- async def node_query(
118
- kbid: str,
119
- method: Method,
120
- pb_query: RelationSearchRequest,
121
- target_shard_replicas: Optional[list[str]] = None,
122
- use_read_replica_nodes: bool = True,
123
- ) -> tuple[list[RelationSearchResponse], bool, list[tuple[AbstractIndexNode, str]]]:
124
- ...
90
+ timeout: Optional[float] = None,
91
+ retry_on_primary: bool = True,
92
+ ) -> tuple[list[SearchResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
125
93
 
126
94
 
127
95
  async def node_query(
@@ -130,7 +98,10 @@ async def node_query(
130
98
  pb_query: REQUEST_TYPE,
131
99
  target_shard_replicas: Optional[list[str]] = None,
132
100
  use_read_replica_nodes: bool = True,
133
- ) -> tuple[list[T], bool, list[tuple[AbstractIndexNode, str]]]:
101
+ timeout: Optional[float] = None,
102
+ retry_on_primary: bool = True,
103
+ ) -> tuple[Sequence[Union[T, BaseException]], bool, list[tuple[AbstractIndexNode, str]]]:
104
+ timeout = timeout or settings.search_timeout
134
105
  use_read_replica_nodes = use_read_replica_nodes and has_feature(
135
106
  const.Features.READ_REPLICA_SEARCHES, context={"kbid": kbid}
136
107
  )
@@ -152,6 +123,7 @@ async def node_query(
152
123
  try:
153
124
  node, shard_id = cluster_manager.choose_node(
154
125
  shard_obj,
126
+ use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid}),
155
127
  use_read_replica_nodes=use_read_replica_nodes,
156
128
  target_shard_replicas=target_shard_replicas,
157
129
  )
@@ -173,9 +145,9 @@ async def node_query(
173
145
  )
174
146
 
175
147
  try:
176
- results = await asyncio.wait_for( # type: ignore
177
- asyncio.gather(*ops, return_exceptions=True), # type: ignore
178
- timeout=settings.search_timeout,
148
+ results: list[Union[T, BaseException]] = await asyncio.wait_for(
149
+ asyncio.gather(*ops, return_exceptions=True),
150
+ timeout=timeout,
179
151
  )
180
152
  except asyncio.TimeoutError as exc: # pragma: no cover
181
153
  logger.warning(
@@ -186,10 +158,20 @@ async def node_query(
186
158
 
187
159
  error = validate_node_query_results(results or [])
188
160
  if error is not None:
161
+ query_dict = MessageToDict(pb_query)
162
+ query_dict.pop("vector", None)
163
+ logger.error(
164
+ "Error while querying nodes",
165
+ extra={
166
+ "kbid": kbid,
167
+ "query": json.dumps(query_dict),
168
+ },
169
+ )
189
170
  if (
190
171
  error.status_code >= 500
191
172
  and use_read_replica_nodes
192
173
  and any([node.is_read_replica() for node, _ in queried_nodes])
174
+ and retry_on_primary
193
175
  ):
194
176
  # We had an error querying a secondary node, instead of raising an
195
177
  # error directly, retry query to primaries and hope it works
@@ -220,9 +202,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
220
202
  Handling of exception is responsibility of caller.
221
203
  """
222
204
  if results is None or len(results) == 0:
223
- return HTTPException(
224
- status_code=500, detail=f"Error while executing shard queries. No results."
225
- )
205
+ return HTTPException(status_code=500, detail=f"Error while executing shard queries. No results.")
226
206
 
227
207
  for result in results:
228
208
  if isinstance(result, Exception):
@@ -251,9 +231,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
251
231
  return None
252
232
 
253
233
 
254
- def debug_nodes_info(
255
- nodes: list[tuple[AbstractIndexNode, str]]
256
- ) -> list[dict[str, str]]:
234
+ def debug_nodes_info(nodes: list[tuple[AbstractIndexNode, str]]) -> list[dict[str, str]]:
257
235
  details: list[dict[str, str]] = []
258
236
  for node, shard_id in nodes:
259
237
  info = {
@@ -18,42 +18,77 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import asyncio
21
+ import contextlib
22
+ import logging
21
23
  from contextvars import ContextVar
22
24
  from typing import Optional
23
25
 
24
- from lru import LRU # type: ignore
26
+ from lru import LRU
25
27
 
26
- from nucliadb.common.maindb.driver import Transaction
28
+ from nucliadb.common.ids import FieldId
29
+ from nucliadb.common.maindb.utils import get_driver
30
+ from nucliadb.ingest.fields.base import Field
27
31
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
28
32
  from nucliadb.ingest.orm.resource import Resource as ResourceORM
29
- from nucliadb.middleware.transaction import get_read_only_transaction
30
33
  from nucliadb.search import SERVICE_NAME
34
+ from nucliadb_protos.utils_pb2 import ExtractedText
31
35
  from nucliadb_telemetry import metrics
32
36
  from nucliadb_utils.utilities import get_storage
33
37
 
34
- rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar(
35
- "rcache", default=None
36
- )
38
+ logger = logging.getLogger(__name__)
39
+
40
+ rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar("rcache", default=None)
41
+ etcache: ContextVar[Optional["ExtractedTextCache"]] = ContextVar("etcache", default=None)
37
42
 
38
43
 
39
44
  RESOURCE_LOCKS: dict[str, asyncio.Lock] = LRU(1000) # type: ignore
40
45
  RESOURCE_CACHE_OPS = metrics.Counter("nucliadb_resource_cache_ops", labels={"type": ""})
46
+ EXTRACTED_CACHE_OPS = metrics.Counter("nucliadb_extracted_text_cache_ops", labels={"type": ""})
47
+
48
+
49
def set_extracted_text_cache() -> None:
    """
    Store a brand new, empty ``ExtractedTextCache`` in the ``etcache`` context variable.
    """
    etcache.set(ExtractedTextCache())
52
+
53
+
54
def get_extracted_text_cache() -> Optional["ExtractedTextCache"]:
    """
    Return whatever ``etcache`` currently holds.

    The result is ``None`` when no cache has been set, since ``None`` is the
    context variable's default.
    """
    current = etcache.get()
    return current
56
+
57
+
58
def clear_extracted_text_cache() -> None:
    """
    Empty the extracted text cache held by ``etcache`` (if any) and reset the
    context variable to ``None``.
    """
    cache = etcache.get()
    if cache is None:
        # Nothing to clean up: ``etcache`` already reads as ``None``.
        return
    cache.clear()
    etcache.set(None)
63
+
41
64
 
65
def set_resource_cache() -> None:
    """
    Bind a new, empty resource dictionary to the ``rcache`` context variable.
    """
    rcache.set({})
42
68
 
43
- def get_resource_cache(clear: bool = False) -> dict[str, ResourceORM]:
44
- value: Optional[dict[str, ResourceORM]] = rcache.get()
45
- if value is None or clear:
46
- value = {}
47
- rcache.set(value)
48
- return value
49
69
 
70
def get_resource_cache() -> Optional[dict[str, ResourceORM]]:
    """
    Return whatever ``rcache`` currently holds.

    The result is ``None`` when no cache has been set, since ``None`` is the
    context variable's default.
    """
    current = rcache.get()
    return current
50
72
 
51
- async def get_resource_from_cache(
52
- kbid: str, uuid: str, txn: Optional[Transaction] = None
53
- ) -> Optional[ResourceORM]:
73
+
74
def clear_resource_cache() -> None:
    """
    Empty the resource dictionary held by ``rcache`` (if any) and reset the
    context variable to ``None``.
    """
    cache = rcache.get()
    if cache is None:
        # Nothing to clean up: ``rcache`` already reads as ``None``.
        return
    cache.clear()
    rcache.set(None)
79
+
80
+
81
+ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
82
+ """
83
+ Will try to get the resource from the cache, if it's not there it will fetch it from the ORM and cache it.
84
+ """
54
85
  orm_resource: Optional[ResourceORM] = None
55
86
 
56
87
  resource_cache = get_resource_cache()
88
+ if resource_cache is None:
89
+ RESOURCE_CACHE_OPS.inc({"type": "miss"})
90
+ logger.warning("Resource cache not set")
91
+ return await _orm_get_resource(kbid, uuid)
57
92
 
58
93
  if uuid not in RESOURCE_LOCKS:
59
94
  RESOURCE_LOCKS[uuid] = asyncio.Lock()
@@ -61,11 +96,7 @@ async def get_resource_from_cache(
61
96
  async with RESOURCE_LOCKS[uuid]:
62
97
  if uuid not in resource_cache:
63
98
  RESOURCE_CACHE_OPS.inc({"type": "miss"})
64
- if txn is None:
65
- txn = await get_read_only_transaction()
66
- storage = await get_storage(service_name=SERVICE_NAME)
67
- kb = KnowledgeBoxORM(txn, storage, kbid)
68
- orm_resource = await kb.get(uuid)
99
+ orm_resource = await _orm_get_resource(kbid, uuid)
69
100
  else:
70
101
  RESOURCE_CACHE_OPS.inc({"type": "hit"})
71
102
 
@@ -75,3 +106,101 @@ async def get_resource_from_cache(
75
106
  orm_resource = resource_cache.get(uuid)
76
107
 
77
108
  return orm_resource
109
+
110
+
111
async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
    """
    Load resource ``uuid`` of knowledge box ``kbid`` through the ORM, without
    touching any cache.

    The lookup runs inside a read-only transaction obtained from the driver and
    returns whatever ``KnowledgeBoxORM.get`` yields for ``uuid``.
    """
    driver = get_driver()
    async with driver.transaction(read_only=True) as txn:
        storage = await get_storage(service_name=SERVICE_NAME)
        knowledge_box = KnowledgeBoxORM(txn, storage, kbid)
        resource = await knowledge_box.get(uuid)
    return resource
116
+
117
+
118
class ExtractedTextCache:
    """
    Used to cache extracted text from a resource in memory during the process
    of search results hydration.

    This is needed to avoid fetching the same extracted text multiple times,
    as matching text blocks are processed in parallel and the extracted text is
    fetched for each field where the text block is found.

    Both the cached values and the locks guarding them are indexed by the same
    caller-chosen string key.
    """

    def __init__(self) -> None:
        # One lock per key, created on first request by ``get_lock``.
        self.locks: dict[str, asyncio.Lock] = {}
        # Extracted text stored so far, by key.
        self.values: dict[str, "ExtractedText"] = {}

    def get_value(self, key: str) -> Optional["ExtractedText"]:
        """
        Return the value stored under ``key``, or ``None`` if there is none.
        """
        return self.values.get(key)

    def get_lock(self, key: str) -> asyncio.Lock:
        """
        Return the lock associated with ``key``, creating it on first use.

        Repeated calls with the same key return the same lock object.
        """
        lock = self.locks.get(key)
        if lock is None:
            # Create the lock lazily: ``setdefault(key, asyncio.Lock())`` would
            # build (and discard) a new Lock on every call, even on a hit.
            lock = self.locks[key] = asyncio.Lock()
        return lock

    def set_value(self, key: str, value: "ExtractedText") -> None:
        """
        Store ``value`` under ``key``, replacing any previous value.
        """
        self.values[key] = value

    def clear(self) -> None:
        """
        Drop every stored value and every lock.
        """
        self.values.clear()
        self.locks.clear()
+
145
+
146
async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
    """
    Return the extracted text of ``field``, using the extracted text cache
    when one is set.

    Without a cache, a warning is logged and the text is fetched straight from
    the field. With a cache, a stored value is returned directly; otherwise the
    text is fetched while holding the per-key lock and stored only if it is not
    ``None``. Each call increments ``EXTRACTED_CACHE_OPS`` once, as a "hit" or
    a "miss".
    """
    et_cache = get_extracted_text_cache()
    if et_cache is None:
        logger.warning("Extracted text cache not set")
        EXTRACTED_CACHE_OPS.inc({"type": "miss"})
        return await field.get_extracted_text()

    # NOTE(review): the key does not include the field type; confirm that field
    # ids cannot collide across field types within the same resource.
    cache_key = f"{field.kbid}/{field.uuid}/{field.id}"

    # Fast path: no locking when the value is already cached.
    cached = et_cache.get_value(cache_key)
    if cached is None:
        async with et_cache.get_lock(cache_key):
            # Look again under the lock: another task may have stored the
            # value while this one was waiting.
            cached = et_cache.get_value(cache_key)
            if cached is None:
                EXTRACTED_CACHE_OPS.inc({"type": "miss"})
                fetched = await field.get_extracted_text()
                if fetched is not None:
                    # A missing extracted text is never cached.
                    et_cache.set_value(cache_key, fetched)
                return fetched

    EXTRACTED_CACHE_OPS.inc({"type": "hit"})
    return cached
172
+
173
+
174
async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
    """
    Resolve ``field`` to its field object and return that field's extracted text.

    The owning resource (``field.rid``) is obtained via ``get_resource``; if it
    is ``None`` this returns ``None``. The field object is requested with
    ``load=False`` and its text is read through ``get_field_extracted_text``.
    """
    resource = await get_resource(kbid, field.rid)
    if resource is None:
        return None
    field_obj = await resource.get_field(key=field.key, type=field.pb_type, load=False)
    return await get_field_extracted_text(field_obj)
185
+
186
+
187
@contextlib.contextmanager
def request_caches():
    """
    Set up the resource cache and the extracted text cache for a request.

    It should be used at the beginning of a request handler to avoid fetching
    the same resources and extracted text multiple times.

    Both caches are cleared and reset when the context exits, whether the body
    finished normally or raised.

    >>> with request_caches():
    ...     resource = await get_resource(kbid, uuid)
    ...     extracted_text = await get_extracted_text_from_field_id(kbid, field)
    """
    set_resource_cache()
    set_extracted_text_cache()
    try:
        yield
    finally:
        # Always undo the setup so cached data does not outlive the context.
        clear_resource_cache()
        clear_extracted_text_cache()