nucliadb 4.0.0.post542__py3-none-any.whl → 6.2.1.post2798__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (418) hide show
  1. migrations/0003_allfields_key.py +1 -35
  2. migrations/0009_upgrade_relations_and_texts_to_v2.py +4 -2
  3. migrations/0010_fix_corrupt_indexes.py +10 -10
  4. migrations/0011_materialize_labelset_ids.py +1 -16
  5. migrations/0012_rollover_shards.py +5 -10
  6. migrations/0014_rollover_shards.py +4 -5
  7. migrations/0015_targeted_rollover.py +5 -10
  8. migrations/0016_upgrade_to_paragraphs_v2.py +25 -28
  9. migrations/0017_multiple_writable_shards.py +2 -4
  10. migrations/0018_purge_orphan_kbslugs.py +5 -7
  11. migrations/0019_upgrade_to_paragraphs_v3.py +25 -28
  12. migrations/0020_drain_nodes_from_cluster.py +3 -3
  13. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +16 -19
  14. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  15. migrations/0023_backfill_pg_catalog.py +80 -0
  16. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  17. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  18. migrations/0027_rollover_texts3.py +73 -0
  19. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  20. migrations/pg/0002_catalog.py +42 -0
  21. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  22. nucliadb/common/cluster/base.py +30 -16
  23. nucliadb/common/cluster/discovery/base.py +6 -14
  24. nucliadb/common/cluster/discovery/k8s.py +9 -19
  25. nucliadb/common/cluster/discovery/manual.py +1 -3
  26. nucliadb/common/cluster/discovery/utils.py +1 -3
  27. nucliadb/common/cluster/grpc_node_dummy.py +3 -11
  28. nucliadb/common/cluster/index_node.py +10 -19
  29. nucliadb/common/cluster/manager.py +174 -59
  30. nucliadb/common/cluster/rebalance.py +27 -29
  31. nucliadb/common/cluster/rollover.py +353 -194
  32. nucliadb/common/cluster/settings.py +6 -0
  33. nucliadb/common/cluster/standalone/grpc_node_binding.py +13 -64
  34. nucliadb/common/cluster/standalone/index_node.py +4 -11
  35. nucliadb/common/cluster/standalone/service.py +2 -6
  36. nucliadb/common/cluster/standalone/utils.py +2 -6
  37. nucliadb/common/cluster/utils.py +29 -22
  38. nucliadb/common/constants.py +20 -0
  39. nucliadb/common/context/__init__.py +3 -0
  40. nucliadb/common/context/fastapi.py +8 -5
  41. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  42. nucliadb/common/datamanagers/__init__.py +7 -1
  43. nucliadb/common/datamanagers/atomic.py +22 -4
  44. nucliadb/common/datamanagers/cluster.py +5 -5
  45. nucliadb/common/datamanagers/entities.py +6 -16
  46. nucliadb/common/datamanagers/fields.py +84 -0
  47. nucliadb/common/datamanagers/kb.py +83 -37
  48. nucliadb/common/datamanagers/labels.py +26 -56
  49. nucliadb/common/datamanagers/processing.py +2 -6
  50. nucliadb/common/datamanagers/resources.py +41 -103
  51. nucliadb/common/datamanagers/rollover.py +76 -15
  52. nucliadb/common/datamanagers/synonyms.py +1 -1
  53. nucliadb/common/datamanagers/utils.py +15 -6
  54. nucliadb/common/datamanagers/vectorsets.py +110 -0
  55. nucliadb/common/external_index_providers/base.py +257 -0
  56. nucliadb/{ingest/tests/unit/orm/test_orm_utils.py → common/external_index_providers/exceptions.py} +9 -8
  57. nucliadb/common/external_index_providers/manager.py +101 -0
  58. nucliadb/common/external_index_providers/pinecone.py +933 -0
  59. nucliadb/common/external_index_providers/settings.py +52 -0
  60. nucliadb/common/http_clients/auth.py +3 -6
  61. nucliadb/common/http_clients/processing.py +6 -11
  62. nucliadb/common/http_clients/utils.py +1 -3
  63. nucliadb/common/ids.py +240 -0
  64. nucliadb/common/locking.py +29 -7
  65. nucliadb/common/maindb/driver.py +11 -35
  66. nucliadb/common/maindb/exceptions.py +3 -0
  67. nucliadb/common/maindb/local.py +22 -9
  68. nucliadb/common/maindb/pg.py +206 -111
  69. nucliadb/common/maindb/utils.py +11 -42
  70. nucliadb/common/models_utils/from_proto.py +479 -0
  71. nucliadb/common/models_utils/to_proto.py +60 -0
  72. nucliadb/common/nidx.py +260 -0
  73. nucliadb/export_import/datamanager.py +25 -19
  74. nucliadb/export_import/exporter.py +5 -11
  75. nucliadb/export_import/importer.py +5 -7
  76. nucliadb/export_import/models.py +3 -3
  77. nucliadb/export_import/tasks.py +4 -4
  78. nucliadb/export_import/utils.py +25 -37
  79. nucliadb/health.py +1 -3
  80. nucliadb/ingest/app.py +15 -11
  81. nucliadb/ingest/consumer/auditing.py +21 -19
  82. nucliadb/ingest/consumer/consumer.py +82 -47
  83. nucliadb/ingest/consumer/materializer.py +5 -12
  84. nucliadb/ingest/consumer/pull.py +12 -27
  85. nucliadb/ingest/consumer/service.py +19 -17
  86. nucliadb/ingest/consumer/shard_creator.py +2 -4
  87. nucliadb/ingest/consumer/utils.py +1 -3
  88. nucliadb/ingest/fields/base.py +137 -105
  89. nucliadb/ingest/fields/conversation.py +18 -5
  90. nucliadb/ingest/fields/exceptions.py +1 -4
  91. nucliadb/ingest/fields/file.py +7 -16
  92. nucliadb/ingest/fields/link.py +5 -10
  93. nucliadb/ingest/fields/text.py +9 -4
  94. nucliadb/ingest/orm/brain.py +200 -213
  95. nucliadb/ingest/orm/broker_message.py +181 -0
  96. nucliadb/ingest/orm/entities.py +36 -51
  97. nucliadb/ingest/orm/exceptions.py +12 -0
  98. nucliadb/ingest/orm/knowledgebox.py +322 -197
  99. nucliadb/ingest/orm/processor/__init__.py +2 -700
  100. nucliadb/ingest/orm/processor/auditing.py +4 -23
  101. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  102. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  103. nucliadb/ingest/orm/processor/processor.py +752 -0
  104. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  105. nucliadb/ingest/orm/resource.py +249 -403
  106. nucliadb/ingest/orm/utils.py +4 -4
  107. nucliadb/ingest/partitions.py +3 -9
  108. nucliadb/ingest/processing.py +70 -73
  109. nucliadb/ingest/py.typed +0 -0
  110. nucliadb/ingest/serialize.py +37 -167
  111. nucliadb/ingest/service/__init__.py +1 -3
  112. nucliadb/ingest/service/writer.py +185 -412
  113. nucliadb/ingest/settings.py +10 -20
  114. nucliadb/ingest/utils.py +3 -6
  115. nucliadb/learning_proxy.py +242 -55
  116. nucliadb/metrics_exporter.py +30 -19
  117. nucliadb/middleware/__init__.py +1 -3
  118. nucliadb/migrator/command.py +1 -3
  119. nucliadb/migrator/datamanager.py +13 -13
  120. nucliadb/migrator/migrator.py +47 -30
  121. nucliadb/migrator/utils.py +18 -10
  122. nucliadb/purge/__init__.py +139 -33
  123. nucliadb/purge/orphan_shards.py +7 -13
  124. nucliadb/reader/__init__.py +1 -3
  125. nucliadb/reader/api/models.py +1 -12
  126. nucliadb/reader/api/v1/__init__.py +0 -1
  127. nucliadb/reader/api/v1/download.py +21 -88
  128. nucliadb/reader/api/v1/export_import.py +1 -1
  129. nucliadb/reader/api/v1/knowledgebox.py +10 -10
  130. nucliadb/reader/api/v1/learning_config.py +2 -6
  131. nucliadb/reader/api/v1/resource.py +62 -88
  132. nucliadb/reader/api/v1/services.py +64 -83
  133. nucliadb/reader/app.py +12 -29
  134. nucliadb/reader/lifecycle.py +18 -4
  135. nucliadb/reader/py.typed +0 -0
  136. nucliadb/reader/reader/notifications.py +10 -28
  137. nucliadb/search/__init__.py +1 -3
  138. nucliadb/search/api/v1/__init__.py +1 -2
  139. nucliadb/search/api/v1/ask.py +17 -10
  140. nucliadb/search/api/v1/catalog.py +184 -0
  141. nucliadb/search/api/v1/feedback.py +16 -24
  142. nucliadb/search/api/v1/find.py +36 -36
  143. nucliadb/search/api/v1/knowledgebox.py +89 -60
  144. nucliadb/search/api/v1/resource/ask.py +2 -8
  145. nucliadb/search/api/v1/resource/search.py +49 -70
  146. nucliadb/search/api/v1/search.py +44 -210
  147. nucliadb/search/api/v1/suggest.py +39 -54
  148. nucliadb/search/app.py +12 -32
  149. nucliadb/search/lifecycle.py +10 -3
  150. nucliadb/search/predict.py +136 -187
  151. nucliadb/search/py.typed +0 -0
  152. nucliadb/search/requesters/utils.py +25 -58
  153. nucliadb/search/search/cache.py +149 -20
  154. nucliadb/search/search/chat/ask.py +571 -123
  155. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -14
  156. nucliadb/search/search/chat/images.py +41 -17
  157. nucliadb/search/search/chat/prompt.py +817 -266
  158. nucliadb/search/search/chat/query.py +213 -309
  159. nucliadb/{tests/migrations/__init__.py → search/search/cut.py} +8 -8
  160. nucliadb/search/search/fetch.py +43 -36
  161. nucliadb/search/search/filters.py +9 -15
  162. nucliadb/search/search/find.py +214 -53
  163. nucliadb/search/search/find_merge.py +408 -391
  164. nucliadb/search/search/hydrator.py +191 -0
  165. nucliadb/search/search/merge.py +187 -223
  166. nucliadb/search/search/metrics.py +73 -2
  167. nucliadb/search/search/paragraphs.py +64 -106
  168. nucliadb/search/search/pgcatalog.py +233 -0
  169. nucliadb/search/search/predict_proxy.py +1 -1
  170. nucliadb/search/search/query.py +305 -150
  171. nucliadb/search/search/query_parser/exceptions.py +22 -0
  172. nucliadb/search/search/query_parser/models.py +101 -0
  173. nucliadb/search/search/query_parser/parser.py +183 -0
  174. nucliadb/search/search/rank_fusion.py +204 -0
  175. nucliadb/search/search/rerankers.py +270 -0
  176. nucliadb/search/search/shards.py +3 -32
  177. nucliadb/search/search/summarize.py +7 -18
  178. nucliadb/search/search/utils.py +27 -4
  179. nucliadb/search/settings.py +15 -1
  180. nucliadb/standalone/api_router.py +4 -10
  181. nucliadb/standalone/app.py +8 -14
  182. nucliadb/standalone/auth.py +7 -21
  183. nucliadb/standalone/config.py +7 -10
  184. nucliadb/standalone/lifecycle.py +26 -25
  185. nucliadb/standalone/migrations.py +1 -3
  186. nucliadb/standalone/purge.py +1 -1
  187. nucliadb/standalone/py.typed +0 -0
  188. nucliadb/standalone/run.py +3 -6
  189. nucliadb/standalone/settings.py +9 -16
  190. nucliadb/standalone/versions.py +15 -5
  191. nucliadb/tasks/consumer.py +8 -12
  192. nucliadb/tasks/producer.py +7 -6
  193. nucliadb/tests/config.py +53 -0
  194. nucliadb/train/__init__.py +1 -3
  195. nucliadb/train/api/utils.py +1 -2
  196. nucliadb/train/api/v1/shards.py +1 -1
  197. nucliadb/train/api/v1/trainset.py +2 -4
  198. nucliadb/train/app.py +10 -31
  199. nucliadb/train/generator.py +10 -19
  200. nucliadb/train/generators/field_classifier.py +7 -19
  201. nucliadb/train/generators/field_streaming.py +156 -0
  202. nucliadb/train/generators/image_classifier.py +12 -18
  203. nucliadb/train/generators/paragraph_classifier.py +5 -9
  204. nucliadb/train/generators/paragraph_streaming.py +6 -9
  205. nucliadb/train/generators/question_answer_streaming.py +19 -20
  206. nucliadb/train/generators/sentence_classifier.py +9 -15
  207. nucliadb/train/generators/token_classifier.py +48 -39
  208. nucliadb/train/generators/utils.py +14 -18
  209. nucliadb/train/lifecycle.py +7 -3
  210. nucliadb/train/nodes.py +23 -32
  211. nucliadb/train/py.typed +0 -0
  212. nucliadb/train/servicer.py +13 -21
  213. nucliadb/train/settings.py +2 -6
  214. nucliadb/train/types.py +13 -10
  215. nucliadb/train/upload.py +3 -6
  216. nucliadb/train/uploader.py +19 -23
  217. nucliadb/train/utils.py +1 -1
  218. nucliadb/writer/__init__.py +1 -3
  219. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  220. nucliadb/writer/api/v1/export_import.py +67 -14
  221. nucliadb/writer/api/v1/field.py +16 -269
  222. nucliadb/writer/api/v1/knowledgebox.py +218 -68
  223. nucliadb/writer/api/v1/resource.py +68 -88
  224. nucliadb/writer/api/v1/services.py +51 -70
  225. nucliadb/writer/api/v1/slug.py +61 -0
  226. nucliadb/writer/api/v1/transaction.py +67 -0
  227. nucliadb/writer/api/v1/upload.py +143 -117
  228. nucliadb/writer/app.py +6 -43
  229. nucliadb/writer/back_pressure.py +16 -38
  230. nucliadb/writer/exceptions.py +0 -4
  231. nucliadb/writer/lifecycle.py +21 -15
  232. nucliadb/writer/py.typed +0 -0
  233. nucliadb/writer/resource/audit.py +2 -1
  234. nucliadb/writer/resource/basic.py +48 -46
  235. nucliadb/writer/resource/field.py +37 -128
  236. nucliadb/writer/resource/origin.py +1 -2
  237. nucliadb/writer/settings.py +6 -2
  238. nucliadb/writer/tus/__init__.py +17 -15
  239. nucliadb/writer/tus/azure.py +111 -0
  240. nucliadb/writer/tus/dm.py +17 -5
  241. nucliadb/writer/tus/exceptions.py +1 -3
  242. nucliadb/writer/tus/gcs.py +49 -84
  243. nucliadb/writer/tus/local.py +21 -37
  244. nucliadb/writer/tus/s3.py +28 -68
  245. nucliadb/writer/tus/storage.py +5 -56
  246. nucliadb/writer/vectorsets.py +125 -0
  247. nucliadb-6.2.1.post2798.dist-info/METADATA +148 -0
  248. nucliadb-6.2.1.post2798.dist-info/RECORD +343 -0
  249. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/WHEEL +1 -1
  250. nucliadb/common/maindb/redis.py +0 -194
  251. nucliadb/common/maindb/tikv.py +0 -433
  252. nucliadb/ingest/fields/layout.py +0 -58
  253. nucliadb/ingest/tests/conftest.py +0 -30
  254. nucliadb/ingest/tests/fixtures.py +0 -764
  255. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  256. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -78
  257. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -126
  258. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  259. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  260. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  261. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -684
  262. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  263. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  264. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  265. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -139
  266. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  267. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  268. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -140
  269. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  270. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  271. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  272. nucliadb/ingest/tests/unit/orm/test_brain_vectors.py +0 -74
  273. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  274. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -331
  275. nucliadb/ingest/tests/unit/test_cache.py +0 -31
  276. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  277. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  278. nucliadb/middleware/transaction.py +0 -117
  279. nucliadb/reader/api/v1/learning_collector.py +0 -63
  280. nucliadb/reader/tests/__init__.py +0 -19
  281. nucliadb/reader/tests/conftest.py +0 -31
  282. nucliadb/reader/tests/fixtures.py +0 -136
  283. nucliadb/reader/tests/test_list_resources.py +0 -75
  284. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  285. nucliadb/reader/tests/test_reader_resource.py +0 -353
  286. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  287. nucliadb/search/api/v1/chat.py +0 -263
  288. nucliadb/search/api/v1/resource/chat.py +0 -174
  289. nucliadb/search/tests/__init__.py +0 -19
  290. nucliadb/search/tests/conftest.py +0 -33
  291. nucliadb/search/tests/fixtures.py +0 -199
  292. nucliadb/search/tests/node.py +0 -466
  293. nucliadb/search/tests/unit/__init__.py +0 -18
  294. nucliadb/search/tests/unit/api/__init__.py +0 -19
  295. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  296. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  297. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -98
  298. nucliadb/search/tests/unit/api/v1/test_ask.py +0 -120
  299. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  300. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  301. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -99
  302. nucliadb/search/tests/unit/search/__init__.py +0 -18
  303. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  304. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -211
  305. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  306. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  307. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  308. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -270
  309. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  310. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  311. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  312. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  313. nucliadb/search/tests/unit/search/test_query.py +0 -153
  314. nucliadb/search/tests/unit/test_app.py +0 -79
  315. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  316. nucliadb/search/tests/unit/test_merge.py +0 -34
  317. nucliadb/search/tests/unit/test_predict.py +0 -525
  318. nucliadb/standalone/tests/__init__.py +0 -19
  319. nucliadb/standalone/tests/conftest.py +0 -33
  320. nucliadb/standalone/tests/fixtures.py +0 -38
  321. nucliadb/standalone/tests/unit/__init__.py +0 -18
  322. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  323. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  324. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  325. nucliadb/standalone/tests/unit/test_migrations.py +0 -63
  326. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  327. nucliadb/tests/benchmarks/__init__.py +0 -19
  328. nucliadb/tests/benchmarks/test_search.py +0 -99
  329. nucliadb/tests/conftest.py +0 -32
  330. nucliadb/tests/fixtures.py +0 -735
  331. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -202
  332. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -107
  333. nucliadb/tests/migrations/test_migration_0017.py +0 -76
  334. nucliadb/tests/migrations/test_migration_0018.py +0 -95
  335. nucliadb/tests/tikv.py +0 -240
  336. nucliadb/tests/unit/__init__.py +0 -19
  337. nucliadb/tests/unit/common/__init__.py +0 -19
  338. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  339. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  340. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -172
  341. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  342. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -114
  343. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -61
  344. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -408
  345. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -173
  346. nucliadb/tests/unit/common/cluster/test_rebalance.py +0 -38
  347. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -282
  348. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  349. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  350. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  351. nucliadb/tests/unit/common/maindb/test_utils.py +0 -92
  352. nucliadb/tests/unit/common/test_context.py +0 -36
  353. nucliadb/tests/unit/export_import/__init__.py +0 -19
  354. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  355. nucliadb/tests/unit/export_import/test_utils.py +0 -301
  356. nucliadb/tests/unit/migrator/__init__.py +0 -19
  357. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  358. nucliadb/tests/unit/tasks/__init__.py +0 -19
  359. nucliadb/tests/unit/tasks/conftest.py +0 -42
  360. nucliadb/tests/unit/tasks/test_consumer.py +0 -92
  361. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  362. nucliadb/tests/unit/tasks/test_tasks.py +0 -58
  363. nucliadb/tests/unit/test_field_ids.py +0 -49
  364. nucliadb/tests/unit/test_health.py +0 -86
  365. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  366. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  367. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  368. nucliadb/tests/unit/test_purge.py +0 -136
  369. nucliadb/tests/utils/__init__.py +0 -74
  370. nucliadb/tests/utils/aiohttp_session.py +0 -44
  371. nucliadb/tests/utils/broker_messages/__init__.py +0 -171
  372. nucliadb/tests/utils/broker_messages/fields.py +0 -197
  373. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  374. nucliadb/tests/utils/entities.py +0 -78
  375. nucliadb/train/api/v1/check.py +0 -60
  376. nucliadb/train/tests/__init__.py +0 -19
  377. nucliadb/train/tests/conftest.py +0 -29
  378. nucliadb/train/tests/fixtures.py +0 -342
  379. nucliadb/train/tests/test_field_classification.py +0 -122
  380. nucliadb/train/tests/test_get_entities.py +0 -80
  381. nucliadb/train/tests/test_get_info.py +0 -51
  382. nucliadb/train/tests/test_get_ontology.py +0 -34
  383. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  384. nucliadb/train/tests/test_image_classification.py +0 -221
  385. nucliadb/train/tests/test_list_fields.py +0 -39
  386. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  387. nucliadb/train/tests/test_list_resources.py +0 -39
  388. nucliadb/train/tests/test_list_sentences.py +0 -71
  389. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  390. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  391. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  392. nucliadb/train/tests/test_sentence_classification.py +0 -143
  393. nucliadb/train/tests/test_token_classification.py +0 -136
  394. nucliadb/train/tests/utils.py +0 -101
  395. nucliadb/writer/layouts/__init__.py +0 -51
  396. nucliadb/writer/layouts/v1.py +0 -59
  397. nucliadb/writer/tests/__init__.py +0 -19
  398. nucliadb/writer/tests/conftest.py +0 -31
  399. nucliadb/writer/tests/fixtures.py +0 -191
  400. nucliadb/writer/tests/test_fields.py +0 -475
  401. nucliadb/writer/tests/test_files.py +0 -740
  402. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  403. nucliadb/writer/tests/test_reprocess_file_field.py +0 -133
  404. nucliadb/writer/tests/test_resources.py +0 -476
  405. nucliadb/writer/tests/test_service.py +0 -137
  406. nucliadb/writer/tests/test_tus.py +0 -203
  407. nucliadb/writer/tests/utils.py +0 -35
  408. nucliadb/writer/tus/pg.py +0 -125
  409. nucliadb-4.0.0.post542.dist-info/METADATA +0 -135
  410. nucliadb-4.0.0.post542.dist-info/RECORD +0 -462
  411. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  412. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  413. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  414. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  415. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  416. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/entry_points.txt +0 -0
  417. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/top_level.txt +0 -0
  418. {nucliadb-4.0.0.post542.dist-info → nucliadb-6.2.1.post2798.dist-info}/zip-safe +0 -0
@@ -19,24 +19,13 @@
19
19
 
20
20
  import asyncio
21
21
  import json
22
- from enum import Enum
22
+ from enum import Enum, auto
23
23
  from typing import Any, Optional, Sequence, TypeVar, Union, overload
24
24
 
25
25
  from fastapi import HTTPException
26
26
  from google.protobuf.json_format import MessageToDict
27
27
  from grpc import StatusCode as GrpcStatusCode
28
28
  from grpc.aio import AioRpcError
29
- from nucliadb_protos.nodereader_pb2 import (
30
- ParagraphSearchRequest,
31
- ParagraphSearchResponse,
32
- RelationSearchRequest,
33
- RelationSearchResponse,
34
- SearchRequest,
35
- SearchResponse,
36
- SuggestRequest,
37
- SuggestResponse,
38
- )
39
- from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
40
29
 
41
30
  from nucliadb.common.cluster import manager as cluster_manager
42
31
  from nucliadb.common.cluster.base import AbstractIndexNode
@@ -44,66 +33,53 @@ from nucliadb.common.cluster.exceptions import ShardsNotFound
44
33
  from nucliadb.common.cluster.utils import get_shard_manager
45
34
  from nucliadb.search import logger
46
35
  from nucliadb.search.search.shards import (
47
- query_paragraph_shard,
48
36
  query_shard,
49
- relations_shard,
50
37
  suggest_shard,
51
38
  )
52
39
  from nucliadb.search.settings import settings
40
+ from nucliadb_protos.nodereader_pb2 import (
41
+ SearchRequest,
42
+ SearchResponse,
43
+ SuggestRequest,
44
+ SuggestResponse,
45
+ )
46
+ from nucliadb_protos.writer_pb2 import ShardObject as PBShardObject
53
47
  from nucliadb_telemetry import errors
54
48
  from nucliadb_utils import const
55
49
  from nucliadb_utils.utilities import has_feature
56
50
 
57
51
 
58
52
  class Method(Enum):
59
- SEARCH = 1
60
- PARAGRAPH = 2
61
- SUGGEST = 3
62
- RELATIONS = 4
53
+ SEARCH = auto()
54
+ SUGGEST = auto()
63
55
 
64
56
 
65
57
  METHODS = {
66
58
  Method.SEARCH: query_shard,
67
- Method.PARAGRAPH: query_paragraph_shard,
68
59
  Method.SUGGEST: suggest_shard,
69
- Method.RELATIONS: relations_shard,
70
60
  }
71
61
 
72
- REQUEST_TYPE = Union[
73
- SuggestRequest, ParagraphSearchRequest, SearchRequest, RelationSearchRequest
74
- ]
62
+ REQUEST_TYPE = Union[SuggestRequest, SearchRequest]
75
63
 
76
64
  T = TypeVar(
77
65
  "T",
78
66
  SuggestResponse,
79
- ParagraphSearchResponse,
80
67
  SearchResponse,
81
- RelationSearchResponse,
82
68
  )
83
69
 
84
70
 
85
- @overload # type: ignore
71
+ @overload
86
72
  async def node_query(
87
73
  kbid: str,
88
74
  method: Method,
89
75
  pb_query: SuggestRequest,
90
76
  target_shard_replicas: Optional[list[str]] = None,
91
77
  use_read_replica_nodes: bool = True,
78
+ timeout: Optional[float] = None,
79
+ retry_on_primary: bool = True,
92
80
  ) -> tuple[list[SuggestResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
93
81
 
94
82
 
95
- @overload
96
- async def node_query(
97
- kbid: str,
98
- method: Method,
99
- pb_query: ParagraphSearchRequest,
100
- target_shard_replicas: Optional[list[str]] = None,
101
- use_read_replica_nodes: bool = True,
102
- ) -> tuple[
103
- list[ParagraphSearchResponse], bool, list[tuple[AbstractIndexNode, str]]
104
- ]: ...
105
-
106
-
107
83
  @overload
108
84
  async def node_query(
109
85
  kbid: str,
@@ -111,28 +87,21 @@ async def node_query(
111
87
  pb_query: SearchRequest,
112
88
  target_shard_replicas: Optional[list[str]] = None,
113
89
  use_read_replica_nodes: bool = True,
90
+ timeout: Optional[float] = None,
91
+ retry_on_primary: bool = True,
114
92
  ) -> tuple[list[SearchResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
115
93
 
116
94
 
117
- @overload
118
- async def node_query(
119
- kbid: str,
120
- method: Method,
121
- pb_query: RelationSearchRequest,
122
- target_shard_replicas: Optional[list[str]] = None,
123
- use_read_replica_nodes: bool = True,
124
- ) -> tuple[list[RelationSearchResponse], bool, list[tuple[AbstractIndexNode, str]]]: ...
125
-
126
-
127
95
  async def node_query(
128
96
  kbid: str,
129
97
  method: Method,
130
98
  pb_query: REQUEST_TYPE,
131
99
  target_shard_replicas: Optional[list[str]] = None,
132
100
  use_read_replica_nodes: bool = True,
133
- ) -> tuple[
134
- Sequence[Union[T, BaseException]], bool, list[tuple[AbstractIndexNode, str]]
135
- ]:
101
+ timeout: Optional[float] = None,
102
+ retry_on_primary: bool = True,
103
+ ) -> tuple[Sequence[Union[T, BaseException]], bool, list[tuple[AbstractIndexNode, str]]]:
104
+ timeout = timeout or settings.search_timeout
136
105
  use_read_replica_nodes = use_read_replica_nodes and has_feature(
137
106
  const.Features.READ_REPLICA_SEARCHES, context={"kbid": kbid}
138
107
  )
@@ -154,6 +123,7 @@ async def node_query(
154
123
  try:
155
124
  node, shard_id = cluster_manager.choose_node(
156
125
  shard_obj,
126
+ use_nidx=has_feature(const.Features.NIDX_READS, context={"kbid": kbid}),
157
127
  use_read_replica_nodes=use_read_replica_nodes,
158
128
  target_shard_replicas=target_shard_replicas,
159
129
  )
@@ -177,7 +147,7 @@ async def node_query(
177
147
  try:
178
148
  results: list[Union[T, BaseException]] = await asyncio.wait_for(
179
149
  asyncio.gather(*ops, return_exceptions=True),
180
- timeout=settings.search_timeout,
150
+ timeout=timeout,
181
151
  )
182
152
  except asyncio.TimeoutError as exc: # pragma: no cover
183
153
  logger.warning(
@@ -201,6 +171,7 @@ async def node_query(
201
171
  error.status_code >= 500
202
172
  and use_read_replica_nodes
203
173
  and any([node.is_read_replica() for node, _ in queried_nodes])
174
+ and retry_on_primary
204
175
  ):
205
176
  # We had an error querying a secondary node, instead of raising an
206
177
  # error directly, retry query to primaries and hope it works
@@ -231,9 +202,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
231
202
  Handling of exception is responsibility of caller.
232
203
  """
233
204
  if results is None or len(results) == 0:
234
- return HTTPException(
235
- status_code=500, detail=f"Error while executing shard queries. No results."
236
- )
205
+ return HTTPException(status_code=500, detail=f"Error while executing shard queries. No results.")
237
206
 
238
207
  for result in results:
239
208
  if isinstance(result, Exception):
@@ -262,9 +231,7 @@ def validate_node_query_results(results: list[Any]) -> Optional[HTTPException]:
262
231
  return None
263
232
 
264
233
 
265
- def debug_nodes_info(
266
- nodes: list[tuple[AbstractIndexNode, str]]
267
- ) -> list[dict[str, str]]:
234
+ def debug_nodes_info(nodes: list[tuple[AbstractIndexNode, str]]) -> list[dict[str, str]]:
268
235
  details: list[dict[str, str]] = []
269
236
  for node, shard_id in nodes:
270
237
  info = {
@@ -18,42 +18,77 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
 
20
20
  import asyncio
21
+ import contextlib
22
+ import logging
21
23
  from contextvars import ContextVar
22
24
  from typing import Optional
23
25
 
24
- from lru import LRU # type: ignore
26
+ from lru import LRU
25
27
 
26
- from nucliadb.common.maindb.driver import Transaction
28
+ from nucliadb.common.ids import FieldId
29
+ from nucliadb.common.maindb.utils import get_driver
30
+ from nucliadb.ingest.fields.base import Field
27
31
  from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
28
32
  from nucliadb.ingest.orm.resource import Resource as ResourceORM
29
- from nucliadb.middleware.transaction import get_read_only_transaction
30
33
  from nucliadb.search import SERVICE_NAME
34
+ from nucliadb_protos.utils_pb2 import ExtractedText
31
35
  from nucliadb_telemetry import metrics
32
36
  from nucliadb_utils.utilities import get_storage
33
37
 
34
- rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar(
35
- "rcache", default=None
36
- )
38
+ logger = logging.getLogger(__name__)
39
+
40
+ rcache: ContextVar[Optional[dict[str, ResourceORM]]] = ContextVar("rcache", default=None)
41
+ etcache: ContextVar[Optional["ExtractedTextCache"]] = ContextVar("etcache", default=None)
37
42
 
38
43
 
39
44
  RESOURCE_LOCKS: dict[str, asyncio.Lock] = LRU(1000) # type: ignore
40
45
  RESOURCE_CACHE_OPS = metrics.Counter("nucliadb_resource_cache_ops", labels={"type": ""})
46
+ EXTRACTED_CACHE_OPS = metrics.Counter("nucliadb_extracted_text_cache_ops", labels={"type": ""})
47
+
48
+
49
+ def set_extracted_text_cache() -> None:
50
+ value = ExtractedTextCache()
51
+ etcache.set(value)
52
+
53
+
54
+ def get_extracted_text_cache() -> Optional["ExtractedTextCache"]:
55
+ return etcache.get()
56
+
57
+
58
+ def clear_extracted_text_cache() -> None:
59
+ value = etcache.get()
60
+ if value is not None:
61
+ value.clear()
62
+ etcache.set(None)
63
+
41
64
 
65
+ def set_resource_cache() -> None:
66
+ value: dict[str, ResourceORM] = {}
67
+ rcache.set(value)
42
68
 
43
- def get_resource_cache(clear: bool = False) -> dict[str, ResourceORM]:
44
- value: Optional[dict[str, ResourceORM]] = rcache.get()
45
- if value is None or clear:
46
- value = {}
47
- rcache.set(value)
48
- return value
49
69
 
70
+ def get_resource_cache() -> Optional[dict[str, ResourceORM]]:
71
+ return rcache.get()
50
72
 
51
- async def get_resource_from_cache(
52
- kbid: str, uuid: str, txn: Optional[Transaction] = None
53
- ) -> Optional[ResourceORM]:
73
+
74
+ def clear_resource_cache() -> None:
75
+ value = rcache.get()
76
+ if value is not None:
77
+ value.clear()
78
+ rcache.set(None)
79
+
80
+
81
+ async def get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
82
+ """
83
+ Will try to get the resource from the cache, if it's not there it will fetch it from the ORM and cache it.
84
+ """
54
85
  orm_resource: Optional[ResourceORM] = None
55
86
 
56
87
  resource_cache = get_resource_cache()
88
+ if resource_cache is None:
89
+ RESOURCE_CACHE_OPS.inc({"type": "miss"})
90
+ logger.warning("Resource cache not set")
91
+ return await _orm_get_resource(kbid, uuid)
57
92
 
58
93
  if uuid not in RESOURCE_LOCKS:
59
94
  RESOURCE_LOCKS[uuid] = asyncio.Lock()
@@ -61,11 +96,7 @@ async def get_resource_from_cache(
61
96
  async with RESOURCE_LOCKS[uuid]:
62
97
  if uuid not in resource_cache:
63
98
  RESOURCE_CACHE_OPS.inc({"type": "miss"})
64
- if txn is None:
65
- txn = await get_read_only_transaction()
66
- storage = await get_storage(service_name=SERVICE_NAME)
67
- kb = KnowledgeBoxORM(txn, storage, kbid)
68
- orm_resource = await kb.get(uuid)
99
+ orm_resource = await _orm_get_resource(kbid, uuid)
69
100
  else:
70
101
  RESOURCE_CACHE_OPS.inc({"type": "hit"})
71
102
 
@@ -75,3 +106,101 @@ async def get_resource_from_cache(
75
106
  orm_resource = resource_cache.get(uuid)
76
107
 
77
108
  return orm_resource
109
+
110
+
111
async def _orm_get_resource(kbid: str, uuid: str) -> Optional[ResourceORM]:
    """
    Fetch a resource through the ORM inside a read-only transaction, without touching any cache.

    Returns whatever `KnowledgeBoxORM.get` yields for `uuid` in knowledge box `kbid`.
    """
    driver = get_driver()
    async with driver.transaction(read_only=True) as txn:
        knowledge_box = KnowledgeBoxORM(
            txn,
            await get_storage(service_name=SERVICE_NAME),
            kbid,
        )
        resource = await knowledge_box.get(uuid)
        return resource
116
+
117
+
118
class ExtractedTextCache:
    """
    Used to cache extracted text from a resource in memory during the process
    of search results hydration.

    This is needed to avoid fetching the same extracted text multiple times,
    as matching text blocks are processed in parallel and the extracted text is
    fetched for each field where the text block is found.

    Values and their per-key locks live in two plain dicts that share the same
    keys; callers choose the key format.
    """

    def __init__(self) -> None:
        # Per-key locks, created lazily by get_lock().
        self.locks: dict[str, asyncio.Lock] = {}
        # Extracted texts stored by set_value().
        self.values: dict[str, ExtractedText] = {}

    def get_value(self, key: str) -> Optional[ExtractedText]:
        """Return the cached extracted text for `key`, or None if there is none."""
        return self.values.get(key)

    def get_lock(self, key: str) -> asyncio.Lock:
        """
        Return the lock associated with `key`, creating it on first use.

        The same lock object is returned for every call with the same key
        until the cache is cleared.
        """
        lock = self.locks.get(key)
        if lock is None:
            # Only allocate when the key has no lock yet; `setdefault(key, Lock())`
            # would build and discard a new Lock on every call.
            lock = self.locks[key] = asyncio.Lock()
        return lock

    def set_value(self, key: str, value: ExtractedText) -> None:
        """Store `value` under `key`, replacing any previous entry."""
        self.values[key] = value

    def clear(self) -> None:
        """Drop all cached values and all locks."""
        self.values.clear()
        self.locks.clear()
144
+
145
+
146
async def get_field_extracted_text(field: Field) -> Optional[ExtractedText]:
    """
    Return the extracted text of `field`, going through the extracted text cache when one is set.

    Without a cache, the text is fetched directly from the field and counted
    as a miss. With a cache, the lookup key is "<kbid>/<uuid>/<field id>" and
    the fetch happens under the per-key lock. Only non-None results are cached.
    """
    cache = get_extracted_text_cache()
    if cache is None:
        logger.warning("Extracted text cache not set")
        EXTRACTED_CACHE_OPS.inc({"type": "miss"})
        return await field.get_extracted_text()

    cache_key = f"{field.kbid}/{field.uuid}/{field.id}"

    # Fast path: already cached, no lock needed.
    if (cached := cache.get_value(cache_key)) is not None:
        EXTRACTED_CACHE_OPS.inc({"type": "hit"})
        return cached

    async with cache.get_lock(cache_key):
        # Another task may have populated the entry while we waited for the lock.
        if (cached := cache.get_value(cache_key)) is not None:
            EXTRACTED_CACHE_OPS.inc({"type": "hit"})
            return cached

        EXTRACTED_CACHE_OPS.inc({"type": "miss"})
        fetched = await field.get_extracted_text()
        if fetched is not None:
            # A missing extracted text is not cached, so it is fetched again next time.
            cache.set_value(cache_key, fetched)
        return fetched
172
+
173
+
174
async def get_extracted_text_from_field_id(kbid: str, field: FieldId) -> Optional[ExtractedText]:
    """
    Resolve `field` to its resource and field object, then return its extracted text.

    Returns None when the resource identified by `field.rid` cannot be found in `kbid`.
    """
    resource = await get_resource(kbid, field.rid)
    if resource is None:
        return None

    field_obj = await resource.get_field(key=field.key, type=field.pb_type, load=False)
    return await get_field_extracted_text(field_obj)
185
+
186
+
187
@contextlib.contextmanager
def request_caches():
    """
    This context manager sets the caches for extracted text and resources for a request.

    It should be used at the beginning of a request handler to avoid fetching the same
    resources and extracted text multiple times.

    Makes sure to clean the caches at the end of the context manager, whether the
    wrapped block finishes normally or raises.
    >>> with request_caches():
    ...     resource = await get_resource(kbid, uuid)
    ...     extracted_text = await get_extracted_text_from_field_id(kbid, field_id)
    """
    set_resource_cache()
    set_extracted_text_cache()
    try:
        yield
    finally:
        # Always clear and unset both caches.
        clear_resource_cache()
        clear_extracted_text_cache()