nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
@@ -18,8 +18,10 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  import enum
21
+ from typing import Optional
21
22
 
22
- from pydantic import BaseSettings, Field
23
+ from pydantic import Field
24
+ from pydantic_settings import BaseSettings
23
25
 
24
26
 
25
27
  class ClusterDiscoveryMode(str, enum.Enum):
@@ -28,7 +30,7 @@ class ClusterDiscoveryMode(str, enum.Enum):
28
30
  SINGLE_NODE = "single_node"
29
31
 
30
32
 
31
- class StandaloneNodeRole(str, enum.Enum):
33
+ class StandaloneNodeRole(enum.Enum):
32
34
  ALL = "all"
33
35
  INDEX = "index"
34
36
  WORKER = "worker"
@@ -55,16 +57,10 @@ class Settings(BaseSettings):
55
57
 
56
58
  # Node limits
57
59
  max_shard_paragraphs: int = Field(
58
- default=250_000,
60
+ default=500_000,
59
61
  title="Max shard paragraphs",
60
62
  description="Maximum number of paragraphs to target per shard",
61
63
  )
62
- max_shard_fields: int = Field(
63
- default=125_000,
64
- title="Max shard fields",
65
- description="Maximum number of fields to target per shard. "
66
- "If this is reached before max_shard_paragraphs, we will create a new shard",
67
- )
68
64
  max_node_replicas: int = Field(
69
65
  default=800,
70
66
  title="Max node replicas",
@@ -76,6 +72,12 @@ class Settings(BaseSettings):
76
72
  description="Maximum number of paragraphs allowed on a single resource",
77
73
  )
78
74
 
75
+ drain_nodes: list[str] = Field(
76
+ default=[],
77
+ title="Drain nodes",
78
+ description="List of node IDs to ignore when creating new shards. It is used for draining nodes from a cluster. Example: ['1bf3bfe7-e164-4a19-a4d9-41372fc15aca',]", # noqa: E501
79
+ )
80
+
79
81
  local_reader_threads: int = 5
80
82
  local_writer_threads: int = 5
81
83
 
@@ -84,6 +86,11 @@ class Settings(BaseSettings):
84
86
  cluster_discovery_kubernetes_selector: str = "appType=node"
85
87
  cluster_discovery_manual_addresses: list[str] = []
86
88
 
89
+ nidx_api_address: Optional[str] = Field(default=None, description="NIDX gRPC API address")
90
+ nidx_searcher_address: Optional[str] = Field(
91
+ default=None, description="NIDX gRPC searcher API address"
92
+ )
93
+
87
94
 
88
95
  settings = Settings()
89
96
 
@@ -30,6 +30,7 @@ from nucliadb_protos.nodereader_pb2 import (
30
30
  DocumentItem,
31
31
  EdgeList,
32
32
  GetShardRequest,
33
+ IdCollection,
33
34
  ParagraphItem,
34
35
  ParagraphSearchRequest,
35
36
  ParagraphSearchResponse,
@@ -46,17 +47,14 @@ from nucliadb_protos.noderesources_pb2 import (
46
47
  EmptyResponse,
47
48
  Resource,
48
49
  ResourceID,
49
- )
50
- from nucliadb_protos.noderesources_pb2 import Shard as NodeResourcesShard
51
- from nucliadb_protos.noderesources_pb2 import (
52
50
  ShardCreated,
53
51
  ShardId,
54
52
  ShardIds,
55
- ShardMetadata,
56
53
  VectorSetID,
57
54
  VectorSetList,
58
55
  )
59
- from nucliadb_protos.nodewriter_pb2 import OpStatus
56
+ from nucliadb_protos.noderesources_pb2 import Shard as NodeResourcesShard
57
+ from nucliadb_protos.nodewriter_pb2 import NewShardRequest, OpStatus
60
58
 
61
59
  from ..settings import settings
62
60
 
@@ -69,8 +67,7 @@ except ImportError: # pragma: no cover
69
67
  IndexNodeException = Exception
70
68
 
71
69
  try:
72
- from nucliadb_node_binding import NodeReader # type: ignore
73
- from nucliadb_node_binding import NodeWriter # type: ignore
70
+ from nucliadb_node_binding import NodeReader, NodeWriter
74
71
  except ImportError: # pragma: no cover
75
72
  NodeReader = None
76
73
  NodeWriter = None
@@ -81,15 +78,11 @@ class StandaloneReaderWrapper:
81
78
 
82
79
  def __init__(self):
83
80
  if NodeReader is None:
84
- raise ImportError(
85
- "NucliaDB index node bindings are not installed (reader not found)"
86
- )
81
+ raise ImportError("NucliaDB index node bindings are not installed (reader not found)")
87
82
  self.reader = NodeReader()
88
83
  self.executor = ThreadPoolExecutor(settings.local_reader_threads)
89
84
 
90
- async def Search(
91
- self, request: SearchRequest, retry: bool = False
92
- ) -> SearchResponse:
85
+ async def Search(self, request: SearchRequest, retry: bool = False) -> SearchResponse:
93
86
  try:
94
87
  loop = asyncio.get_running_loop()
95
88
  result = await loop.run_in_executor(
@@ -113,30 +106,6 @@ class StandaloneReaderWrapper:
113
106
  else:
114
107
  raise
115
108
 
116
- async def ParagraphSearch(
117
- self, request: ParagraphSearchRequest
118
- ) -> ParagraphSearchResponse:
119
- loop = asyncio.get_running_loop()
120
- result = await loop.run_in_executor(
121
- self.executor, self.reader.paragraph_search, request.SerializeToString()
122
- )
123
- pb_bytes = bytes(result)
124
- pb = ParagraphSearchResponse()
125
- pb.ParseFromString(pb_bytes)
126
- return pb
127
-
128
- async def RelationSearch(
129
- self, request: RelationSearchRequest
130
- ) -> RelationSearchResponse:
131
- loop = asyncio.get_running_loop()
132
- result = await loop.run_in_executor(
133
- self.executor, self.reader.relation_search, request.SerializeToString()
134
- )
135
- pb_bytes = bytes(result)
136
- pb = RelationSearchResponse()
137
- pb.ParseFromString(pb_bytes)
138
- return pb
139
-
140
109
  async def GetShard(self, request: GetShardRequest) -> NodeResourcesShard:
141
110
  loop = asyncio.get_running_loop()
142
111
  result = await loop.run_in_executor(
@@ -201,9 +170,7 @@ class StandaloneReaderWrapper:
201
170
  raise exception
202
171
  await loop.run_in_executor(self.executor, t1.join)
203
172
 
204
- async def Paragraphs(
205
- self, stream_request: StreamRequest
206
- ) -> AsyncIterator[ParagraphItem]:
173
+ async def Paragraphs(self, stream_request: StreamRequest) -> AsyncIterator[ParagraphItem]:
207
174
  loop = asyncio.get_running_loop()
208
175
  q: asyncio.Queue[ParagraphItem] = asyncio.Queue(1)
209
176
  exception = None
@@ -249,30 +216,15 @@ class StandaloneReaderWrapper:
249
216
  edge_list.ParseFromString(pb_bytes)
250
217
  return edge_list
251
218
 
252
-
253
- async def Search(self, request: SearchRequest, retry: bool = False) -> SearchResponse:
254
- try:
219
+ async def VectorIds(self, request: VectorSetID) -> IdCollection:
255
220
  loop = asyncio.get_running_loop()
256
221
  result = await loop.run_in_executor(
257
- self.executor, self.reader.search, request.SerializeToString()
222
+ self.executor, self.reader.vector_ids, request.SerializeToString()
258
223
  )
259
224
  pb_bytes = bytes(result)
260
- pb = SearchResponse()
261
- pb.ParseFromString(pb_bytes)
262
- return pb
263
- except IndexNodeException as exc:
264
- if "IO error" not in str(exc):
265
- # ignore any other error
266
- raise
267
-
268
- # try some mitigations...
269
- logger.error(f"IndexNodeException in Search: {request}", exc_info=True)
270
- if not retry:
271
- # reinit?
272
- self.reader = NodeReader()
273
- return await self.Search(request, retry=True)
274
- else:
275
- raise
225
+ ids = IdCollection()
226
+ ids.ParseFromString(pb_bytes)
227
+ return ids
276
228
 
277
229
 
278
230
  class StandaloneWriterWrapper:
@@ -281,13 +233,11 @@ class StandaloneWriterWrapper:
281
233
  def __init__(self):
282
234
  os.makedirs(settings.data_path, exist_ok=True)
283
235
  if NodeWriter is None:
284
- raise ImportError(
285
- "NucliaDB index node bindings are not installed (writer not found)"
286
- )
236
+ raise ImportError("NucliaDB index node bindings are not installed (writer not found)")
287
237
  self.writer = NodeWriter()
288
238
  self.executor = ThreadPoolExecutor(settings.local_writer_threads)
289
239
 
290
- async def NewShard(self, request: ShardMetadata) -> ShardCreated:
240
+ async def NewShard(self, request: NewShardRequest) -> ShardCreated:
291
241
  loop = asyncio.get_running_loop()
292
242
  resp = await loop.run_in_executor(
293
243
  self.executor, self.writer.new_shard, request.SerializeToString()
@@ -318,33 +268,33 @@ class StandaloneWriterWrapper:
318
268
  shard_ids.ParseFromString(pb_bytes)
319
269
  return shard_ids
320
270
 
321
- async def RemoveVectorSet(self, request: VectorSetID):
271
+ async def AddVectorSet(self, request: VectorSetID):
322
272
  loop = asyncio.get_running_loop()
323
273
  resp = await loop.run_in_executor(
324
- self.executor, self.writer.del_vectorset, request.SerializeToString()
274
+ self.executor, self.writer.add_vectorset, request.SerializeToString()
325
275
  )
326
276
  pb_bytes = bytes(resp)
327
277
  resp = OpStatus()
328
278
  resp.ParseFromString(pb_bytes)
329
279
  return resp
330
280
 
331
- async def AddVectorSet(self, request: VectorSetID):
281
+ async def ListVectorSets(self, request: ShardId):
332
282
  loop = asyncio.get_running_loop()
333
283
  resp = await loop.run_in_executor(
334
- self.executor, self.writer.set_vectorset, request.SerializeToString()
284
+ self.executor, self.writer.list_vectorsets, request.SerializeToString()
335
285
  )
336
286
  pb_bytes = bytes(resp)
337
- resp = OpStatus()
287
+ resp = VectorSetList()
338
288
  resp.ParseFromString(pb_bytes)
339
289
  return resp
340
290
 
341
- async def ListVectorSets(self, request: ShardId):
291
+ async def RemoveVectorSet(self, request: VectorSetID):
342
292
  loop = asyncio.get_running_loop()
343
293
  resp = await loop.run_in_executor(
344
- self.executor, self.writer.get_vectorset, request.SerializeToString()
294
+ self.executor, self.writer.remove_vectorset, request.SerializeToString()
345
295
  )
346
296
  pb_bytes = bytes(resp)
347
- resp = VectorSetList()
297
+ resp = OpStatus()
348
298
  resp.ParseFromString(pb_bytes)
349
299
  return resp
350
300
 
@@ -370,9 +320,7 @@ class StandaloneWriterWrapper:
370
320
 
371
321
  async def GC(self, request: ShardId) -> EmptyResponse:
372
322
  loop = asyncio.get_running_loop()
373
- resp = await loop.run_in_executor(
374
- self.executor, self.writer.gc, request.SerializeToString()
375
- )
323
+ resp = await loop.run_in_executor(self.executor, self.writer.gc, request.SerializeToString())
376
324
  pb_bytes = bytes(resp)
377
325
  op_status = EmptyResponse()
378
326
  op_status.ParseFromString(pb_bytes)
@@ -389,7 +337,7 @@ READER_METHODS = {
389
337
  "RelationEdges": (ShardId, EdgeList),
390
338
  }
391
339
  WRITER_METHODS = {
392
- "NewShard": (ShardMetadata, ShardCreated),
340
+ "NewShard": (NewShardRequest, ShardCreated),
393
341
  "DeleteShard": (ShardId, ShardId),
394
342
  "ListShards": (EmptyQuery, ShardIds),
395
343
  "RemoveVectorSet": (VectorSetID, OpStatus),
@@ -20,10 +20,7 @@
20
20
  from typing import Any, Optional
21
21
 
22
22
  from nucliadb.common.cluster.base import AbstractIndexNode
23
- from nucliadb.common.cluster.grpc_node_dummy import ( # type: ignore
24
- DummyReaderStub,
25
- DummyWriterStub,
26
- )
23
+ from nucliadb.common.cluster.grpc_node_dummy import DummyReaderStub, DummyWriterStub
27
24
  from nucliadb.common.cluster.settings import settings as cluster_settings
28
25
  from nucliadb.common.cluster.standalone import grpc_node_binding
29
26
  from nucliadb_protos import standalone_pb2, standalone_pb2_grpc
@@ -79,7 +76,7 @@ class ProxyCallerWrapper:
79
76
  else:
80
77
  grpc_address = address
81
78
  self._channel = get_traced_grpc_channel(grpc_address, "standalone_proxy")
82
- self._stub = standalone_pb2_grpc.StandaloneClusterServiceStub(self._channel) # type: ignore
79
+ self._stub = standalone_pb2_grpc.StandaloneClusterServiceStub(self._channel)
83
80
 
84
81
  def __getattr__(self, name):
85
82
  async def call(request):
@@ -95,9 +92,7 @@ class ProxyCallerWrapper:
95
92
  else:
96
93
  raise NotImplementedError(f"Unknown type {self._type}")
97
94
  except KeyError:
98
- raise NotImplementedError(
99
- f"Unknown method for type {self._type}: {name}"
100
- )
95
+ raise NotImplementedError(f"Unknown method for type {self._type}: {name}")
101
96
  return_value = return_type()
102
97
  return_value.ParseFromString(resp.payload)
103
98
  return return_value
@@ -116,9 +111,7 @@ class ProxyStandaloneIndexNode(StandaloneIndexNode):
116
111
  available_disk: int,
117
112
  dummy: bool = False,
118
113
  ):
119
- super().__init__(
120
- id, address, shard_count, available_disk=available_disk, dummy=dummy
121
- )
114
+ super().__init__(id, address, shard_count, available_disk=available_disk, dummy=dummy)
122
115
  if dummy:
123
116
  return
124
117
 
@@ -32,9 +32,7 @@ from nucliadb_protos import standalone_pb2, standalone_pb2_grpc
32
32
  from nucliadb_utils.grpc import get_traced_grpc_server
33
33
 
34
34
 
35
- class StandaloneClusterServiceServicer(
36
- standalone_pb2_grpc.StandaloneClusterServiceServicer
37
- ):
35
+ class StandaloneClusterServiceServicer(standalone_pb2_grpc.StandaloneClusterServiceServicer):
38
36
  @backoff.on_exception(backoff.expo, (AioRpcError,), max_time=60)
39
37
  async def NodeAction( # type: ignore
40
38
  self, request: standalone_pb2.NodeActionRequest, context
@@ -61,9 +59,7 @@ class StandaloneClusterServiceServicer(
61
59
  self, request: standalone_pb2.NodeInfoRequest, context
62
60
  ) -> standalone_pb2.NodeInfoResponse:
63
61
  index_node = get_self()
64
- index_node.shard_count = len(
65
- os.listdir(os.path.join(cluster_settings.data_path, "shards"))
66
- )
62
+ index_node.shard_count = len(os.listdir(os.path.join(cluster_settings.data_path, "shards")))
67
63
  total_disk, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
68
64
  return standalone_pb2.NodeInfoResponse(
69
65
  id=index_node.id,
@@ -56,9 +56,7 @@ def get_self() -> StandaloneIndexNode:
56
56
  make another grpc request since this node can service it directly.
57
57
  """
58
58
  if not is_index_node():
59
- raise Exception(
60
- "This node is not an Index Node. You should not reach this code path."
61
- )
59
+ raise Exception("This node is not an Index Node. You should not reach this code path.")
62
60
  global _SELF_INDEX_NODE
63
61
  node_id = get_standalone_node_id()
64
62
  if _SELF_INDEX_NODE is None or node_id != _SELF_INDEX_NODE.id:
@@ -68,9 +66,7 @@ def get_self() -> StandaloneIndexNode:
68
66
  host = f"{hn}.{ns}"
69
67
  else:
70
68
  host = gethostname()
71
- _SELF_INDEX_NODE = StandaloneIndexNode(
72
- id=node_id, address=host, shard_count=0, available_disk=0
73
- )
69
+ _SELF_INDEX_NODE = StandaloneIndexNode(id=node_id, address=host, shard_count=0, available_disk=0)
74
70
  try:
75
71
  _, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
76
72
  _SELF_INDEX_NODE.available_disk = available_disk
@@ -95,3 +91,10 @@ def is_index_node() -> bool:
95
91
  StandaloneNodeRole.ALL,
96
92
  StandaloneNodeRole.INDEX,
97
93
  )
94
+
95
+
96
+ def is_worker_node() -> bool:
97
+ return cluster_settings.standalone_node_role in (
98
+ StandaloneNodeRole.ALL,
99
+ StandaloneNodeRole.WORKER,
100
+ )
@@ -27,14 +27,20 @@ from nucliadb.common.cluster.discovery.utils import (
27
27
  setup_cluster_discovery,
28
28
  teardown_cluster_discovery,
29
29
  )
30
- from nucliadb.common.cluster.manager import KBShardManager, StandaloneKBShardManager
30
+ from nucliadb.common.cluster.manager import (
31
+ KBShardManager,
32
+ StandaloneKBShardManager,
33
+ clear_index_nodes,
34
+ )
31
35
  from nucliadb.common.cluster.settings import settings
32
36
  from nucliadb.common.cluster.standalone.service import (
33
37
  start_grpc as start_standalone_grpc,
34
38
  )
35
39
  from nucliadb.common.cluster.standalone.utils import is_index_node
36
- from nucliadb_protos import noderesources_pb2, writer_pb2
40
+ from nucliadb.ingest.orm.resource import Resource
41
+ from nucliadb_protos import nodereader_pb2, writer_pb2
37
42
  from nucliadb_utils import const
43
+ from nucliadb_utils.settings import is_onprem_nucliadb
38
44
  from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility
39
45
 
40
46
  if TYPE_CHECKING: # pragma: no cover
@@ -79,12 +85,18 @@ async def teardown_cluster():
79
85
  await std_server.stop(None)
80
86
  clean_utility(_STANDALONE_SERVER)
81
87
 
88
+ clear_index_nodes()
89
+
82
90
 
83
91
  def get_shard_manager() -> KBShardManager:
84
92
  return get_utility(Utility.SHARD_MANAGER) # type: ignore
85
93
 
86
94
 
87
95
  async def wait_for_node(app_context: ApplicationContext, node_id: str) -> None:
96
+ if is_onprem_nucliadb():
97
+ # On onprem deployments indexing is synchronous right now, so we don't need to wait
98
+ return
99
+
88
100
  logged = False
89
101
  while True:
90
102
  # get raw js client
@@ -108,40 +120,44 @@ async def wait_for_node(app_context: ApplicationContext, node_id: str) -> None:
108
120
  await asyncio.sleep(sleep)
109
121
 
110
122
 
111
- @backoff.on_exception(
112
- backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8
113
- )
123
+ async def get_resource(kbid: str, resource_id: str) -> Optional[Resource]:
124
+ async with datamanagers.with_ro_transaction() as txn:
125
+ return await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
126
+
127
+
128
+ @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
129
+ async def get_resource_index_message(kbid: str, resource_id: str) -> Optional[nodereader_pb2.Resource]:
130
+ async with datamanagers.with_ro_transaction() as txn:
131
+ resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=resource_id)
132
+ if resource is None:
133
+ logger.warning(
134
+ "Resource not found while indexing, skipping",
135
+ extra={"kbid": kbid, "resource_id": resource_id},
136
+ )
137
+ return None
138
+ resource_index_message = (await resource.generate_index_message(reindex=False)).brain
139
+ return resource_index_message
140
+
141
+
142
+ @backoff.on_exception(backoff.expo, (Exception,), jitter=backoff.random_jitter, max_tries=8)
114
143
  async def index_resource_to_shard(
115
144
  app_context: ApplicationContext,
116
145
  kbid: str,
117
146
  resource_id: str,
118
147
  shard: writer_pb2.ShardObject,
119
- ) -> Optional[noderesources_pb2.Resource]:
120
- logger.warning(
121
- "Indexing resource", extra={"kbid": kbid, "resource_id": resource_id}
122
- )
123
-
148
+ resource_index_message: Optional[nodereader_pb2.Resource] = None,
149
+ ) -> None:
150
+ logger.info("Indexing resource", extra={"kbid": kbid, "resource_id": resource_id})
124
151
  sm = app_context.shard_manager
125
152
  partitioning = app_context.partitioning
126
153
 
127
- async with datamanagers.with_transaction() as txn:
128
- resource_index_message = (
129
- await datamanagers.resources.get_resource_index_message(
130
- txn, kbid=kbid, rid=resource_id
131
- )
132
- )
133
-
134
154
  if resource_index_message is None:
135
- logger.warning(
136
- "Resource index message not found while indexing, skipping",
137
- extra={"kbid": kbid, "resource_id": resource_id},
138
- )
139
- return None
155
+ resource_index_message = await get_resource_index_message(kbid, resource_id)
156
+ if resource_index_message is None:
157
+ return
158
+
140
159
  partition = partitioning.generate_partition(kbid, resource_id)
141
- await sm.add_resource(
142
- shard, resource_index_message, txid=-1, partition=str(partition), kb=kbid
143
- )
144
- return resource_index_message
160
+ await sm.add_resource(shard, resource_index_message, txid=-1, partition=str(partition), kb=kbid)
145
161
 
146
162
 
147
163
  async def delete_resource_from_shard(
@@ -150,9 +166,7 @@ async def delete_resource_from_shard(
150
166
  resource_id: str,
151
167
  shard: writer_pb2.ShardObject,
152
168
  ) -> None:
153
- logger.warning(
154
- "Deleting resource", extra={"kbid": kbid, "resource_id": resource_id}
155
- )
169
+ logger.info("Deleting resource", extra={"kbid": kbid, "resource_id": resource_id})
156
170
 
157
171
  sm = app_context.shard_manager
158
172
  partitioning = app_context.partitioning
@@ -0,0 +1,20 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ AVG_PARAGRAPH_SIZE_BYTES = 10_000
@@ -24,14 +24,13 @@ from nucliadb.common.cluster.settings import in_standalone_mode
24
24
  from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
25
25
  from nucliadb.common.maindb.driver import Driver
26
26
  from nucliadb.common.maindb.utils import setup_driver, teardown_driver
27
+ from nucliadb.common.nidx import start_nidx_utility, stop_nidx_utility
27
28
  from nucliadb_utils.indexing import IndexingUtility
28
29
  from nucliadb_utils.nats import NatsConnectionManager
29
30
  from nucliadb_utils.partition import PartitionUtility
30
31
  from nucliadb_utils.settings import indexing_settings
31
32
  from nucliadb_utils.storages.storage import Storage
32
33
  from nucliadb_utils.utilities import (
33
- Utility,
34
- clean_utility,
35
34
  get_storage,
36
35
  start_indexing_utility,
37
36
  start_nats_manager,
@@ -41,6 +40,7 @@ from nucliadb_utils.utilities import (
41
40
  stop_nats_manager,
42
41
  stop_partitioning_utility,
43
42
  stop_transaction_utility,
43
+ teardown_storage,
44
44
  )
45
45
 
46
46
 
@@ -79,18 +79,20 @@ class ApplicationContext:
79
79
  )
80
80
  self.indexing = await start_indexing_utility()
81
81
  self.transaction = await start_transaction_utility(self.service_name)
82
+ self.nidx = await start_nidx_utility()
82
83
 
83
84
  async def finalize(self) -> None:
84
85
  if not self._initialized:
85
86
  return
86
87
 
88
+ await stop_nidx_utility()
87
89
  await stop_transaction_utility()
88
90
  if not in_standalone_mode():
89
91
  await stop_indexing_utility()
90
92
  await stop_nats_manager()
93
+
91
94
  stop_partitioning_utility()
92
95
  await teardown_cluster()
93
96
  await teardown_driver()
94
- await self.blob_storage.finalize()
95
- clean_utility(Utility.STORAGE)
97
+ await teardown_storage()
96
98
  self._initialized = False
@@ -18,25 +18,28 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
+ from contextlib import asynccontextmanager
22
+
21
23
  from fastapi import FastAPI
22
24
  from starlette.routing import Mount
23
25
 
24
26
  from nucliadb.common.context import ApplicationContext
25
27
 
26
28
 
27
- def set_app_context(app: FastAPI):
29
+ @asynccontextmanager
30
+ async def inject_app_context(app: FastAPI):
28
31
  context = ApplicationContext()
29
32
 
30
33
  app.state.context = context
31
- app.add_event_handler("startup", context.initialize)
32
- app.add_event_handler("shutdown", context.finalize)
33
34
 
34
35
  # Need to add app context in all sub-applications
35
36
  for route in app.router.routes:
36
37
  if isinstance(route, Mount) and isinstance(route.app, FastAPI):
37
38
  route.app.state.context = context
38
- route.app.add_event_handler("startup", context.initialize)
39
- route.app.add_event_handler("shutdown", context.finalize)
39
+
40
+ await context.initialize()
41
+ yield context
42
+ await context.finalize()
40
43
 
41
44
 
42
45
  def get_app_context(application: FastAPI) -> ApplicationContext:
@@ -18,5 +18,11 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from .philosophy_books import * # noqa
22
- from .ten_dummy_resources import * # noqa
21
+ from dataclasses import dataclass
22
+
23
+
24
+ @dataclass
25
+ class IndexCounts:
26
+ fields: int
27
+ paragraphs: int
28
+ sentences: int
@@ -28,17 +28,36 @@
28
28
  # - First argument is always a transaction, all other arguments are keyword arguments and must be explicit
29
29
  # (better for readability and code editors)
30
30
  # ==============================================================================
31
- from . import cluster, entities, exceptions, kb, labels, processing, resources, rollover
32
- from .utils import with_transaction
31
+ from . import (
32
+ atomic,
33
+ cluster,
34
+ entities,
35
+ exceptions,
36
+ fields,
37
+ kb,
38
+ labels,
39
+ processing,
40
+ resources,
41
+ rollover,
42
+ synonyms,
43
+ vectorsets,
44
+ )
45
+ from .utils import with_ro_transaction, with_rw_transaction, with_transaction
33
46
 
34
47
  __all__ = (
48
+ "atomic",
35
49
  "cluster",
36
- "kb",
37
50
  "entities",
51
+ "exceptions",
52
+ "fields",
53
+ "kb",
38
54
  "labels",
55
+ "processing",
39
56
  "resources",
40
57
  "rollover",
41
- "processing",
42
- "exceptions",
58
+ "synonyms",
59
+ "vectorsets",
43
60
  "with_transaction",
61
+ "with_rw_transaction",
62
+ "with_ro_transaction",
44
63
  )