nucliadb 2.46.1.post382__py3-none-any.whl → 6.2.1.post2777__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (431)
  1. migrations/0002_rollover_shards.py +1 -2
  2. migrations/0003_allfields_key.py +2 -37
  3. migrations/0004_rollover_shards.py +1 -2
  4. migrations/0005_rollover_shards.py +1 -2
  5. migrations/0006_rollover_shards.py +2 -4
  6. migrations/0008_cleanup_leftover_rollover_metadata.py +1 -2
  7. migrations/0009_upgrade_relations_and_texts_to_v2.py +5 -4
  8. migrations/0010_fix_corrupt_indexes.py +11 -12
  9. migrations/0011_materialize_labelset_ids.py +2 -18
  10. migrations/0012_rollover_shards.py +6 -12
  11. migrations/0013_rollover_shards.py +2 -4
  12. migrations/0014_rollover_shards.py +5 -7
  13. migrations/0015_targeted_rollover.py +6 -12
  14. migrations/0016_upgrade_to_paragraphs_v2.py +27 -32
  15. migrations/0017_multiple_writable_shards.py +3 -6
  16. migrations/0018_purge_orphan_kbslugs.py +59 -0
  17. migrations/0019_upgrade_to_paragraphs_v3.py +66 -0
  18. migrations/0020_drain_nodes_from_cluster.py +83 -0
  19. nucliadb/standalone/tests/unit/test_run.py → migrations/0021_overwrite_vectorsets_key.py +17 -18
  20. nucliadb/tests/unit/test_openapi.py → migrations/0022_fix_paragraph_deletion_bug.py +16 -11
  21. migrations/0023_backfill_pg_catalog.py +80 -0
  22. migrations/0025_assign_models_to_kbs_v2.py +113 -0
  23. migrations/0026_fix_high_cardinality_content_types.py +61 -0
  24. migrations/0027_rollover_texts3.py +73 -0
  25. nucliadb/ingest/fields/date.py → migrations/pg/0001_bootstrap.py +10 -12
  26. migrations/pg/0002_catalog.py +42 -0
  27. nucliadb/ingest/tests/unit/test_settings.py → migrations/pg/0003_catalog_kbid_index.py +5 -3
  28. nucliadb/common/cluster/base.py +41 -24
  29. nucliadb/common/cluster/discovery/base.py +6 -14
  30. nucliadb/common/cluster/discovery/k8s.py +9 -19
  31. nucliadb/common/cluster/discovery/manual.py +1 -3
  32. nucliadb/common/cluster/discovery/single.py +1 -2
  33. nucliadb/common/cluster/discovery/utils.py +1 -3
  34. nucliadb/common/cluster/grpc_node_dummy.py +11 -16
  35. nucliadb/common/cluster/index_node.py +10 -19
  36. nucliadb/common/cluster/manager.py +223 -102
  37. nucliadb/common/cluster/rebalance.py +42 -37
  38. nucliadb/common/cluster/rollover.py +377 -204
  39. nucliadb/common/cluster/settings.py +16 -9
  40. nucliadb/common/cluster/standalone/grpc_node_binding.py +24 -76
  41. nucliadb/common/cluster/standalone/index_node.py +4 -11
  42. nucliadb/common/cluster/standalone/service.py +2 -6
  43. nucliadb/common/cluster/standalone/utils.py +9 -6
  44. nucliadb/common/cluster/utils.py +43 -29
  45. nucliadb/common/constants.py +20 -0
  46. nucliadb/common/context/__init__.py +6 -4
  47. nucliadb/common/context/fastapi.py +8 -5
  48. nucliadb/{tests/knowledgeboxes/__init__.py → common/counters.py} +8 -2
  49. nucliadb/common/datamanagers/__init__.py +24 -5
  50. nucliadb/common/datamanagers/atomic.py +102 -0
  51. nucliadb/common/datamanagers/cluster.py +5 -5
  52. nucliadb/common/datamanagers/entities.py +6 -16
  53. nucliadb/common/datamanagers/fields.py +84 -0
  54. nucliadb/common/datamanagers/kb.py +101 -24
  55. nucliadb/common/datamanagers/labels.py +26 -56
  56. nucliadb/common/datamanagers/processing.py +2 -6
  57. nucliadb/common/datamanagers/resources.py +214 -117
  58. nucliadb/common/datamanagers/rollover.py +77 -16
  59. nucliadb/{ingest/orm → common/datamanagers}/synonyms.py +16 -28
  60. nucliadb/common/datamanagers/utils.py +19 -11
  61. nucliadb/common/datamanagers/vectorsets.py +110 -0
  62. nucliadb/common/external_index_providers/base.py +257 -0
  63. nucliadb/{ingest/tests/unit/test_cache.py → common/external_index_providers/exceptions.py} +9 -8
  64. nucliadb/common/external_index_providers/manager.py +101 -0
  65. nucliadb/common/external_index_providers/pinecone.py +933 -0
  66. nucliadb/common/external_index_providers/settings.py +52 -0
  67. nucliadb/common/http_clients/auth.py +3 -6
  68. nucliadb/common/http_clients/processing.py +6 -11
  69. nucliadb/common/http_clients/utils.py +1 -3
  70. nucliadb/common/ids.py +240 -0
  71. nucliadb/common/locking.py +43 -13
  72. nucliadb/common/maindb/driver.py +11 -35
  73. nucliadb/common/maindb/exceptions.py +6 -6
  74. nucliadb/common/maindb/local.py +22 -9
  75. nucliadb/common/maindb/pg.py +206 -111
  76. nucliadb/common/maindb/utils.py +13 -44
  77. nucliadb/common/models_utils/from_proto.py +479 -0
  78. nucliadb/common/models_utils/to_proto.py +60 -0
  79. nucliadb/common/nidx.py +260 -0
  80. nucliadb/export_import/datamanager.py +25 -19
  81. nucliadb/export_import/exceptions.py +8 -0
  82. nucliadb/export_import/exporter.py +20 -7
  83. nucliadb/export_import/importer.py +6 -11
  84. nucliadb/export_import/models.py +5 -5
  85. nucliadb/export_import/tasks.py +4 -4
  86. nucliadb/export_import/utils.py +94 -54
  87. nucliadb/health.py +1 -3
  88. nucliadb/ingest/app.py +15 -11
  89. nucliadb/ingest/consumer/auditing.py +30 -147
  90. nucliadb/ingest/consumer/consumer.py +96 -52
  91. nucliadb/ingest/consumer/materializer.py +10 -12
  92. nucliadb/ingest/consumer/pull.py +12 -27
  93. nucliadb/ingest/consumer/service.py +20 -19
  94. nucliadb/ingest/consumer/shard_creator.py +7 -14
  95. nucliadb/ingest/consumer/utils.py +1 -3
  96. nucliadb/ingest/fields/base.py +139 -188
  97. nucliadb/ingest/fields/conversation.py +18 -5
  98. nucliadb/ingest/fields/exceptions.py +1 -4
  99. nucliadb/ingest/fields/file.py +7 -25
  100. nucliadb/ingest/fields/link.py +11 -16
  101. nucliadb/ingest/fields/text.py +9 -4
  102. nucliadb/ingest/orm/brain.py +255 -262
  103. nucliadb/ingest/orm/broker_message.py +181 -0
  104. nucliadb/ingest/orm/entities.py +36 -51
  105. nucliadb/ingest/orm/exceptions.py +12 -0
  106. nucliadb/ingest/orm/knowledgebox.py +334 -278
  107. nucliadb/ingest/orm/processor/__init__.py +2 -697
  108. nucliadb/ingest/orm/processor/auditing.py +117 -0
  109. nucliadb/ingest/orm/processor/data_augmentation.py +164 -0
  110. nucliadb/ingest/orm/processor/pgcatalog.py +84 -0
  111. nucliadb/ingest/orm/processor/processor.py +752 -0
  112. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  113. nucliadb/ingest/orm/resource.py +280 -520
  114. nucliadb/ingest/orm/utils.py +25 -31
  115. nucliadb/ingest/partitions.py +3 -9
  116. nucliadb/ingest/processing.py +76 -81
  117. nucliadb/ingest/py.typed +0 -0
  118. nucliadb/ingest/serialize.py +37 -173
  119. nucliadb/ingest/service/__init__.py +1 -3
  120. nucliadb/ingest/service/writer.py +186 -577
  121. nucliadb/ingest/settings.py +13 -22
  122. nucliadb/ingest/utils.py +3 -6
  123. nucliadb/learning_proxy.py +264 -51
  124. nucliadb/metrics_exporter.py +30 -19
  125. nucliadb/middleware/__init__.py +1 -3
  126. nucliadb/migrator/command.py +1 -3
  127. nucliadb/migrator/datamanager.py +13 -13
  128. nucliadb/migrator/migrator.py +57 -37
  129. nucliadb/migrator/settings.py +2 -1
  130. nucliadb/migrator/utils.py +18 -10
  131. nucliadb/purge/__init__.py +139 -33
  132. nucliadb/purge/orphan_shards.py +7 -13
  133. nucliadb/reader/__init__.py +1 -3
  134. nucliadb/reader/api/models.py +3 -14
  135. nucliadb/reader/api/v1/__init__.py +0 -1
  136. nucliadb/reader/api/v1/download.py +27 -94
  137. nucliadb/reader/api/v1/export_import.py +4 -4
  138. nucliadb/reader/api/v1/knowledgebox.py +13 -13
  139. nucliadb/reader/api/v1/learning_config.py +8 -12
  140. nucliadb/reader/api/v1/resource.py +67 -93
  141. nucliadb/reader/api/v1/services.py +70 -125
  142. nucliadb/reader/app.py +16 -46
  143. nucliadb/reader/lifecycle.py +18 -4
  144. nucliadb/reader/py.typed +0 -0
  145. nucliadb/reader/reader/notifications.py +10 -31
  146. nucliadb/search/__init__.py +1 -3
  147. nucliadb/search/api/v1/__init__.py +2 -2
  148. nucliadb/search/api/v1/ask.py +112 -0
  149. nucliadb/search/api/v1/catalog.py +184 -0
  150. nucliadb/search/api/v1/feedback.py +17 -25
  151. nucliadb/search/api/v1/find.py +41 -41
  152. nucliadb/search/api/v1/knowledgebox.py +90 -62
  153. nucliadb/search/api/v1/predict_proxy.py +2 -2
  154. nucliadb/search/api/v1/resource/ask.py +66 -117
  155. nucliadb/search/api/v1/resource/search.py +51 -72
  156. nucliadb/search/api/v1/router.py +1 -0
  157. nucliadb/search/api/v1/search.py +50 -197
  158. nucliadb/search/api/v1/suggest.py +40 -54
  159. nucliadb/search/api/v1/summarize.py +9 -5
  160. nucliadb/search/api/v1/utils.py +2 -1
  161. nucliadb/search/app.py +16 -48
  162. nucliadb/search/lifecycle.py +10 -3
  163. nucliadb/search/predict.py +176 -188
  164. nucliadb/search/py.typed +0 -0
  165. nucliadb/search/requesters/utils.py +41 -63
  166. nucliadb/search/search/cache.py +149 -20
  167. nucliadb/search/search/chat/ask.py +918 -0
  168. nucliadb/search/{tests/unit/test_run.py → search/chat/exceptions.py} +14 -13
  169. nucliadb/search/search/chat/images.py +41 -17
  170. nucliadb/search/search/chat/prompt.py +851 -282
  171. nucliadb/search/search/chat/query.py +274 -267
  172. nucliadb/{writer/resource/slug.py → search/search/cut.py} +8 -6
  173. nucliadb/search/search/fetch.py +43 -36
  174. nucliadb/search/search/filters.py +9 -15
  175. nucliadb/search/search/find.py +214 -54
  176. nucliadb/search/search/find_merge.py +408 -391
  177. nucliadb/search/search/hydrator.py +191 -0
  178. nucliadb/search/search/merge.py +198 -234
  179. nucliadb/search/search/metrics.py +73 -2
  180. nucliadb/search/search/paragraphs.py +64 -106
  181. nucliadb/search/search/pgcatalog.py +233 -0
  182. nucliadb/search/search/predict_proxy.py +1 -1
  183. nucliadb/search/search/query.py +386 -257
  184. nucliadb/search/search/query_parser/exceptions.py +22 -0
  185. nucliadb/search/search/query_parser/models.py +101 -0
  186. nucliadb/search/search/query_parser/parser.py +183 -0
  187. nucliadb/search/search/rank_fusion.py +204 -0
  188. nucliadb/search/search/rerankers.py +270 -0
  189. nucliadb/search/search/shards.py +4 -38
  190. nucliadb/search/search/summarize.py +14 -18
  191. nucliadb/search/search/utils.py +27 -4
  192. nucliadb/search/settings.py +15 -1
  193. nucliadb/standalone/api_router.py +4 -10
  194. nucliadb/standalone/app.py +17 -14
  195. nucliadb/standalone/auth.py +7 -21
  196. nucliadb/standalone/config.py +9 -12
  197. nucliadb/standalone/introspect.py +5 -5
  198. nucliadb/standalone/lifecycle.py +26 -25
  199. nucliadb/standalone/migrations.py +58 -0
  200. nucliadb/standalone/purge.py +9 -8
  201. nucliadb/standalone/py.typed +0 -0
  202. nucliadb/standalone/run.py +25 -18
  203. nucliadb/standalone/settings.py +10 -14
  204. nucliadb/standalone/versions.py +15 -5
  205. nucliadb/tasks/consumer.py +8 -12
  206. nucliadb/tasks/producer.py +7 -6
  207. nucliadb/tests/config.py +53 -0
  208. nucliadb/train/__init__.py +1 -3
  209. nucliadb/train/api/utils.py +1 -2
  210. nucliadb/train/api/v1/shards.py +2 -2
  211. nucliadb/train/api/v1/trainset.py +4 -6
  212. nucliadb/train/app.py +14 -47
  213. nucliadb/train/generator.py +10 -19
  214. nucliadb/train/generators/field_classifier.py +7 -19
  215. nucliadb/train/generators/field_streaming.py +156 -0
  216. nucliadb/train/generators/image_classifier.py +12 -18
  217. nucliadb/train/generators/paragraph_classifier.py +5 -9
  218. nucliadb/train/generators/paragraph_streaming.py +6 -9
  219. nucliadb/train/generators/question_answer_streaming.py +19 -20
  220. nucliadb/train/generators/sentence_classifier.py +9 -15
  221. nucliadb/train/generators/token_classifier.py +45 -36
  222. nucliadb/train/generators/utils.py +14 -18
  223. nucliadb/train/lifecycle.py +7 -3
  224. nucliadb/train/nodes.py +23 -32
  225. nucliadb/train/py.typed +0 -0
  226. nucliadb/train/servicer.py +13 -21
  227. nucliadb/train/settings.py +2 -6
  228. nucliadb/train/types.py +13 -10
  229. nucliadb/train/upload.py +3 -6
  230. nucliadb/train/uploader.py +20 -25
  231. nucliadb/train/utils.py +1 -1
  232. nucliadb/writer/__init__.py +1 -3
  233. nucliadb/writer/api/constants.py +0 -5
  234. nucliadb/{ingest/fields/keywordset.py → writer/api/utils.py} +13 -10
  235. nucliadb/writer/api/v1/export_import.py +102 -49
  236. nucliadb/writer/api/v1/field.py +196 -620
  237. nucliadb/writer/api/v1/knowledgebox.py +221 -71
  238. nucliadb/writer/api/v1/learning_config.py +2 -2
  239. nucliadb/writer/api/v1/resource.py +114 -216
  240. nucliadb/writer/api/v1/services.py +64 -132
  241. nucliadb/writer/api/v1/slug.py +61 -0
  242. nucliadb/writer/api/v1/transaction.py +67 -0
  243. nucliadb/writer/api/v1/upload.py +184 -215
  244. nucliadb/writer/app.py +11 -61
  245. nucliadb/writer/back_pressure.py +62 -43
  246. nucliadb/writer/exceptions.py +0 -4
  247. nucliadb/writer/lifecycle.py +21 -15
  248. nucliadb/writer/py.typed +0 -0
  249. nucliadb/writer/resource/audit.py +2 -1
  250. nucliadb/writer/resource/basic.py +48 -62
  251. nucliadb/writer/resource/field.py +45 -135
  252. nucliadb/writer/resource/origin.py +1 -2
  253. nucliadb/writer/settings.py +14 -5
  254. nucliadb/writer/tus/__init__.py +17 -15
  255. nucliadb/writer/tus/azure.py +111 -0
  256. nucliadb/writer/tus/dm.py +17 -5
  257. nucliadb/writer/tus/exceptions.py +1 -3
  258. nucliadb/writer/tus/gcs.py +56 -84
  259. nucliadb/writer/tus/local.py +21 -37
  260. nucliadb/writer/tus/s3.py +28 -68
  261. nucliadb/writer/tus/storage.py +5 -56
  262. nucliadb/writer/vectorsets.py +125 -0
  263. nucliadb-6.2.1.post2777.dist-info/METADATA +148 -0
  264. nucliadb-6.2.1.post2777.dist-info/RECORD +343 -0
  265. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/WHEEL +1 -1
  266. nucliadb/common/maindb/redis.py +0 -194
  267. nucliadb/common/maindb/tikv.py +0 -412
  268. nucliadb/ingest/fields/layout.py +0 -58
  269. nucliadb/ingest/tests/conftest.py +0 -30
  270. nucliadb/ingest/tests/fixtures.py +0 -771
  271. nucliadb/ingest/tests/integration/consumer/__init__.py +0 -18
  272. nucliadb/ingest/tests/integration/consumer/test_auditing.py +0 -80
  273. nucliadb/ingest/tests/integration/consumer/test_materializer.py +0 -89
  274. nucliadb/ingest/tests/integration/consumer/test_pull.py +0 -144
  275. nucliadb/ingest/tests/integration/consumer/test_service.py +0 -81
  276. nucliadb/ingest/tests/integration/consumer/test_shard_creator.py +0 -68
  277. nucliadb/ingest/tests/integration/ingest/test_ingest.py +0 -691
  278. nucliadb/ingest/tests/integration/ingest/test_processing_engine.py +0 -95
  279. nucliadb/ingest/tests/integration/ingest/test_relations.py +0 -272
  280. nucliadb/ingest/tests/unit/consumer/__init__.py +0 -18
  281. nucliadb/ingest/tests/unit/consumer/test_auditing.py +0 -140
  282. nucliadb/ingest/tests/unit/consumer/test_consumer.py +0 -69
  283. nucliadb/ingest/tests/unit/consumer/test_pull.py +0 -60
  284. nucliadb/ingest/tests/unit/consumer/test_shard_creator.py +0 -139
  285. nucliadb/ingest/tests/unit/consumer/test_utils.py +0 -67
  286. nucliadb/ingest/tests/unit/orm/__init__.py +0 -19
  287. nucliadb/ingest/tests/unit/orm/test_brain.py +0 -247
  288. nucliadb/ingest/tests/unit/orm/test_processor.py +0 -131
  289. nucliadb/ingest/tests/unit/orm/test_resource.py +0 -275
  290. nucliadb/ingest/tests/unit/test_partitions.py +0 -40
  291. nucliadb/ingest/tests/unit/test_processing.py +0 -171
  292. nucliadb/middleware/transaction.py +0 -117
  293. nucliadb/reader/api/v1/learning_collector.py +0 -63
  294. nucliadb/reader/tests/__init__.py +0 -19
  295. nucliadb/reader/tests/conftest.py +0 -31
  296. nucliadb/reader/tests/fixtures.py +0 -136
  297. nucliadb/reader/tests/test_list_resources.py +0 -75
  298. nucliadb/reader/tests/test_reader_file_download.py +0 -273
  299. nucliadb/reader/tests/test_reader_resource.py +0 -379
  300. nucliadb/reader/tests/test_reader_resource_field.py +0 -219
  301. nucliadb/search/api/v1/chat.py +0 -258
  302. nucliadb/search/api/v1/resource/chat.py +0 -94
  303. nucliadb/search/tests/__init__.py +0 -19
  304. nucliadb/search/tests/conftest.py +0 -33
  305. nucliadb/search/tests/fixtures.py +0 -199
  306. nucliadb/search/tests/node.py +0 -465
  307. nucliadb/search/tests/unit/__init__.py +0 -18
  308. nucliadb/search/tests/unit/api/__init__.py +0 -19
  309. nucliadb/search/tests/unit/api/v1/__init__.py +0 -19
  310. nucliadb/search/tests/unit/api/v1/resource/__init__.py +0 -19
  311. nucliadb/search/tests/unit/api/v1/resource/test_ask.py +0 -67
  312. nucliadb/search/tests/unit/api/v1/resource/test_chat.py +0 -97
  313. nucliadb/search/tests/unit/api/v1/test_chat.py +0 -96
  314. nucliadb/search/tests/unit/api/v1/test_predict_proxy.py +0 -98
  315. nucliadb/search/tests/unit/api/v1/test_summarize.py +0 -93
  316. nucliadb/search/tests/unit/search/__init__.py +0 -18
  317. nucliadb/search/tests/unit/search/requesters/__init__.py +0 -18
  318. nucliadb/search/tests/unit/search/requesters/test_utils.py +0 -210
  319. nucliadb/search/tests/unit/search/search/__init__.py +0 -19
  320. nucliadb/search/tests/unit/search/search/test_shards.py +0 -45
  321. nucliadb/search/tests/unit/search/search/test_utils.py +0 -82
  322. nucliadb/search/tests/unit/search/test_chat_prompt.py +0 -266
  323. nucliadb/search/tests/unit/search/test_fetch.py +0 -108
  324. nucliadb/search/tests/unit/search/test_filters.py +0 -125
  325. nucliadb/search/tests/unit/search/test_paragraphs.py +0 -157
  326. nucliadb/search/tests/unit/search/test_predict_proxy.py +0 -106
  327. nucliadb/search/tests/unit/search/test_query.py +0 -201
  328. nucliadb/search/tests/unit/test_app.py +0 -79
  329. nucliadb/search/tests/unit/test_find_merge.py +0 -112
  330. nucliadb/search/tests/unit/test_merge.py +0 -34
  331. nucliadb/search/tests/unit/test_predict.py +0 -584
  332. nucliadb/standalone/tests/__init__.py +0 -19
  333. nucliadb/standalone/tests/conftest.py +0 -33
  334. nucliadb/standalone/tests/fixtures.py +0 -38
  335. nucliadb/standalone/tests/unit/__init__.py +0 -18
  336. nucliadb/standalone/tests/unit/test_api_router.py +0 -61
  337. nucliadb/standalone/tests/unit/test_auth.py +0 -169
  338. nucliadb/standalone/tests/unit/test_introspect.py +0 -35
  339. nucliadb/standalone/tests/unit/test_versions.py +0 -68
  340. nucliadb/tests/benchmarks/__init__.py +0 -19
  341. nucliadb/tests/benchmarks/test_search.py +0 -99
  342. nucliadb/tests/conftest.py +0 -32
  343. nucliadb/tests/fixtures.py +0 -736
  344. nucliadb/tests/knowledgeboxes/philosophy_books.py +0 -203
  345. nucliadb/tests/knowledgeboxes/ten_dummy_resources.py +0 -109
  346. nucliadb/tests/migrations/__init__.py +0 -19
  347. nucliadb/tests/migrations/test_migration_0017.py +0 -80
  348. nucliadb/tests/tikv.py +0 -240
  349. nucliadb/tests/unit/__init__.py +0 -19
  350. nucliadb/tests/unit/common/__init__.py +0 -19
  351. nucliadb/tests/unit/common/cluster/__init__.py +0 -19
  352. nucliadb/tests/unit/common/cluster/discovery/__init__.py +0 -19
  353. nucliadb/tests/unit/common/cluster/discovery/test_k8s.py +0 -170
  354. nucliadb/tests/unit/common/cluster/standalone/__init__.py +0 -18
  355. nucliadb/tests/unit/common/cluster/standalone/test_service.py +0 -113
  356. nucliadb/tests/unit/common/cluster/standalone/test_utils.py +0 -59
  357. nucliadb/tests/unit/common/cluster/test_cluster.py +0 -399
  358. nucliadb/tests/unit/common/cluster/test_kb_shard_manager.py +0 -178
  359. nucliadb/tests/unit/common/cluster/test_rollover.py +0 -279
  360. nucliadb/tests/unit/common/maindb/__init__.py +0 -18
  361. nucliadb/tests/unit/common/maindb/test_driver.py +0 -127
  362. nucliadb/tests/unit/common/maindb/test_tikv.py +0 -53
  363. nucliadb/tests/unit/common/maindb/test_utils.py +0 -81
  364. nucliadb/tests/unit/common/test_context.py +0 -36
  365. nucliadb/tests/unit/export_import/__init__.py +0 -19
  366. nucliadb/tests/unit/export_import/test_datamanager.py +0 -37
  367. nucliadb/tests/unit/export_import/test_utils.py +0 -294
  368. nucliadb/tests/unit/migrator/__init__.py +0 -19
  369. nucliadb/tests/unit/migrator/test_migrator.py +0 -87
  370. nucliadb/tests/unit/tasks/__init__.py +0 -19
  371. nucliadb/tests/unit/tasks/conftest.py +0 -42
  372. nucliadb/tests/unit/tasks/test_consumer.py +0 -93
  373. nucliadb/tests/unit/tasks/test_producer.py +0 -95
  374. nucliadb/tests/unit/tasks/test_tasks.py +0 -60
  375. nucliadb/tests/unit/test_field_ids.py +0 -49
  376. nucliadb/tests/unit/test_health.py +0 -84
  377. nucliadb/tests/unit/test_kb_slugs.py +0 -54
  378. nucliadb/tests/unit/test_learning_proxy.py +0 -252
  379. nucliadb/tests/unit/test_metrics_exporter.py +0 -77
  380. nucliadb/tests/unit/test_purge.py +0 -138
  381. nucliadb/tests/utils/__init__.py +0 -74
  382. nucliadb/tests/utils/aiohttp_session.py +0 -44
  383. nucliadb/tests/utils/broker_messages/__init__.py +0 -167
  384. nucliadb/tests/utils/broker_messages/fields.py +0 -181
  385. nucliadb/tests/utils/broker_messages/helpers.py +0 -33
  386. nucliadb/tests/utils/entities.py +0 -78
  387. nucliadb/train/api/v1/check.py +0 -60
  388. nucliadb/train/tests/__init__.py +0 -19
  389. nucliadb/train/tests/conftest.py +0 -29
  390. nucliadb/train/tests/fixtures.py +0 -342
  391. nucliadb/train/tests/test_field_classification.py +0 -122
  392. nucliadb/train/tests/test_get_entities.py +0 -80
  393. nucliadb/train/tests/test_get_info.py +0 -51
  394. nucliadb/train/tests/test_get_ontology.py +0 -34
  395. nucliadb/train/tests/test_get_ontology_count.py +0 -63
  396. nucliadb/train/tests/test_image_classification.py +0 -222
  397. nucliadb/train/tests/test_list_fields.py +0 -39
  398. nucliadb/train/tests/test_list_paragraphs.py +0 -73
  399. nucliadb/train/tests/test_list_resources.py +0 -39
  400. nucliadb/train/tests/test_list_sentences.py +0 -71
  401. nucliadb/train/tests/test_paragraph_classification.py +0 -123
  402. nucliadb/train/tests/test_paragraph_streaming.py +0 -118
  403. nucliadb/train/tests/test_question_answer_streaming.py +0 -239
  404. nucliadb/train/tests/test_sentence_classification.py +0 -143
  405. nucliadb/train/tests/test_token_classification.py +0 -136
  406. nucliadb/train/tests/utils.py +0 -108
  407. nucliadb/writer/layouts/__init__.py +0 -51
  408. nucliadb/writer/layouts/v1.py +0 -59
  409. nucliadb/writer/resource/vectors.py +0 -120
  410. nucliadb/writer/tests/__init__.py +0 -19
  411. nucliadb/writer/tests/conftest.py +0 -31
  412. nucliadb/writer/tests/fixtures.py +0 -192
  413. nucliadb/writer/tests/test_fields.py +0 -486
  414. nucliadb/writer/tests/test_files.py +0 -743
  415. nucliadb/writer/tests/test_knowledgebox.py +0 -49
  416. nucliadb/writer/tests/test_reprocess_file_field.py +0 -139
  417. nucliadb/writer/tests/test_resources.py +0 -546
  418. nucliadb/writer/tests/test_service.py +0 -137
  419. nucliadb/writer/tests/test_tus.py +0 -203
  420. nucliadb/writer/tests/utils.py +0 -35
  421. nucliadb/writer/tus/pg.py +0 -125
  422. nucliadb-2.46.1.post382.dist-info/METADATA +0 -134
  423. nucliadb-2.46.1.post382.dist-info/RECORD +0 -451
  424. {nucliadb/ingest/tests → migrations/pg}/__init__.py +0 -0
  425. /nucliadb/{ingest/tests/integration → common/external_index_providers}/__init__.py +0 -0
  426. /nucliadb/{ingest/tests/integration/ingest → common/models_utils}/__init__.py +0 -0
  427. /nucliadb/{ingest/tests/unit → search/search/query_parser}/__init__.py +0 -0
  428. /nucliadb/{ingest/tests → tests}/vectors.py +0 -0
  429. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/entry_points.txt +0 -0
  430. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/top_level.txt +0 -0
  431. {nucliadb-2.46.1.post382.dist-info → nucliadb-6.2.1.post2777.dist-info}/zip-safe +0 -0
nucliadb/common/external_index_providers/pinecone.py (added)
@@ -0,0 +1,933 @@
+ # Copyright (C) 2021 Bosutech XXI S.L.
+ #
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
+ # For commercial licensing, contact us at info@nuclia.com.
+ #
+ # AGPL:
+ # This program is free software: you can redistribute it and/or modify
+ # it under the terms of the GNU Affero General Public License as
+ # published by the Free Software Foundation, either version 3 of the
+ # License, or (at your option) any later version.
+ #
+ # This program is distributed in the hope that it will be useful,
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ # GNU Affero General Public License for more details.
+ #
+ # You should have received a copy of the GNU Affero General Public License
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
+ #
+ import asyncio
+ import json
+ import logging
+ from copy import deepcopy
+ from typing import Any, Iterator, Optional
+ from uuid import uuid4
+
+ import backoff
+ from cachetools import TTLCache
+ from pydantic import BaseModel
+
+ from nucliadb.common.counters import IndexCounts
+ from nucliadb.common.external_index_providers.base import (
+     ExternalIndexManager,
+     ExternalIndexProviderType,
+     QueryResults,
+     TextBlockMatch,
+     VectorsetExternalIndex,
+ )
+ from nucliadb.common.external_index_providers.exceptions import ExternalIndexCreationError
+ from nucliadb.common.ids import FieldId, ParagraphId, VectorId
+ from nucliadb_models.search import SCORE_TYPE, TextPosition
+ from nucliadb_protos import knowledgebox_pb2 as kb_pb2
+ from nucliadb_protos import utils_pb2
+ from nucliadb_protos.nodereader_pb2 import SearchRequest, Timestamps
+ from nucliadb_protos.noderesources_pb2 import IndexParagraph, Resource, VectorSentence
+ from nucliadb_telemetry.metrics import Observer
+ from nucliadb_utils.aiopynecone.client import DataPlane, FilterOperator, LogicalOperator
+ from nucliadb_utils.aiopynecone.exceptions import (
+     MetadataTooLargeError,
+     PineconeAPIError,
+ )
+ from nucliadb_utils.aiopynecone.models import QueryResponse
+ from nucliadb_utils.aiopynecone.models import Vector as PineconeVector
+ from nucliadb_utils.utilities import get_endecryptor, get_pinecone
+
+ logger = logging.getLogger(__name__)
+
+ manager_observer = Observer("pinecone_index_manager", labels={"operation": ""})
+
+
+ DISCARDED_LABEL_PREFIXES = [
+     # NER-related labels are not supported in the Pinecone integration because right now
+     # the number of detected entities is unbounded and may exceed the vector metadata size limit.
+     "/e/",
+     # Processing status labels are only needed for the catalog endpoint.
+     "/n/s",
+ ]
+
+ # To avoid querying the Pinecone API for the same index stats multiple times in a short period of time
+ COUNTERS_CACHE = TTLCache(maxsize=1024, ttl=60)  # type: ignore
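+ # Note: the cache is keyed by kbid in `_get_index_counts` below, so at most one stats
+ # call per knowledge box per minute reaches the Pinecone API; entries simply expire
+ # after the 60-second TTL and are recomputed on the next counters request.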
+
+
+ class PineconeQueryResults(QueryResults):
+     type: ExternalIndexProviderType = ExternalIndexProviderType.PINECONE
+     results: QueryResponse
+
+     def iter_matching_text_blocks(self) -> Iterator[TextBlockMatch]:
+         for order, matching_vector in enumerate(self.results.matches):
+             try:
+                 vector_id = VectorId.from_string(matching_vector.id)
+                 paragraph_id = ParagraphId.from_vector_id(vector_id)
+             except ValueError:  # pragma: no cover
+                 logger.error(f"Invalid Pinecone vector id: {matching_vector.id}")
+                 continue
+             vector_metadata = VectorMetadata.model_validate(matching_vector.metadata)  # noqa
+             yield TextBlockMatch(
+                 paragraph_id=paragraph_id,
+                 text=None,  # To be filled by the results hydrator
+                 score=matching_vector.score,
+                 score_type=SCORE_TYPE.VECTOR,
+                 order=order,
+                 fuzzy_search=False,  # semantic search doesn't use fuzziness
+                 is_a_table=vector_metadata.is_a_table or False,
+                 page_with_visual=vector_metadata.page_with_visual or False,
+                 representation_file=vector_metadata.representation_file,
+                 paragraph_labels=vector_metadata.paragraph_labels or [],
+                 field_labels=vector_metadata.field_labels or [],
+                 position=TextPosition(
+                     page_number=vector_metadata.page_number,
+                     index=vector_id.index or 0,
+                     start=paragraph_id.paragraph_start,
+                     end=paragraph_id.paragraph_end,
+                     start_seconds=list(map(int, vector_metadata.position_start_seconds or [])),
+                     end_seconds=list(map(int, vector_metadata.position_end_seconds or [])),
+                 ),
+             )
+
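+ # Illustration (assumed id layout, defined in nucliadb.common.ids, not in this module):
+ # vector ids follow `{rid}/{field_type}/{field}/{index}/{start}-{end}`, e.g.
+ #   "f3a1.../t/text/0/10-42"
+ # so `VectorId.from_string` can recover the field and `ParagraphId.from_vector_id` the
+ # paragraph boundaries used to build each TextBlockMatch above.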
+
+ class IndexHostNotFound(Exception): ...
+
+
+ class VectorMetadata(BaseModel):
+     """
+     This class models what we index in Pinecone's metadata attribute for each vector.
+     https://docs.pinecone.io/guides/data/filter-with-metadata
+     """
+
+     # Id filtering
+     rid: str
+     field_type: str
+     field_id: str
+
+     # Date range filtering
+     date_created: Optional[int] = None
+     date_modified: Optional[int] = None
+
+     # Label filtering
+     paragraph_labels: Optional[list[str]] = None
+     field_labels: Optional[list[str]] = None
+
+     # Security
+     security_public: bool = True
+     security_ids_with_access: Optional[list[str]] = None
+
+     # Position
+     position_start_seconds: Optional[list[str]] = None
+     position_end_seconds: Optional[list[str]] = None
+     page_number: Optional[int] = None
+
+     # AI-tables metadata
+     page_with_visual: Optional[bool] = None
+     is_a_table: Optional[bool] = None
+     representation_file: Optional[str] = None
+
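+ # A hypothetical metadata payload as upserted to Pinecone (values invented for
+ # illustration; only non-None fields are sent, see `model_dump(exclude_none=True)` below):
+ #   {
+ #       "rid": "f3a1...",
+ #       "field_type": "t",
+ #       "field_id": "text",
+ #       "date_created": 1714000000,
+ #       "security_public": True,
+ #       "field_labels": ["/l/myset/mylabel"],
+ #   }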
+
+ class PineconeIndexManager(ExternalIndexManager):
+     type = ExternalIndexProviderType.PINECONE
+     supports_rollover = True
+
+     def __init__(
+         self,
+         kbid: str,
+         api_key: str,
+         indexes: dict[str, kb_pb2.PineconeIndexMetadata],
+         upsert_parallelism: int = 3,
+         delete_parallelism: int = 2,
+         upsert_timeout: float = 10.0,
+         delete_timeout: float = 10.0,
+         query_timeout: float = 10.0,
+         default_vectorset: Optional[str] = None,
+         rollover_indexes: Optional[dict[str, kb_pb2.PineconeIndexMetadata]] = None,
+     ):
+         super().__init__(kbid=kbid)
+         assert api_key != ""
+         self.api_key = api_key
+         self.indexes = indexes
+         self.rollover_indexes = rollover_indexes or {}
+         self.pinecone = get_pinecone()
+         self.upsert_parallelism = upsert_parallelism
+         self.delete_parallelism = delete_parallelism
+         self.upsert_timeout = upsert_timeout
+         self.delete_timeout = delete_timeout
+         self.query_timeout = query_timeout
+         self.default_vectorset = default_vectorset
+
+     def get_data_plane(self, index_host: str) -> DataPlane:
+         return self.pinecone.data_plane(api_key=self.api_key, index_host=index_host)
+
+     @classmethod
+     async def create_indexes(
+         cls,
+         kbid: str,
+         request: kb_pb2.CreateExternalIndexProviderMetadata,
+         indexes: list[VectorsetExternalIndex],
+     ) -> kb_pb2.StoredExternalIndexProviderMetadata:
+         created_indexes = []
+         metadata = kb_pb2.StoredExternalIndexProviderMetadata(
+             type=kb_pb2.ExternalIndexProviderType.PINECONE
+         )
+         api_key = request.pinecone_config.api_key
+         metadata.pinecone_config.encrypted_api_key = get_endecryptor().encrypt(api_key)
+         metadata.pinecone_config.serverless_cloud = request.pinecone_config.serverless_cloud
+         pinecone = get_pinecone().control_plane(api_key=api_key)
+         serverless_cloud = to_pinecone_serverless_cloud_payload(request.pinecone_config.serverless_cloud)
+         for index in indexes:
+             vectorset_id = index.vectorset_id
+             index_name = PineconeIndexManager.get_index_name()
+             index_dimension = index.dimension
+             similarity_metric = to_pinecone_index_metric(index.similarity)
+             logger.info(
+                 "Creating pinecone index",
+                 extra={
+                     "kbid": kbid,
+                     "index_name": index_name,
+                     "similarity": similarity_metric,
+                     "vector_dimension": index_dimension,
+                     "vectorset_id": vectorset_id,
+                     "cloud": serverless_cloud,
+                 },
+             )
+             try:
+                 index_host = await pinecone.create_index(
+                     name=index_name,
+                     dimension=index_dimension,
+                     metric=similarity_metric,
+                     serverless_cloud=serverless_cloud,
+                 )
+                 created_indexes.append(index_name)
+             except PineconeAPIError as exc:
+                 # Try index creation rollback
+                 for index_name in created_indexes:
+                     try:
+                         await cls._delete_index(api_key, index_name)
+                     except Exception:
+                         logger.exception("Could not rollback created pinecone indexes")
+                 raise ExternalIndexCreationError("pinecone", exc.message) from exc
+             metadata.pinecone_config.indexes[vectorset_id].CopyFrom(
+                 kb_pb2.PineconeIndexMetadata(
+                     index_name=index_name,
+                     index_host=index_host,
+                     vector_dimension=index.dimension,
+                     similarity=index.similarity,
+                 )
+             )
+         return metadata
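+     # Note: the API key is stored encrypted (via `get_endecryptor().encrypt` above) and,
+     # if creating any index fails, every index created so far in the loop is deleted
+     # before the error is re-raised as an ExternalIndexCreationError.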
+
+     @classmethod
+     async def delete_indexes(
+         cls,
+         kbid: str,
+         stored: kb_pb2.StoredExternalIndexProviderMetadata,
+     ) -> None:
+         api_key = get_endecryptor().decrypt(stored.pinecone_config.encrypted_api_key)
+         # Delete all indexes stored in the config and passed as parameters
+         for index_metadata in stored.pinecone_config.indexes.values():
+             index_name = index_metadata.index_name
+             try:
+                 logger.info("Deleting pinecone index", extra={"kbid": kbid, "index_name": index_name})
+                 await cls._delete_index(api_key, index_name)
+             except Exception:
+                 logger.exception(
+                     "Error deleting pinecone index", extra={"kbid": kbid, "index_name": index_name}
+                 )
+
+     @classmethod
+     @backoff.on_exception(
+         backoff.expo,
+         (PineconeAPIError,),
+         jitter=backoff.random_jitter,
+         max_tries=3,
+     )
+     async def _delete_index(cls, api_key: str, index_name: str) -> None:
+         control_plane = get_pinecone().control_plane(api_key=api_key)
+         await control_plane.delete_index(index_name)
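+     # The decorator above retries `_delete_index` on PineconeAPIError with exponential
+     # backoff plus random jitter, giving up after 3 attempts; any other exception
+     # propagates immediately to the caller.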
+
+     async def rollover_create_indexes(
+         self, stored: kb_pb2.StoredExternalIndexProviderMetadata
+     ) -> kb_pb2.StoredExternalIndexProviderMetadata:
+         result = kb_pb2.StoredExternalIndexProviderMetadata()
+         result.CopyFrom(stored)
+         control_plane = get_pinecone().control_plane(api_key=self.api_key)
+         created_indexes = []
+         cloud = to_pinecone_serverless_cloud_payload(stored.pinecone_config.serverless_cloud)
+         try:
+             for vectorset_id, index in stored.pinecone_config.indexes.items():
+                 rollover_index_name = PineconeIndexManager.get_index_name()
+                 index_dimension = index.vector_dimension
+                 similarity_metric = to_pinecone_index_metric(index.similarity)
+                 logger.info(
+                     "Creating pinecone rollover index",
+                     extra={
+                         "kbid": self.kbid,
+                         "index_name": index.index_name,
+                         "rollover_index_name": rollover_index_name,
+                         "similarity": similarity_metric,
+                         "vector_dimension": index_dimension,
+                         "vectorset_id": vectorset_id,
+                     },
+                 )
+                 try:
+                     index_host = await control_plane.create_index(
+                         name=rollover_index_name,
+                         dimension=index_dimension,
+                         metric=similarity_metric,
+                         serverless_cloud=cloud,
+                     )
+                     result.pinecone_config.indexes[vectorset_id].MergeFrom(
+                         kb_pb2.PineconeIndexMetadata(
+                             index_name=rollover_index_name,
+                             index_host=index_host,
+                             vector_dimension=index_dimension,
+                             similarity=index.similarity,
+                         )
+                     )
+                     created_indexes.append(rollover_index_name)
+                 except PineconeAPIError as exc:
+                     raise ExternalIndexCreationError("pinecone", exc.message) from exc
+         except Exception:
+             # Rollback any created indexes
+             for index_name in created_indexes:
+                 try:
+                     await self.__class__._delete_index(self.api_key, index_name)
+                 except Exception:
+                     logger.exception(
+                         "Could not rollback created pinecone index",
+                         extra={
+                             "kbid": self.kbid,
+                             "index_name": index_name,
+                         },
+                     )
+             raise
+
+         # Wait for all indexes to be in the ready state
+         wait_tasks = []
+         for index_name in created_indexes:
+             wait_tasks.append(
+                 asyncio.create_task(self.wait_for_index_ready(index_name, max_wait_seconds=60))
+             )
+         if len(wait_tasks) > 0:
+             try:
+                 await asyncio.gather(*wait_tasks)
+             except asyncio.TimeoutError:
+                 logger.warning(
+                     "Timeout waiting for pinecone indexes to be ready",
+                     extra={"kbid": self.kbid, "indexes": created_indexes},
+                 )
+
+         # Clear the rollover indexes and update the stored metadata
+         self.rollover_indexes.clear()
+         self.rollover_indexes = dict(result.pinecone_config.indexes)
+         return result
+
+     async def wait_for_index_ready(self, index_name: str, max_wait_seconds: int = 10) -> None:
+         """
+         Wait for an index to be ready.
+         Params:
+         - `index_name`: The name of the index to wait for.
+         - `max_wait_seconds`: The maximum number of seconds to wait.
+         """
+         control_plane = self.pinecone.control_plane(api_key=self.api_key)
+         for _ in range(max_wait_seconds):
+             try:
+                 index = await control_plane.describe_index(index_name)
+                 if index.status.ready:
+                     return
+             except PineconeAPIError:
+                 logger.exception(
+                     "Failed to describe index while waiting for it to become ready.",
+                     extra={"kbid": self.kbid, "index_name": index_name},
+                 )
+             await asyncio.sleep(1)
+
+         raise TimeoutError(f"Index {index_name} did not become ready after {max_wait_seconds} seconds.")
+
+     async def rollover_cutover_indexes(self) -> None:
+         assert len(self.rollover_indexes) > 0, "No rollover indexes to cutover to"
+         control_plane = self.pinecone.control_plane(api_key=self.api_key)
+         for index in self.indexes.values():
+             index_name = index.index_name
+             try:
+                 await control_plane.delete_index(index.index_name)
+             except Exception:
+                 logger.exception(
+                     "Error deleting pinecone index on cutover",
+                     extra={"kbid": self.kbid, "index_name": index_name},
+                 )
+         self.indexes.clear()
+         self.indexes.update(self.rollover_indexes)
+
+     @classmethod
+     def get_index_name(cls) -> str:
+         """
+         Index names can't be longer than 45 characters and can only contain lowercase
+         alphanumeric characters and hyphens: https://docs.pinecone.io/troubleshooting/restrictions-on-index-names
+
+         We generate a unique id for each pinecone index created.
+         `nuclia-` is prepended to easily identify which indexes are created by Nuclia.
+
+         Example:
+         >>> get_index_name()
+         'nuclia-2d899e8a0af54ac9a5addbd483d02ec9'
+         """
+         return f"nuclia-{uuid4().hex}"
+
+     async def _delete_resource_to_index(self, index_host: str, resource_uuid: str) -> None:
+         data_plane = self.get_data_plane(index_host=index_host)
+         with manager_observer({"operation": "delete_by_resource_prefix"}):
+             await data_plane.delete_by_id_prefix(
+                 id_prefix=resource_uuid,
+                 max_parallel_batches=self.delete_parallelism,
+                 batch_timeout=self.delete_timeout,
+             )
+
+     async def _delete_resource(self, resource_uuid: str) -> None:
+         """
+         Deletes by resource uuid on all indexes in parallel.
+         """
+         delete_tasks = []
+         for index in self.indexes.values():
+             index_host = index.index_host
+             delete_tasks.append(
+                 asyncio.create_task(
+                     self._delete_resource_to_index(
+                         index_host=index_host,
+                         resource_uuid=resource_uuid,
+                     )
+                 )
+             )
+         if len(delete_tasks) > 0:
+             await asyncio.gather(*delete_tasks)
+
+     def get_vectorsets_in_resource(self, index_data: Resource) -> set[str]:
+         vectorsets: set[str] = set()
+         for _, paragraph in iter_paragraphs(index_data):
+             if not paragraph.sentences and not paragraph.vectorsets_sentences:
+                 continue
+             if paragraph.sentences and self.default_vectorset:
+                 vectorsets.add(self.default_vectorset)
+             for vectorset_id, vectorsets_sentences in paragraph.vectorsets_sentences.items():
+                 if vectorsets_sentences.sentences:
+                     vectorsets.add(vectorset_id)
+             # Once we have found at least one paragraph with vectors, we can stop iterating
+             return vectorsets
+         return vectorsets
+
+     def get_index_host(self, vectorset_id: str, rollover: bool = False) -> str:
+         if rollover:
+             return self.rollover_indexes[vectorset_id].index_host
+         else:
+             return self.indexes[vectorset_id].index_host
+
+     def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
+         prefixes_to_delete = set()
+         # TODO: migrate to vector_prefixes_to_delete
+         for field_id in index_data.sentences_to_delete:
+             try:
+                 delete_vid = VectorId.from_string(field_id)
+                 prefixes_to_delete.add(delete_vid.field_id.full())
+             except ValueError:  # pragma: no cover
+                 try:
+                     delete_field = FieldId.from_string(field_id)
+                     prefixes_to_delete.add(delete_field.full())
+                 except ValueError:
+                     logger.warning(f"Invalid id to delete sentences from: {field_id}.")
+                     continue
+         for paragraph_id in index_data.paragraphs_to_delete:
+             try:
+                 delete_pid = ParagraphId.from_string(paragraph_id)
+                 prefixes_to_delete.add(delete_pid.field_id.full())
+             except ValueError:  # pragma: no cover
+                 try:
+                     delete_field = FieldId.from_string(paragraph_id)
+                     prefixes_to_delete.add(delete_field.full())
+                 except ValueError:
+                     logger.warning(f"Invalid id to delete: {paragraph_id}. ParagraphId expected.")
+                     continue
+         return prefixes_to_delete
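+     # Illustration (assumed id layout): a paragraph id like "rid/t/text/10-42" maps to
+     # the field prefix "rid/t/text", so a single delete-by-prefix call removes every
+     # vector previously indexed for that field before its new vectors are upserted.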
+
+     async def _index_resource(
+         self, resource_uuid: str, index_data: Resource, to_rollover_indexes: bool = False
+     ) -> None:
+         """
+         Index a NucliaDB resource into a Pinecone index.
+         Handles multiple vectorsets.
+
+         The algorithm is as follows:
+         - First, get the vectorsets for which we have vectors to upsert.
+         - Then, delete any previously existing vectors with the same field prefixes on all vectorsets.
+         - Then, iterate the fields and the paragraphs to compute the base metadata for each vector.
+         - After that, iterate the sentences and compute the list of vectors to upsert, extending the
+           vector metadata with any sentence-specific metadata. This is done for each vectorset.
+         - Finally, upsert the vectors to each vectorset index in parallel.
+         """
+         vectorsets = self.get_vectorsets_in_resource(index_data)
+         prefixes_to_delete = self.get_prefixes_to_delete(index_data)
+         delete_tasks = []
+         for vectorset in vectorsets:
+             index_host = self.get_index_host(vectorset_id=vectorset, rollover=to_rollover_indexes)
+             delete_tasks.append(
+                 asyncio.create_task(
+                     self._delete_by_prefix_to_index(
+                         index_host=index_host,
+                         prefixes_to_delete=prefixes_to_delete,
+                     )
+                 )
+             )
+         if len(delete_tasks) > 0:
+             await asyncio.gather(*delete_tasks)
+
+         with manager_observer({"operation": "compute_base_vector_metadatas"}):
+             base_vector_metadatas: dict[str, VectorMetadata] = await self.compute_base_vector_metadatas(
+                 index_data, resource_uuid
+             )
+
+         with manager_observer({"operation": "compute_vectorset_vectors"}):
+             vectorset_vectors: dict[str, list[PineconeVector]] = await self.compute_vectorset_vectors(
+                 index_data, base_vector_metadatas
+             )
+
+         upsert_tasks = []
+         for vectorset_id, vectors in vectorset_vectors.items():
+             index_host = self.get_index_host(vectorset_id=vectorset_id, rollover=to_rollover_indexes)
+             upsert_tasks.append(
+                 asyncio.create_task(
+                     self._upsert_to_index(
+                         index_host=index_host,
+                         vectors=vectors,
+                     )
+                 )
+             )
+         if len(upsert_tasks) > 0:
+             await asyncio.gather(*upsert_tasks)
+
+     async def _upsert_to_index(self, index_host: str, vectors: list[PineconeVector]) -> None:
+         if len(vectors) == 0:  # pragma: no cover
+             return
+         data_plane = self.get_data_plane(index_host=index_host)
+         with manager_observer({"operation": "upsert_in_batches"}):
+             await data_plane.upsert_in_batches(
+                 vectors=vectors,
+                 max_parallel_batches=self.upsert_parallelism,
+                 batch_timeout=self.upsert_timeout,
+             )
+
+     async def _delete_by_prefix_to_index(self, index_host: str, prefixes_to_delete: set[str]) -> None:
+         if len(prefixes_to_delete) == 0:  # pragma: no cover
+             return
+         data_plane = self.get_data_plane(index_host=index_host)
+         with manager_observer({"operation": "delete_by_prefix"}):
+             for prefix in prefixes_to_delete:
+                 await data_plane.delete_by_id_prefix(
+                     id_prefix=prefix,
+                     max_parallel_batches=self.delete_parallelism,
+                     batch_timeout=self.delete_timeout,
+                 )
+
+     async def compute_base_vector_metadatas(
+         self, index_data: Resource, resource_uuid: str
+     ) -> dict[str, VectorMetadata]:
+         # This is a CPU bound operation and when the number of vectors is large, it can take a
+         # long time (around a second).
+         # Ideally, we would use a ProcessPoolExecutor to parallelize the computation of the metadata, but
+         # the Resource protobuf is not pickleable, so we can't use it in a ProcessPoolExecutor. This will
+         # be less of a problem when we move pinecone indexing to its own consumer.
+         return await asyncio.to_thread(self._compute_base_vector_metadatas, index_data, resource_uuid)
+
+     def _compute_base_vector_metadatas(
+         self, index_data: Resource, resource_uuid: str
+     ) -> dict[str, VectorMetadata]:
+         """
+         Compute the base metadata for each vector in the resource.
+         This metadata is common to all vectors in the same paragraph, for all vectorsets.
+         """
+         metadatas: dict[str, VectorMetadata] = {}
+         security_public = True
+         security_ids_with_access = None
+         if index_data.HasField("security"):
+             security_public = False
+             security_ids_with_access = list(set(index_data.security.access_groups))
+
+         resource_labels = set(index_data.labels)
+         date_created = index_data.metadata.created.ToSeconds()
+         date_modified = index_data.metadata.modified.ToSeconds()
+
+         # First off, iterate the fields and the paragraphs to compute the metadata for
+         # each vector, specifically the labels that will be used for filtering.
+         for field_id, text_info in index_data.texts.items():
+             field_labels = set(text_info.labels)
+             field_paragraphs = index_data.paragraphs.get(field_id)
+             if field_paragraphs is None:
+                 logger.info(
+                     "Paragraphs not found for field",
+                     extra={"kbid": self.kbid, "rid": resource_uuid, "field_id": field_id},
+                 )
+                 continue
+
+             paragraph: IndexParagraph
+             for paragraph_id, paragraph in field_paragraphs.paragraphs.items():
+                 fid = ParagraphId.from_string(paragraph_id).field_id
+                 vector_metadata = VectorMetadata(
+                     rid=resource_uuid,
+                     field_type=fid.type,
+                     field_id=fid.key,
+                     date_created=date_created,
+                     date_modified=date_modified,
+                     security_public=security_public,
+                     security_ids_with_access=security_ids_with_access,
+                 )
+                 metadatas[paragraph_id] = vector_metadata
+                 final_field_labels = resource_labels.union(field_labels)
+                 if final_field_labels:
+                     vector_metadata.field_labels = unique(discard_labels(list(final_field_labels)))
+                 final_paragraph_labels = paragraph.labels
+                 if final_paragraph_labels:
+                     vector_metadata.paragraph_labels = unique(
+                         discard_labels(list(final_paragraph_labels))
+                     )
+         return metadatas
+
+     async def compute_vectorset_vectors(
+         self, index_data: Resource, base_vector_metadatas: dict[str, VectorMetadata]
+     ) -> dict[str, list[PineconeVector]]:
+         # This is a CPU bound operation and when the number of vectors is large, it can take a
+         # long time (around a second).
+         # Ideally, we would use a ProcessPoolExecutor to parallelize the computation of the metadata, but
+         # the Resource protobuf is not pickleable, so we can't use it in a ProcessPoolExecutor. This will
+         # be less of a problem when we move pinecone indexing to its own consumer.
+         return await asyncio.to_thread(
+             self._compute_vectorset_vectors, index_data, base_vector_metadatas
+         )
+
+     def _compute_vectorset_vectors(
+         self, index_data: Resource, base_vector_metadatas: dict[str, VectorMetadata]
+     ) -> dict[str, list[PineconeVector]]:
+         vectorset_vectors: dict[str, list[PineconeVector]] = {}
+         for index_paragraph_id, index_paragraph in iter_paragraphs(index_data):
+             # We must compute the vectors for each vectorset present in the paragraph.
+             vectorset_iterators = {}
+             if index_paragraph.sentences and self.default_vectorset:
+                 vectorset_iterators[self.default_vectorset] = index_paragraph.sentences.items()
+             for vectorset_id, vector_sentences in index_paragraph.vectorsets_sentences.items():
+                 if vector_sentences.sentences:
+                     vectorset_iterators[vectorset_id] = vector_sentences.sentences.items()
+
+             vector_sentence: VectorSentence
+             for vectorset_id, sentences_iterator in vectorset_iterators.items():
+                 for sentence_id, vector_sentence in sentences_iterator:
+                     vector_metadata_to_copy = base_vector_metadatas.get(index_paragraph_id)
+                     if vector_metadata_to_copy is None:
+                         logger.warning(
+                             f"Metadata not found for sentences of paragraph {index_paragraph_id}"
+                         )
+                         continue
+                     # Copy the initial metadata collected at paragraph parsing in case
+                     # the metadata is different for each vectorset
+                     vector_metadata = deepcopy(vector_metadata_to_copy)
+
+                     # AI-tables metadata
+                     if vector_sentence.metadata.page_with_visual:
+                         vector_metadata.page_with_visual = True
+                     if vector_sentence.metadata.representation.is_a_table:
+                         vector_metadata.is_a_table = True
+                     if vector_sentence.metadata.representation.file:
+                         vector_metadata.representation_file = (
+                             vector_sentence.metadata.representation.file
+                         )
+
+                     # Video positions
+                     if len(vector_sentence.metadata.position.start_seconds):
+                         vector_metadata.position_start_seconds = list(
+                             map(str, vector_sentence.metadata.position.start_seconds)
+                         )
+                     if len(vector_sentence.metadata.position.end_seconds):
+                         vector_metadata.position_end_seconds = list(
+                             map(str, vector_sentence.metadata.position.end_seconds)
+                         )
+                     vector_metadata.page_number = vector_sentence.metadata.position.page_number
+                     try:
+                         pc_vector = PineconeVector(
+                             id=sentence_id,
+                             values=list(vector_sentence.vector),
+                             metadata=vector_metadata.model_dump(exclude_none=True),
+                         )
+                     except MetadataTooLargeError as exc:  # pragma: no cover
+                         logger.error(f"Invalid Pinecone vector. Metadata is too large. Skipping: {exc}")
+                         continue
+
+                     vectors = vectorset_vectors.setdefault(vectorset_id, [])
+                     vectors.append(pc_vector)
+         return vectorset_vectors
+
+     async def _query(self, request: SearchRequest) -> PineconeQueryResults:
+         if len(request.vector) == 0:
+             return PineconeQueryResults(results=QueryResponse(matches=[]))
+         vectorset_id = request.vectorset or self.default_vectorset or "__default__"
+         index_host = self.get_index_host(vectorset_id=vectorset_id)
+         data_plane = self.get_data_plane(index_host=index_host)
+         filter = convert_to_pinecone_filter(request)
+         top_k = (request.page_number + 1) * request.result_per_page
+         query_results = await data_plane.query(
+             vector=list(request.vector),
+             top_k=top_k,
+             include_values=False,
+             include_metadata=True,
+             filter=filter,
+             timeout=self.query_timeout,
+         )
+         # filter by min score manually, as Pinecone doesn't implement this feature
+         results = QueryResponse(
+             matches=[
+                 match for match in query_results.matches if match.score >= request.min_score_semantic
+             ]
+         )
+         return PineconeQueryResults(results=results)
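+     # Worked example for the top_k computation above: with zero-based page_number=1 and
+     # result_per_page=20, top_k = (1 + 1) * 20 = 40, i.e. every match up to and including
+     # the requested page is fetched (Pinecone's query API has no offset parameter, so the
+     # page slice has to be taken downstream).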
+
+     async def _get_index_counts(self) -> IndexCounts:
+         if self.kbid in COUNTERS_CACHE:
+             # Cache hit
+             return COUNTERS_CACHE[self.kbid]
+         total = IndexCounts(
+             fields=0,
+             paragraphs=0,
+             sentences=0,
+         )
+         tasks = []
+         vectorset_results: dict[str, IndexCounts] = {}
+
+         for vectorset_id in self.indexes.keys():
+             tasks.append(
+                 asyncio.create_task(self._get_vectorset_index_counts(vectorset_id, vectorset_results))
+             )
+         if len(tasks) > 0:
+             await asyncio.gather(*tasks)
+
+         for _, counts in vectorset_results.items():
+             total.paragraphs += counts.paragraphs
+             total.sentences += counts.sentences
+         COUNTERS_CACHE[self.kbid] = total
+         return total
+
+     async def _get_vectorset_index_counts(
+         self, vectorset_id: str, results: dict[str, IndexCounts]
+     ) -> None:
+         index_host = self.get_index_host(vectorset_id=vectorset_id)
+         data_plane = self.get_data_plane(index_host=index_host)
+         try:
+             index_stats = await data_plane.stats()
+             results[vectorset_id] = IndexCounts(
+                 fields=0,
+                 paragraphs=index_stats.totalVectorCount,
+                 sentences=index_stats.totalVectorCount,
+             )
+         except Exception:
+             logger.exception(
+                 "Error getting index stats",
+                 extra={"kbid": self.kbid, "provider": self.type.value, "index_host": index_host},
+             )
+
+
+ def discard_labels(labels: list[str]) -> list[str]:
+     return [
+         label
+         for label in labels
+         if not any(label.startswith(prefix) for prefix in DISCARDED_LABEL_PREFIXES)
+     ]
+
+
+ def unique(labels: list[str]) -> list[str]:
+     return list(set(labels))
+
+
+ def convert_to_pinecone_filter(request: SearchRequest) -> Optional[dict[str, Any]]:
+     """
+     Returns a Pinecone filter from a SearchRequest so that RAG features supported by Nuclia
+     can be used on Pinecone indexes.
+     """
+     and_terms = []
+     if request.HasField("filter"):
+         # Label filtering
+         if len(request.filter.paragraph_labels) > 0 and len(request.filter.field_labels) > 0:
+             raise ValueError("Cannot filter by paragraph and field labels in the same request")
+
+         decoded_expression: dict[str, Any] = json.loads(request.filter.labels_expression)
+         if len(request.filter.paragraph_labels) > 0:
+             and_terms.append(convert_label_filter_expression("paragraph_labels", decoded_expression))
+         else:
+             and_terms.append(convert_label_filter_expression("field_labels", decoded_expression))
+
+     if request.HasField("timestamps"):
+         # Date range filtering
+         and_terms.extend(convert_timestamp_filter(request.timestamps))
+
+     if len(request.key_filters) > 0:
+         # Filter by resource_id
+         and_terms.append({"rid": {FilterOperator.IN: list(set(request.key_filters))}})
+
+     if len(request.security.access_groups):
+         # Security filtering
+         security_term = {
+             LogicalOperator.OR: [
+                 {"security_public": {"$eq": True}},
+                 {
+                     "security_ids_with_access": {
+                         FilterOperator.IN: list(set(request.security.access_groups))
+                     }
+                 },
+             ]
+         }
+         and_terms.append(security_term)
+
+     if len(request.fields) > 0:
+         # Filter by field_id
+         fields_term = {
+             "field_id": {FilterOperator.IN: list({field_id.strip("/") for field_id in request.fields})}
+         }
+         and_terms.append(fields_term)
+     if len(and_terms) == 0:
+         return None
+     if len(and_terms) == 1:
+         return and_terms[0]
+     return {LogicalOperator.AND: and_terms}
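+ # A hypothetical combined filter (invented values) as produced above, filtering by one
+ # field label, a creation date lower bound and two resource ids:
+ #   {
+ #       "$and": [
+ #           {"field_labels": {"$in": ["/l/myset/mylabel"]}},
+ #           {"date_created": {"$gte": 1714000000}},
+ #           {"rid": {"$in": ["rid1", "rid2"]}},
+ #       ]
+ #   }
+ # assuming LogicalOperator.AND == "$and", FilterOperator.IN == "$in" and
+ # FilterOperator.GREATER_THAN_OR_EQUAL == "$gte" in the aiopynecone client.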
+
+
+ def convert_label_filter_expression(
+     field: str, expression: dict[str, Any], negative: bool = False
+ ) -> dict[str, Any]:
+     """
+     Converts internal label filter expressions to Pinecone's metadata query language.
+
+     Note: Since Pinecone does not support negation of expressions, we need to use De Morgan's laws to
+     convert the expression to a positive one.
+     """
+     if "literal" in expression:
+         if negative:
+             return {field: {FilterOperator.NOT_IN: [expression["literal"]]}}
+         else:
+             return {field: {FilterOperator.IN: [expression["literal"]]}}
+
+     if "and" in expression:
+         if negative:
+             return {
+                 LogicalOperator.OR: [
+                     convert_label_filter_expression(field, sub_expression, negative=True)
+                     for sub_expression in expression["and"]
+                 ]
+             }
+         else:
+             return {
+                 LogicalOperator.AND: [
+                     convert_label_filter_expression(field, sub_expression)
+                     for sub_expression in expression["and"]
+                 ]
+             }
+
+     if "or" in expression:
+         if negative:
+             return {
+                 LogicalOperator.AND: [
+                     convert_label_filter_expression(field, sub_expression, negative=True)
+                     for sub_expression in expression["or"]
+                 ]
+             }
+         else:
+             return {
+                 LogicalOperator.OR: [
+                     convert_label_filter_expression(field, sub_expression)
+                     for sub_expression in expression["or"]
+                 ]
+             }
+
+     if "not" in expression:
+         return convert_label_filter_expression(field, expression["not"], negative=True)
+
+     raise ValueError(f"Invalid label filter expression: {expression}")
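+ # Worked example of the De Morgan conversion above (invented labels):
+ #   {"not": {"and": [{"literal": "/l/a"}, {"literal": "/l/b"}]}}
+ # becomes
+ #   {LogicalOperator.OR: [
+ #       {"field_labels": {FilterOperator.NOT_IN: ["/l/a"]}},
+ #       {"field_labels": {FilterOperator.NOT_IN: ["/l/b"]}},
+ #   ]}
+ # i.e. NOT (a AND b) == (NOT a) OR (NOT b).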
+
+
+ def convert_timestamp_filter(timestamps: Timestamps) -> list[dict[str, Any]]:
+     """
+     Allows filtering by the date_created and date_modified metadata fields in Pinecone.
+     Powers date range filtering in NucliaDB.
+     """
+     and_terms = []
+     if timestamps.HasField("from_modified"):
+         and_terms.append(
+             {
+                 "date_modified": {
+                     FilterOperator.GREATER_THAN_OR_EQUAL: timestamps.from_modified.ToSeconds()
+                 }
+             }
+         )
+     if timestamps.HasField("to_modified"):
+         and_terms.append(
+             {"date_modified": {FilterOperator.LESS_THAN_OR_EQUAL: timestamps.to_modified.ToSeconds()}}
+         )
+     if timestamps.HasField("from_created"):
+         and_terms.append(
+             {"date_created": {FilterOperator.GREATER_THAN_OR_EQUAL: timestamps.from_created.ToSeconds()}}
+         )
+     if timestamps.HasField("to_created"):
+         and_terms.append(
+             {"date_created": {FilterOperator.LESS_THAN_OR_EQUAL: timestamps.to_created.ToSeconds()}}
+         )
+     return and_terms
+
+
+ def iter_paragraphs(resource: Resource) -> Iterator[tuple[str, IndexParagraph]]:
+     for _, paragraphs in resource.paragraphs.items():
+         for paragraph_id, paragraph in paragraphs.paragraphs.items():
+             yield paragraph_id, paragraph
+
+
+ def to_pinecone_index_metric(similarity: utils_pb2.VectorSimilarity.ValueType) -> str:
+     return {
+         utils_pb2.VectorSimilarity.COSINE: "cosine",
+         utils_pb2.VectorSimilarity.DOT: "dotproduct",
+     }[similarity]
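+ # Note: the mapping covers the two similarity metrics NucliaDB currently defines
+ # (cosine and dot product); an unmapped enum value would surface here as a KeyError.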
+
+
+ def to_pinecone_serverless_cloud_payload(
+     serverless: kb_pb2.PineconeServerlessCloud.ValueType,
+ ) -> dict[str, str]:
+     return {
+         kb_pb2.PineconeServerlessCloud.AWS_EU_WEST_1: {
+             "cloud": "aws",
+             "region": "eu-west-1",
+         },
+         kb_pb2.PineconeServerlessCloud.AWS_US_EAST_1: {
+             "cloud": "aws",
+             "region": "us-east-1",
+         },
+         kb_pb2.PineconeServerlessCloud.AWS_US_WEST_2: {
+             "cloud": "aws",
+             "region": "us-west-2",
+         },
+         kb_pb2.PineconeServerlessCloud.AZURE_EASTUS2: {
+             "cloud": "azure",
+             "region": "eastus2",
+         },
+         kb_pb2.PineconeServerlessCloud.GCP_US_CENTRAL1: {
+             "cloud": "gcp",
+             "region": "us-central1",
+         },
+     }[serverless]