nucliadb 6.7.2.post4874__py3-none-any.whl → 6.10.0.post5705__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (246) hide show
  1. migrations/0023_backfill_pg_catalog.py +8 -4
  2. migrations/0028_extracted_vectors_reference.py +1 -1
  3. migrations/0029_backfill_field_status.py +3 -4
  4. migrations/0032_remove_old_relations.py +2 -3
  5. migrations/0038_backfill_catalog_field_labels.py +8 -4
  6. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  7. migrations/0040_migrate_search_configurations.py +79 -0
  8. migrations/0041_reindex_conversations.py +137 -0
  9. migrations/pg/0010_shards_index.py +34 -0
  10. nucliadb/search/api/v1/resource/utils.py → migrations/pg/0011_catalog_statistics.py +5 -6
  11. migrations/pg/0012_catalog_statistics_undo.py +26 -0
  12. nucliadb/backups/create.py +2 -15
  13. nucliadb/backups/restore.py +4 -15
  14. nucliadb/backups/tasks.py +4 -1
  15. nucliadb/common/back_pressure/cache.py +2 -3
  16. nucliadb/common/back_pressure/materializer.py +7 -13
  17. nucliadb/common/back_pressure/settings.py +6 -6
  18. nucliadb/common/back_pressure/utils.py +1 -0
  19. nucliadb/common/cache.py +9 -9
  20. nucliadb/common/catalog/__init__.py +79 -0
  21. nucliadb/common/catalog/dummy.py +36 -0
  22. nucliadb/common/catalog/interface.py +85 -0
  23. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +330 -232
  24. nucliadb/common/catalog/utils.py +56 -0
  25. nucliadb/common/cluster/manager.py +8 -23
  26. nucliadb/common/cluster/rebalance.py +484 -112
  27. nucliadb/common/cluster/rollover.py +36 -9
  28. nucliadb/common/cluster/settings.py +4 -9
  29. nucliadb/common/cluster/utils.py +34 -8
  30. nucliadb/common/context/__init__.py +7 -8
  31. nucliadb/common/context/fastapi.py +1 -2
  32. nucliadb/common/datamanagers/__init__.py +2 -4
  33. nucliadb/common/datamanagers/atomic.py +9 -2
  34. nucliadb/common/datamanagers/cluster.py +1 -2
  35. nucliadb/common/datamanagers/fields.py +3 -4
  36. nucliadb/common/datamanagers/kb.py +6 -6
  37. nucliadb/common/datamanagers/labels.py +2 -3
  38. nucliadb/common/datamanagers/resources.py +10 -33
  39. nucliadb/common/datamanagers/rollover.py +5 -7
  40. nucliadb/common/datamanagers/search_configurations.py +1 -2
  41. nucliadb/common/datamanagers/synonyms.py +1 -2
  42. nucliadb/common/datamanagers/utils.py +4 -4
  43. nucliadb/common/datamanagers/vectorsets.py +4 -4
  44. nucliadb/common/external_index_providers/base.py +32 -5
  45. nucliadb/common/external_index_providers/manager.py +5 -34
  46. nucliadb/common/external_index_providers/settings.py +1 -27
  47. nucliadb/common/filter_expression.py +129 -41
  48. nucliadb/common/http_clients/exceptions.py +8 -0
  49. nucliadb/common/http_clients/processing.py +16 -23
  50. nucliadb/common/http_clients/utils.py +3 -0
  51. nucliadb/common/ids.py +82 -58
  52. nucliadb/common/locking.py +1 -2
  53. nucliadb/common/maindb/driver.py +9 -8
  54. nucliadb/common/maindb/local.py +5 -5
  55. nucliadb/common/maindb/pg.py +9 -8
  56. nucliadb/common/nidx.py +22 -5
  57. nucliadb/common/vector_index_config.py +1 -1
  58. nucliadb/export_import/datamanager.py +4 -3
  59. nucliadb/export_import/exporter.py +11 -19
  60. nucliadb/export_import/importer.py +13 -6
  61. nucliadb/export_import/tasks.py +2 -0
  62. nucliadb/export_import/utils.py +6 -18
  63. nucliadb/health.py +2 -2
  64. nucliadb/ingest/app.py +8 -8
  65. nucliadb/ingest/consumer/consumer.py +8 -10
  66. nucliadb/ingest/consumer/pull.py +10 -8
  67. nucliadb/ingest/consumer/service.py +5 -30
  68. nucliadb/ingest/consumer/shard_creator.py +16 -5
  69. nucliadb/ingest/consumer/utils.py +1 -1
  70. nucliadb/ingest/fields/base.py +37 -49
  71. nucliadb/ingest/fields/conversation.py +55 -9
  72. nucliadb/ingest/fields/exceptions.py +1 -2
  73. nucliadb/ingest/fields/file.py +22 -8
  74. nucliadb/ingest/fields/link.py +7 -7
  75. nucliadb/ingest/fields/text.py +2 -3
  76. nucliadb/ingest/orm/brain_v2.py +89 -57
  77. nucliadb/ingest/orm/broker_message.py +2 -4
  78. nucliadb/ingest/orm/entities.py +10 -209
  79. nucliadb/ingest/orm/index_message.py +128 -113
  80. nucliadb/ingest/orm/knowledgebox.py +91 -59
  81. nucliadb/ingest/orm/processor/auditing.py +1 -3
  82. nucliadb/ingest/orm/processor/data_augmentation.py +1 -2
  83. nucliadb/ingest/orm/processor/processor.py +98 -153
  84. nucliadb/ingest/orm/processor/sequence_manager.py +1 -2
  85. nucliadb/ingest/orm/resource.py +82 -71
  86. nucliadb/ingest/orm/utils.py +1 -1
  87. nucliadb/ingest/partitions.py +12 -1
  88. nucliadb/ingest/processing.py +17 -17
  89. nucliadb/ingest/serialize.py +202 -145
  90. nucliadb/ingest/service/writer.py +15 -114
  91. nucliadb/ingest/settings.py +36 -15
  92. nucliadb/ingest/utils.py +1 -2
  93. nucliadb/learning_proxy.py +23 -26
  94. nucliadb/metrics_exporter.py +20 -6
  95. nucliadb/middleware/__init__.py +82 -1
  96. nucliadb/migrator/datamanager.py +4 -11
  97. nucliadb/migrator/migrator.py +1 -2
  98. nucliadb/migrator/models.py +1 -2
  99. nucliadb/migrator/settings.py +1 -2
  100. nucliadb/models/internal/augment.py +614 -0
  101. nucliadb/models/internal/processing.py +19 -19
  102. nucliadb/openapi.py +2 -2
  103. nucliadb/purge/__init__.py +3 -8
  104. nucliadb/purge/orphan_shards.py +1 -2
  105. nucliadb/reader/__init__.py +5 -0
  106. nucliadb/reader/api/models.py +6 -13
  107. nucliadb/reader/api/v1/download.py +59 -38
  108. nucliadb/reader/api/v1/export_import.py +4 -4
  109. nucliadb/reader/api/v1/knowledgebox.py +37 -9
  110. nucliadb/reader/api/v1/learning_config.py +33 -14
  111. nucliadb/reader/api/v1/resource.py +61 -9
  112. nucliadb/reader/api/v1/services.py +18 -14
  113. nucliadb/reader/app.py +3 -1
  114. nucliadb/reader/reader/notifications.py +1 -2
  115. nucliadb/search/api/v1/__init__.py +3 -0
  116. nucliadb/search/api/v1/ask.py +3 -4
  117. nucliadb/search/api/v1/augment.py +585 -0
  118. nucliadb/search/api/v1/catalog.py +15 -19
  119. nucliadb/search/api/v1/find.py +16 -22
  120. nucliadb/search/api/v1/hydrate.py +328 -0
  121. nucliadb/search/api/v1/knowledgebox.py +1 -2
  122. nucliadb/search/api/v1/predict_proxy.py +1 -2
  123. nucliadb/search/api/v1/resource/ask.py +28 -8
  124. nucliadb/search/api/v1/resource/ingestion_agents.py +5 -6
  125. nucliadb/search/api/v1/resource/search.py +9 -11
  126. nucliadb/search/api/v1/retrieve.py +130 -0
  127. nucliadb/search/api/v1/search.py +28 -32
  128. nucliadb/search/api/v1/suggest.py +11 -14
  129. nucliadb/search/api/v1/summarize.py +1 -2
  130. nucliadb/search/api/v1/utils.py +2 -2
  131. nucliadb/search/app.py +3 -2
  132. nucliadb/search/augmentor/__init__.py +21 -0
  133. nucliadb/search/augmentor/augmentor.py +232 -0
  134. nucliadb/search/augmentor/fields.py +704 -0
  135. nucliadb/search/augmentor/metrics.py +24 -0
  136. nucliadb/search/augmentor/paragraphs.py +334 -0
  137. nucliadb/search/augmentor/resources.py +238 -0
  138. nucliadb/search/augmentor/utils.py +33 -0
  139. nucliadb/search/lifecycle.py +3 -1
  140. nucliadb/search/predict.py +33 -19
  141. nucliadb/search/predict_models.py +8 -9
  142. nucliadb/search/requesters/utils.py +11 -10
  143. nucliadb/search/search/cache.py +19 -42
  144. nucliadb/search/search/chat/ask.py +131 -59
  145. nucliadb/search/search/chat/exceptions.py +3 -5
  146. nucliadb/search/search/chat/fetcher.py +201 -0
  147. nucliadb/search/search/chat/images.py +6 -4
  148. nucliadb/search/search/chat/old_prompt.py +1375 -0
  149. nucliadb/search/search/chat/parser.py +510 -0
  150. nucliadb/search/search/chat/prompt.py +563 -615
  151. nucliadb/search/search/chat/query.py +453 -32
  152. nucliadb/search/search/chat/rpc.py +85 -0
  153. nucliadb/search/search/fetch.py +3 -4
  154. nucliadb/search/search/filters.py +8 -11
  155. nucliadb/search/search/find.py +33 -31
  156. nucliadb/search/search/find_merge.py +124 -331
  157. nucliadb/search/search/graph_strategy.py +14 -12
  158. nucliadb/search/search/hydrator/__init__.py +49 -0
  159. nucliadb/search/search/hydrator/fields.py +217 -0
  160. nucliadb/search/search/hydrator/images.py +130 -0
  161. nucliadb/search/search/hydrator/paragraphs.py +323 -0
  162. nucliadb/search/search/hydrator/resources.py +60 -0
  163. nucliadb/search/search/ingestion_agents.py +5 -5
  164. nucliadb/search/search/merge.py +90 -94
  165. nucliadb/search/search/metrics.py +24 -7
  166. nucliadb/search/search/paragraphs.py +7 -9
  167. nucliadb/search/search/predict_proxy.py +44 -18
  168. nucliadb/search/search/query.py +14 -86
  169. nucliadb/search/search/query_parser/fetcher.py +51 -82
  170. nucliadb/search/search/query_parser/models.py +19 -48
  171. nucliadb/search/search/query_parser/old_filters.py +20 -19
  172. nucliadb/search/search/query_parser/parsers/ask.py +5 -6
  173. nucliadb/search/search/query_parser/parsers/catalog.py +7 -11
  174. nucliadb/search/search/query_parser/parsers/common.py +21 -13
  175. nucliadb/search/search/query_parser/parsers/find.py +6 -29
  176. nucliadb/search/search/query_parser/parsers/graph.py +18 -28
  177. nucliadb/search/search/query_parser/parsers/retrieve.py +207 -0
  178. nucliadb/search/search/query_parser/parsers/search.py +15 -56
  179. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +8 -29
  180. nucliadb/search/search/rank_fusion.py +18 -13
  181. nucliadb/search/search/rerankers.py +6 -7
  182. nucliadb/search/search/retrieval.py +300 -0
  183. nucliadb/search/search/summarize.py +5 -6
  184. nucliadb/search/search/utils.py +3 -4
  185. nucliadb/search/settings.py +1 -2
  186. nucliadb/standalone/api_router.py +1 -1
  187. nucliadb/standalone/app.py +4 -3
  188. nucliadb/standalone/auth.py +5 -6
  189. nucliadb/standalone/lifecycle.py +2 -2
  190. nucliadb/standalone/run.py +5 -4
  191. nucliadb/standalone/settings.py +5 -6
  192. nucliadb/standalone/versions.py +3 -4
  193. nucliadb/tasks/consumer.py +13 -8
  194. nucliadb/tasks/models.py +2 -1
  195. nucliadb/tasks/producer.py +3 -3
  196. nucliadb/tasks/retries.py +8 -7
  197. nucliadb/train/api/utils.py +1 -3
  198. nucliadb/train/api/v1/shards.py +1 -2
  199. nucliadb/train/api/v1/trainset.py +1 -2
  200. nucliadb/train/app.py +1 -1
  201. nucliadb/train/generator.py +4 -4
  202. nucliadb/train/generators/field_classifier.py +2 -2
  203. nucliadb/train/generators/field_streaming.py +6 -6
  204. nucliadb/train/generators/image_classifier.py +2 -2
  205. nucliadb/train/generators/paragraph_classifier.py +2 -2
  206. nucliadb/train/generators/paragraph_streaming.py +2 -2
  207. nucliadb/train/generators/question_answer_streaming.py +2 -2
  208. nucliadb/train/generators/sentence_classifier.py +4 -10
  209. nucliadb/train/generators/token_classifier.py +3 -2
  210. nucliadb/train/generators/utils.py +6 -5
  211. nucliadb/train/nodes.py +3 -3
  212. nucliadb/train/resource.py +6 -8
  213. nucliadb/train/settings.py +3 -4
  214. nucliadb/train/types.py +11 -11
  215. nucliadb/train/upload.py +3 -2
  216. nucliadb/train/uploader.py +1 -2
  217. nucliadb/train/utils.py +1 -2
  218. nucliadb/writer/api/v1/export_import.py +4 -1
  219. nucliadb/writer/api/v1/field.py +15 -14
  220. nucliadb/writer/api/v1/knowledgebox.py +18 -56
  221. nucliadb/writer/api/v1/learning_config.py +5 -4
  222. nucliadb/writer/api/v1/resource.py +9 -20
  223. nucliadb/writer/api/v1/services.py +10 -132
  224. nucliadb/writer/api/v1/upload.py +73 -72
  225. nucliadb/writer/app.py +8 -2
  226. nucliadb/writer/resource/basic.py +12 -15
  227. nucliadb/writer/resource/field.py +43 -5
  228. nucliadb/writer/resource/origin.py +7 -0
  229. nucliadb/writer/settings.py +2 -3
  230. nucliadb/writer/tus/__init__.py +2 -3
  231. nucliadb/writer/tus/azure.py +5 -7
  232. nucliadb/writer/tus/dm.py +3 -3
  233. nucliadb/writer/tus/exceptions.py +3 -4
  234. nucliadb/writer/tus/gcs.py +15 -22
  235. nucliadb/writer/tus/s3.py +2 -3
  236. nucliadb/writer/tus/storage.py +3 -3
  237. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/METADATA +10 -11
  238. nucliadb-6.10.0.post5705.dist-info/RECORD +410 -0
  239. nucliadb/common/datamanagers/entities.py +0 -139
  240. nucliadb/common/external_index_providers/pinecone.py +0 -894
  241. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  242. nucliadb/search/search/hydrator.py +0 -197
  243. nucliadb-6.7.2.post4874.dist-info/RECORD +0 -383
  244. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/WHEEL +0 -0
  245. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/entry_points.txt +0 -0
  246. {nucliadb-6.7.2.post4874.dist-info → nucliadb-6.10.0.post5705.dist-info}/top_level.txt +0 -0
@@ -27,10 +27,11 @@ Backfill the data into the PG catalog
27
27
  import logging
28
28
  from typing import cast
29
29
 
30
- from nucliadb.common import datamanagers
30
+ from nucliadb.common.catalog import catalog_update, get_catalog
31
+ from nucliadb.common.catalog.pg import PGCatalog
31
32
  from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
33
  from nucliadb.ingest.orm.index_message import get_resource_index_message
33
- from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
34
+ from nucliadb.ingest.orm.resource import Resource
34
35
  from nucliadb.migrator.context import ExecutionContext
35
36
 
36
37
  logger = logging.getLogger(__name__)
@@ -43,6 +44,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
43
44
  if not isinstance(context.kv_driver, PGDriver):
44
45
  return
45
46
 
47
+ if not isinstance(get_catalog(), PGCatalog):
48
+ return
49
+
46
50
  BATCH_SIZE = 100
47
51
  async with context.kv_driver.rw_transaction() as txn:
48
52
  txn = cast(PGTransaction, txn)
@@ -69,13 +73,13 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
69
73
  # Index each resource
70
74
  for rid in resources_to_index:
71
75
  rid = str(rid).replace("-", "")
72
- resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
76
+ resource = await Resource.get(txn, kbid=kbid, rid=rid)
73
77
  if resource is None:
74
78
  logger.warning(f"Could not load resource {rid} for kbid {kbid}")
75
79
  continue
76
80
 
77
81
  index_message = await get_resource_index_message(resource, reindex=False)
78
- await pgcatalog_update(txn, kbid, resource, index_message)
82
+ await catalog_update(txn, kbid, resource, index_message)
79
83
 
80
84
  await txn.commit()
81
85
  continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
@@ -39,7 +39,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
39
39
  async with datamanagers.with_rw_transaction() as txn:
40
40
  vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]
41
41
 
42
- if len(vectorsets) == 0: # pragma: nocover
42
+ if len(vectorsets) == 0: # pragma: no cover
43
43
  # should never happen, everyone should have at least one
44
44
  logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
45
45
  return
@@ -24,7 +24,6 @@ Backfill field status (from error)
24
24
  """
25
25
 
26
26
  import logging
27
- from typing import Optional
28
27
 
29
28
  from nucliadb.migrator.context import ExecutionContext
30
29
  from nucliadb_protos import resources_pb2, writer_pb2
@@ -33,7 +32,7 @@ logger = logging.getLogger(__name__)
33
32
 
34
33
 
35
34
  async def migrate(context: ExecutionContext) -> None:
36
- start: Optional[str] = ""
35
+ start: str | None = ""
37
36
  while True:
38
37
  if start is None:
39
38
  break
@@ -43,7 +42,7 @@ async def migrate(context: ExecutionContext) -> None:
43
42
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
44
43
 
45
44
 
46
- async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
45
+ async def do_batch(context: ExecutionContext, start: str) -> str | None:
47
46
  logger.info(f"Running batch from {start}")
48
47
  async with context.kv_driver.rw_transaction() as txn:
49
48
  async with txn.connection.cursor() as cur: # type: ignore
@@ -64,7 +63,7 @@ async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
64
63
  field_keys = [r[0] for r in records]
65
64
 
66
65
  # Retrieve resources basic (to check status)
67
- resource_keys = set(["/".join(f.split("/")[:5]) for f in field_keys])
66
+ resource_keys = {"/".join(f.split("/")[:5]) for f in field_keys}
68
67
  await cur.execute(
69
68
  """
70
69
  SELECT key, value FROM resources
@@ -26,7 +26,6 @@ is stored in object storage.
26
26
  """
27
27
 
28
28
  import logging
29
- from typing import Optional
30
29
 
31
30
  from nucliadb.migrator.context import ExecutionContext
32
31
 
@@ -34,7 +33,7 @@ logger = logging.getLogger(__name__)
34
33
 
35
34
 
36
35
  async def migrate(context: ExecutionContext) -> None:
37
- start: Optional[str] = ""
36
+ start: str | None = ""
38
37
  while True:
39
38
  if start is None:
40
39
  break
@@ -45,7 +44,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
45
44
  pass
46
45
 
47
46
 
48
- async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
47
+ async def do_batch(context: ExecutionContext, start: str) -> str | None:
49
48
  logger.info(f"Running batch from {start}")
50
49
  async with context.kv_driver.rw_transaction() as txn:
51
50
  async with txn.connection.cursor() as cur: # type: ignore
@@ -27,10 +27,11 @@ Backfill the catalog with labels from fields metadata
27
27
  import logging
28
28
  from typing import cast
29
29
 
30
- from nucliadb.common import datamanagers
30
+ from nucliadb.common.catalog import catalog_update, get_catalog
31
+ from nucliadb.common.catalog.pg import PGCatalog
31
32
  from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
33
  from nucliadb.ingest.orm.index_message import get_resource_index_message
33
- from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
34
+ from nucliadb.ingest.orm.resource import Resource
34
35
  from nucliadb.migrator.context import ExecutionContext
35
36
  from nucliadb_protos import resources_pb2
36
37
 
@@ -44,6 +45,9 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
45
  if not isinstance(context.kv_driver, PGDriver):
45
46
  return
46
47
 
48
+ if not isinstance(get_catalog(), PGCatalog):
49
+ return
50
+
47
51
  BATCH_SIZE = 100
48
52
  async with context.kv_driver.rw_transaction() as txn:
49
53
  txn = cast(PGTransaction, txn)
@@ -78,13 +82,13 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
78
82
  # Index each resource
79
83
  for key in to_index:
80
84
  rid = key.split("/")[4]
81
- resource = await datamanagers.resources.get_resource(txn, kbid=kbid, rid=rid)
85
+ resource = await Resource.get(txn, kbid=kbid, rid=rid)
82
86
  if resource is None:
83
87
  logger.warning(f"Could not load resource {rid} for kbid {kbid}")
84
88
  continue
85
89
 
86
90
  index_message = await get_resource_index_message(resource, reindex=False)
87
- await pgcatalog_update(txn, kbid, resource, index_message)
91
+ await catalog_update(txn, kbid, resource, index_message)
88
92
 
89
93
  if to_index:
90
94
  await txn.commit()
@@ -0,0 +1,106 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #39
22
+
23
+ Backfill splits metadata on conversation fields
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common.maindb.driver import Transaction
31
+ from nucliadb.common.maindb.pg import PGTransaction
32
+ from nucliadb.ingest.fields.conversation import (
33
+ CONVERSATION_SPLITS_METADATA,
34
+ Conversation,
35
+ )
36
+ from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
37
+ from nucliadb.migrator.context import ExecutionContext
38
+ from nucliadb_protos import resources_pb2
39
+ from nucliadb_protos.resources_pb2 import SplitsMetadata
40
+ from nucliadb_utils.storages.storage import Storage
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally empty, it performs no work."""
46
+
47
+
48
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Store a splits-metadata entry for every conversation field of a KB.

    Conversation field keys (``/kbs/{kbid}/r/{rid}/f/c/{field_id}``) are read
    in key order, one page of rows at a time. For every key of a page the
    splits metadata is rebuilt with ``build_splits_metadata`` and written
    under its ``CONVERSATION_SPLITS_METADATA`` key, each field in its own
    transaction. The function returns as soon as a page comes back empty.
    """
    page_size = 100
    query = """
        SELECT key FROM resources
        WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
        AND key > %s
        ORDER BY key
        LIMIT %s"""
    last_key = ""
    while True:
        pending: list[tuple[str, str]] = []
        async with context.kv_driver.rw_transaction() as txn:
            txn = cast(PGTransaction, txn)
            async with txn.connection.cursor() as cur:
                # Retrieve a bunch of conversation fields
                await cur.execute(query, (kbid, last_key, page_size))
                rows = await cur.fetchall()
                if not rows:
                    return
                for row in rows:
                    # Remember where to resume the scan on the next page
                    last_key = row[0]
                    parts = last_key.split("/")
                    # parts[4] is the resource id, parts[7] the field id
                    pending.append((parts[4], parts[7]))

        for rid, field_id in pending:
            async with context.kv_driver.rw_transaction() as write_txn:
                metadata = await build_splits_metadata(
                    write_txn, context.blob_storage, kbid, rid, field_id
                )
                metadata_key = CONVERSATION_SPLITS_METADATA.format(
                    kbid=kbid, uuid=rid, type="c", field=field_id
                )
                await write_txn.set(metadata_key, metadata.SerializeToString())
                await write_txn.commit()
86
+
87
+
88
async def build_splits_metadata(
    txn: Transaction, storage: Storage, kbid: str, rid: str, field_id: str
) -> SplitsMetadata:
    """Build the splits metadata of a single conversation field.

    Every page of the conversation is visited and an entry is created (when
    not already present) for the ident of each message found. An empty
    ``SplitsMetadata`` is returned when the resource cannot be loaded.
    """
    result = SplitsMetadata()
    resource = await KnowledgeBoxORM(txn, storage, kbid).get(rid)
    if resource is None:
        return result

    conversation: Conversation = await resource.get_field(
        field_id, resources_pb2.FieldType.CONVERSATION, load=False
    )
    page_count = (await conversation.get_metadata()).pages
    # Pages are requested by number, starting at 1
    for page_number in range(1, page_count + 1):
        page = await conversation.get_value(page=page_number)
        if page is not None:
            for message in page.messages:
                result.metadata.get_or_create(message.ident)
    return result
@@ -0,0 +1,79 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #40
22
+
23
+ Replaces deprecated and removed generative models from search configurations
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common import datamanagers
31
+ from nucliadb.migrator.context import ExecutionContext
32
+ from nucliadb_models.configuration import SearchConfiguration
33
+ from nucliadb_models.search import AskRequest, FindRequest
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
# Generative model found in a search configuration -> model that replaces it
REPLACEMENTS: dict[str, str] = {
    "claude-3-5-small": "claude-4-5-sonnet",
    "gcp-claude-3-5-sonnet-v2": "gcp-claude-4-5-sonnet",
}
41
+
42
+
43
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally empty, it performs no work."""
44
+
45
+
46
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """Swap replaced generative models in the search configurations of a KB.

    Only the configurations returned by ``get_affected_search_configurations``
    are rewritten, using the ``REPLACEMENTS`` mapping. All of them are saved
    and committed within one transaction. Nothing is written when no
    configuration is affected.
    """
    to_update = await get_affected_search_configurations(kbid)
    if not to_update:
        return

    async with datamanagers.with_rw_transaction() as txn:
        for config_name, search_config in to_update.items():
            old_model = search_config.config.generative_model  # type: ignore
            logger.info(
                "Migrating search config for kb",
                extra={
                    "kbid": kbid,
                    "search_config": config_name,
                    "generative_model": old_model,
                },
            )
            search_config.config.generative_model = REPLACEMENTS[old_model]  # type: ignore
            await datamanagers.search_configurations.set(
                txn, kbid=kbid, name=config_name, config=search_config
            )
        await txn.commit()
64
+
65
+
66
async def get_affected_search_configurations(kbid: str) -> dict[str, SearchConfiguration]:
    """Return the search configurations of a KB whose generative model is replaced.

    Only configurations of kind "find" or "ask" are considered. The result
    maps the configuration name to the configuration, and is empty when no
    configuration uses a model listed in ``REPLACEMENTS``.
    """
    affected: dict[str, SearchConfiguration] = {}
    async with datamanagers.with_ro_transaction() as txn:
        search_configs = await datamanagers.search_configurations.list(txn, kbid=kbid)
        for config_name, search_config in search_configs.items():
            if search_config.kind == "find":
                model = cast(FindRequest, search_config.config).generative_model
            elif search_config.kind == "ask":
                model = cast(AskRequest, search_config.config).generative_model
            else:
                # Other kinds are left untouched
                continue
            if model in REPLACEMENTS:
                affected[config_name] = search_config
    return affected
@@ -0,0 +1,137 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ import logging
21
+ import uuid
22
+ from collections.abc import AsyncIterator
23
+ from typing import cast
24
+
25
+ from nucliadb.common import datamanagers
26
+ from nucliadb.common.maindb.pg import PGTransaction
27
+ from nucliadb.ingest.orm.index_message import get_resource_index_message
28
+ from nucliadb.ingest.orm.resource import Resource
29
+ from nucliadb.migrator.context import ExecutionContext
30
+ from nucliadb_protos.writer_pb2 import ShardObject, Shards
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
async def migrate(context: ExecutionContext) -> None:
    """Global migration step: intentionally empty, it performs no work."""
36
+
37
+
38
async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
    """
    Reindex the resources of the KB that hold conversation fields.

    When the shards of the KB cannot be found, a warning is logged and
    nothing is reindexed.
    """
    kb_shards = await datamanagers.atomic.cluster.get_kb_shards(kbid=kbid, for_update=False)
    if kb_shards is None:
        logger.warning(
            "Migration 41: KB shards not found, skipping reindexing",
            extra={"kbid": kbid},
        )
        return

    async for rid in iter_affected_resource_ids(context, kbid):
        await reindex_resource(context, kbid, rid, kb_shards)
51
+
52
+
53
async def reindex_resource(
    context: ExecutionContext,
    kbid: str,
    rid: str,
    kb_shards: Shards,
) -> None:
    """
    Rebuild the index message of one resource and send it to its shard.

    The resource is skipped, with a warning, when it cannot be loaded or when
    its shard id is unknown or not present in ``kb_shards``.
    """
    async with datamanagers.with_ro_transaction() as rs_txn:
        # Fetch the resource
        resource = await Resource.get(rs_txn, kbid=kbid, rid=rid)
        if resource is None:
            logger.warning(
                "Migration 41: Resource not found, skipping reindexing",
                extra={"kbid": kbid, "rid": rid},
            )
            return

        # Resolve the shard object the resource lives in
        shard_id = await datamanagers.resources.get_resource_shard_id(
            rs_txn, kbid=kbid, rid=rid, for_update=False
        )
        shard: ShardObject | None = None
        if shard_id is not None:
            for candidate in kb_shards.shards:
                if candidate.shard == shard_id:
                    shard = candidate
                    break
        if shard is None:
            logger.warning(
                "Migration 41: Shard not found for resource, skipping reindexing",
                extra={"kbid": kbid, "rid": rid, "shard_id": shard_id},
            )
            return

        # Create the index message and reindex the resource
        index_message = await get_resource_index_message(resource, reindex=True)
        await context.shard_manager.add_resource(
            shard,
            index_message,
            0,
            partition="0",
            kb=kbid,
            reindex_id=uuid.uuid4().hex,
        )
        logger.info(
            "Migration 41: Resource reindexed",
            extra={"kbid": kbid, "rid": rid},
        )
100
+
101
+
102
async def iter_affected_resource_ids(context: ExecutionContext, kbid: str) -> AsyncIterator[str]:
    """
    Yield the ids of the resources of the KB that hold conversation fields.

    Keys are fetched batch by batch through ``get_batch``, resuming after the
    last key of the previous batch, until no batch is returned.

    Consecutive keys that belong to the same resource are collapsed into a
    single id: a resource with several conversation fields would otherwise be
    yielded once per field, making the caller repeat the same per-resource
    work for each of them.
    """
    start = ""
    previous_rid: str | None = None
    while True:
        keys_batch = await get_batch(context, kbid, start)
        if keys_batch is None:
            break
        start = keys_batch[-1]
        for key in keys_batch:
            # The keys have the format /kbs/{kbid}/r/{rid}/f/c/{field_id}
            rid = key.split("/")[4]
            if rid == previous_rid:
                # Same resource as the previous key (also across batches)
                continue
            previous_rid = rid
            yield rid
113
+
114
+
115
async def get_batch(context: ExecutionContext, kbid: str, start: str) -> list[str] | None:
    """
    Fetch the next conversation field keys of the given KB.

    At most 100 keys that compare greater than ``start`` are returned, in key
    order. None is returned when there are no such keys.
    """
    limit = 100
    query = """
        SELECT key FROM resources
        WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
        AND key > %s
        ORDER BY key
        LIMIT %s"""
    async with context.kv_driver.rw_transaction() as txn:
        txn = cast(PGTransaction, txn)
        async with txn.connection.cursor() as cur:
            await cur.execute(query, (kbid, start, limit))
            keys = [row[0] for row in await cur.fetchall()]
            # An empty page means the scan is over
            return keys or None
@@ -0,0 +1,34 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Create a partial index on ``resources`` for the resource shard keys.

    A concurrent index must be created outside of a transaction, but psycopg
    opens transactions automatically. The current transaction is therefore
    committed and the connection put in autocommit mode while the index is
    built. Autocommit is turned back off afterwards, even if the build fails.
    """
    conn = txn.connection
    await conn.commit()
    try:
        await conn.set_autocommit(True)
        await conn.execute(
            "CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
        )
    finally:
        await conn.set_autocommit(False)
@@ -18,11 +18,10 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
 
21
- from typing import Optional
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
22
 
23
- from nucliadb.common import datamanagers
24
23
 
25
-
26
- async def get_resource_uuid_by_slug(kbid: str, slug: str) -> Optional[str]:
27
- async with datamanagers.with_ro_transaction() as txn:
28
- return await datamanagers.resources.get_resource_uuid_from_slug(txn, kbid=kbid, slug=slug)
24
async def migrate(txn: PGTransaction) -> None:
    """Create the ``catalog_kbid_labels`` statistics object and analyze ``catalog``.

    Both statements are issued on a single cursor of the transaction's
    connection, in the order listed below.
    """
    statements = (
        "CREATE STATISTICS catalog_kbid_labels ON kbid, labels FROM catalog;",
        "ANALYZE catalog;",
    )
    async with txn.connection.cursor() as cursor:
        # Order matters: ANALYZE runs only after the statistics object exists.
        for statement in statements:
            await cursor.execute(statement)
@@ -0,0 +1,26 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
async def migrate(txn: PGTransaction) -> None:
    """Drop the ``catalog_kbid_labels`` statistics object."""
    drop_statistics = "DROP STATISTICS catalog_kbid_labels;"
    async with txn.connection.cursor() as cursor:
        await cursor.execute(drop_statistics)
@@ -21,8 +21,8 @@ import asyncio
21
21
  import json
22
22
  import logging
23
23
  import tarfile
24
+ from collections.abc import AsyncIterator
24
25
  from datetime import datetime, timezone
25
- from typing import AsyncIterator, Optional
26
26
 
27
27
  from nucliadb.backups.const import (
28
28
  BackupFinishedStream,
@@ -37,7 +37,6 @@ from nucliadb.export_import.utils import (
37
37
  download_binary,
38
38
  get_broker_message,
39
39
  get_cloud_files,
40
- get_entities,
41
40
  get_labels,
42
41
  get_search_configurations,
43
42
  get_synonyms,
@@ -76,7 +75,6 @@ async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
76
75
  """
77
76
  await backup_resources(context, kbid, backup_id)
78
77
  await backup_labels(context, kbid, backup_id)
79
- await backup_entities(context, kbid, backup_id)
80
78
  await backup_synonyms(context, kbid, backup_id)
81
79
  await backup_search_configurations(context, kbid, backup_id)
82
80
  await notify_backup_completed(context, kbid, backup_id)
@@ -235,15 +233,6 @@ async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
235
233
  )
236
234
 
237
235
 
238
- async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str):
239
- entities = await get_entities(context, kbid)
240
- await context.blob_storage.upload_object(
241
- bucket=settings.backups_bucket,
242
- key=StorageKeys.ENTITIES.format(backup_id=backup_id),
243
- data=entities.SerializeToString(),
244
- )
245
-
246
-
247
236
  async def backup_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
248
237
  synonyms = await get_synonyms(context, kbid)
249
238
  await context.blob_storage.upload_object(
@@ -266,9 +255,7 @@ async def backup_search_configurations(context: ApplicationContext, kbid: str, b
266
255
  )
267
256
 
268
257
 
269
- async def get_metadata(
270
- context: ApplicationContext, kbid: str, backup_id: str
271
- ) -> Optional[BackupMetadata]:
258
+ async def get_metadata(context: ApplicationContext, kbid: str, backup_id: str) -> BackupMetadata | None:
272
259
  async with context.kv_driver.ro_transaction() as txn:
273
260
  metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
274
261
  if metadata_raw is None:
@@ -24,7 +24,8 @@ import functools
24
24
  import json
25
25
  import logging
26
26
  import tarfile
27
- from typing import Any, AsyncIterator, Callable, Optional, Union
27
+ from collections.abc import AsyncIterator, Callable
28
+ from typing import Any
28
29
 
29
30
  from pydantic import TypeAdapter
30
31
 
@@ -35,7 +36,6 @@ from nucliadb.common.context import ApplicationContext
35
36
  from nucliadb.export_import.utils import (
36
37
  import_binary,
37
38
  restore_broker_message,
38
- set_entities_groups,
39
39
  set_labels,
40
40
  set_search_configurations,
41
41
  set_synonyms,
@@ -74,7 +74,6 @@ async def restore_kb(context: ApplicationContext, kbid: str, backup_id: str):
74
74
  """
75
75
  await restore_resources(context, kbid, backup_id)
76
76
  await restore_labels(context, kbid, backup_id)
77
- await restore_entities(context, kbid, backup_id)
78
77
  await restore_synonyms(context, kbid, backup_id)
79
78
  await restore_search_configurations(context, kbid, backup_id)
80
79
  await delete_last_restored(context, kbid, backup_id)
@@ -101,7 +100,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s
101
100
  await set_last_restored(context, kbid, backup_id, key)
102
101
 
103
102
 
104
- async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> Optional[str]:
103
+ async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> str | None:
105
104
  key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
106
105
  async with context.kv_driver.ro_transaction() as txn:
107
106
  raw = await txn.get(key)
@@ -191,7 +190,7 @@ class ResourceBackupReader:
191
190
  data = await self.read(tarinfo_size + padding_bytes)
192
191
  return data[:tarinfo_size]
193
192
 
194
- async def read_item(self) -> Union[BrokerMessage, CloudFile, CloudFileBinary]:
193
+ async def read_item(self) -> BrokerMessage | CloudFile | CloudFileBinary:
195
194
  tarinfo = await self.read_tarinfo()
196
195
  if tarinfo.name.startswith("broker-message"):
197
196
  raw_bm = await self.read_data(tarinfo)
@@ -257,16 +256,6 @@ async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str)
257
256
  await set_labels(context, kbid, labels)
258
257
 
259
258
 
260
- async def restore_entities(context: ApplicationContext, kbid: str, backup_id: str):
261
- raw = await context.blob_storage.downloadbytes(
262
- bucket=settings.backups_bucket,
263
- key=StorageKeys.ENTITIES.format(backup_id=backup_id),
264
- )
265
- entities = kb_pb2.EntitiesGroups()
266
- entities.ParseFromString(raw.getvalue())
267
- await set_entities_groups(context, kbid, entities)
268
-
269
-
270
259
  async def restore_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
271
260
  raw = await context.blob_storage.downloadbytes(
272
261
  bucket=settings.backups_bucket,