nucliadb 6.7.2.post4862__py3-none-any.whl → 6.9.2.post5282__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Potentially problematic release.

This version of nucliadb has been flagged as potentially problematic. (The original page links to more details here.)

Files changed (126)
  1. migrations/0016_upgrade_to_paragraphs_v2.py +1 -1
  2. migrations/0017_multiple_writable_shards.py +1 -1
  3. migrations/0018_purge_orphan_kbslugs.py +1 -1
  4. migrations/0019_upgrade_to_paragraphs_v3.py +1 -1
  5. migrations/0021_overwrite_vectorsets_key.py +1 -1
  6. migrations/0023_backfill_pg_catalog.py +7 -3
  7. migrations/0025_assign_models_to_kbs_v2.py +3 -3
  8. migrations/0027_rollover_texts3.py +1 -1
  9. migrations/0028_extracted_vectors_reference.py +1 -1
  10. migrations/0029_backfill_field_status.py +1 -1
  11. migrations/0032_remove_old_relations.py +1 -1
  12. migrations/0036_backfill_catalog_slug.py +1 -1
  13. migrations/0037_backfill_catalog_facets.py +1 -1
  14. migrations/0038_backfill_catalog_field_labels.py +7 -3
  15. migrations/0039_backfill_converation_splits_metadata.py +106 -0
  16. migrations/0040_migrate_search_configurations.py +79 -0
  17. migrations/pg/0010_shards_index.py +34 -0
  18. nucliadb/backups/create.py +3 -3
  19. nucliadb/backups/restore.py +3 -3
  20. nucliadb/common/cache.py +1 -1
  21. nucliadb/common/catalog/__init__.py +79 -0
  22. nucliadb/common/catalog/dummy.py +36 -0
  23. nucliadb/common/catalog/interface.py +85 -0
  24. nucliadb/{search/search/pgcatalog.py → common/catalog/pg.py} +294 -208
  25. nucliadb/common/catalog/utils.py +56 -0
  26. nucliadb/common/cluster/manager.py +3 -19
  27. nucliadb/common/cluster/rebalance.py +484 -110
  28. nucliadb/common/cluster/rollover.py +29 -0
  29. nucliadb/common/cluster/settings.py +1 -1
  30. nucliadb/common/cluster/utils.py +26 -0
  31. nucliadb/common/datamanagers/atomic.py +6 -0
  32. nucliadb/common/datamanagers/utils.py +2 -2
  33. nucliadb/common/external_index_providers/manager.py +1 -29
  34. nucliadb/common/external_index_providers/settings.py +1 -27
  35. nucliadb/common/filter_expression.py +16 -33
  36. nucliadb/common/http_clients/exceptions.py +8 -0
  37. nucliadb/common/http_clients/processing.py +4 -0
  38. nucliadb/common/http_clients/utils.py +3 -0
  39. nucliadb/common/ids.py +77 -55
  40. nucliadb/common/locking.py +4 -4
  41. nucliadb/common/maindb/driver.py +11 -1
  42. nucliadb/common/maindb/local.py +1 -1
  43. nucliadb/common/maindb/pg.py +1 -1
  44. nucliadb/common/nidx.py +19 -1
  45. nucliadb/common/vector_index_config.py +1 -1
  46. nucliadb/export_import/datamanager.py +3 -3
  47. nucliadb/ingest/consumer/pull.py +7 -0
  48. nucliadb/ingest/consumer/service.py +2 -27
  49. nucliadb/ingest/consumer/shard_creator.py +17 -6
  50. nucliadb/ingest/fields/base.py +9 -17
  51. nucliadb/ingest/fields/conversation.py +47 -1
  52. nucliadb/ingest/orm/brain_v2.py +21 -3
  53. nucliadb/ingest/orm/index_message.py +126 -111
  54. nucliadb/ingest/orm/knowledgebox.py +84 -43
  55. nucliadb/ingest/orm/processor/auditing.py +1 -1
  56. nucliadb/ingest/orm/processor/processor.py +95 -149
  57. nucliadb/ingest/orm/processor/sequence_manager.py +1 -1
  58. nucliadb/ingest/orm/resource.py +10 -1
  59. nucliadb/ingest/partitions.py +12 -1
  60. nucliadb/ingest/serialize.py +2 -2
  61. nucliadb/ingest/service/writer.py +26 -19
  62. nucliadb/ingest/settings.py +33 -11
  63. nucliadb/learning_proxy.py +12 -15
  64. nucliadb/metrics_exporter.py +17 -4
  65. nucliadb/migrator/datamanager.py +11 -17
  66. nucliadb/migrator/migrator.py +2 -2
  67. nucliadb/purge/__init__.py +12 -17
  68. nucliadb/purge/orphan_shards.py +2 -2
  69. nucliadb/reader/api/v1/knowledgebox.py +40 -12
  70. nucliadb/reader/api/v1/learning_config.py +30 -10
  71. nucliadb/reader/api/v1/resource.py +2 -2
  72. nucliadb/reader/api/v1/services.py +1 -1
  73. nucliadb/reader/reader/notifications.py +1 -1
  74. nucliadb/search/api/v1/__init__.py +1 -0
  75. nucliadb/search/api/v1/catalog.py +4 -4
  76. nucliadb/search/api/v1/find.py +1 -4
  77. nucliadb/search/api/v1/hydrate.py +328 -0
  78. nucliadb/search/api/v1/resource/ask.py +21 -1
  79. nucliadb/search/api/v1/search.py +1 -4
  80. nucliadb/search/predict.py +9 -2
  81. nucliadb/search/search/cache.py +1 -20
  82. nucliadb/search/search/chat/ask.py +50 -8
  83. nucliadb/search/search/chat/prompt.py +47 -15
  84. nucliadb/search/search/chat/query.py +8 -1
  85. nucliadb/search/search/fetch.py +1 -1
  86. nucliadb/search/search/find.py +1 -6
  87. nucliadb/search/search/{hydrator.py → hydrator/__init__.py} +5 -4
  88. nucliadb/search/search/hydrator/fields.py +175 -0
  89. nucliadb/search/search/hydrator/images.py +130 -0
  90. nucliadb/search/search/hydrator/paragraphs.py +307 -0
  91. nucliadb/search/search/hydrator/resources.py +56 -0
  92. nucliadb/search/search/metrics.py +16 -0
  93. nucliadb/search/search/predict_proxy.py +33 -11
  94. nucliadb/search/search/query.py +0 -23
  95. nucliadb/search/search/query_parser/fetcher.py +5 -5
  96. nucliadb/search/search/query_parser/models.py +1 -30
  97. nucliadb/search/search/query_parser/parsers/ask.py +1 -1
  98. nucliadb/search/search/query_parser/parsers/catalog.py +4 -7
  99. nucliadb/search/search/query_parser/parsers/common.py +16 -7
  100. nucliadb/search/search/query_parser/parsers/find.py +0 -11
  101. nucliadb/search/search/query_parser/parsers/graph.py +5 -5
  102. nucliadb/search/search/query_parser/parsers/search.py +0 -11
  103. nucliadb/search/search/query_parser/parsers/unit_retrieval.py +4 -11
  104. nucliadb/search/search/rerankers.py +1 -1
  105. nucliadb/search/search/summarize.py +1 -1
  106. nucliadb/standalone/run.py +3 -0
  107. nucliadb/tasks/retries.py +4 -4
  108. nucliadb/train/generators/sentence_classifier.py +2 -8
  109. nucliadb/train/generators/utils.py +1 -1
  110. nucliadb/train/nodes.py +4 -4
  111. nucliadb/train/servicer.py +1 -1
  112. nucliadb/train/uploader.py +1 -1
  113. nucliadb/writer/api/v1/field.py +14 -9
  114. nucliadb/writer/api/v1/knowledgebox.py +15 -52
  115. nucliadb/writer/api/v1/learning_config.py +5 -4
  116. nucliadb/writer/api/v1/resource.py +2 -2
  117. nucliadb/writer/resource/field.py +38 -2
  118. nucliadb/writer/tus/azure.py +4 -4
  119. nucliadb/writer/tus/gcs.py +11 -17
  120. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/METADATA +9 -10
  121. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/RECORD +124 -114
  122. nucliadb/common/external_index_providers/pinecone.py +0 -894
  123. nucliadb/ingest/orm/processor/pgcatalog.py +0 -129
  124. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/WHEEL +0 -0
  125. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/entry_points.txt +0 -0
  126. {nucliadb-6.7.2.post4862.dist-info → nucliadb-6.9.2.post5282.dist-info}/top_level.txt +0 -0
@@ -55,7 +55,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
55
55
 
56
56
 
57
57
  # async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
58
- # async with context.kv_driver.transaction(read_only=True) as txn:
58
+ # async with context.kv_driver.ro_transaction() as txn:
59
59
  # shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=False)
60
60
  # if not shards_object:
61
61
  # raise ShardsObjectNotFound()
@@ -44,7 +44,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
44
 
45
45
  # No longer relevant with nidx
46
46
 
47
- # async with context.kv_driver.transaction() as txn:
47
+ # async with context.kv_driver.rw_transaction() as txn:
48
48
  # shards = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid, for_update=True)
49
49
  # if shards is None:
50
50
  # logger.error("KB without shards", extra={"kbid": kbid})
@@ -35,7 +35,7 @@ logger = logging.getLogger(__name__)
35
35
 
36
36
 
37
37
  async def migrate(context: ExecutionContext) -> None:
38
- async with context.kv_driver.transaction() as txn:
38
+ async with context.kv_driver.rw_transaction() as txn:
39
39
  async for key in txn.keys(KB_SLUGS_BASE):
40
40
  slug = key.replace(KB_SLUGS_BASE, "")
41
41
  value = await txn.get(key, for_update=False)
@@ -55,7 +55,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
55
55
 
56
56
 
57
57
  # async def has_old_paragraphs_index(context: ExecutionContext, kbid: str) -> bool:
58
- # async with context.kv_driver.transaction(read_only=True) as txn:
58
+ # async with context.kv_driver.ro_transaction() as txn:
59
59
  # shards_object = await datamanagers.cluster.get_kb_shards(txn, kbid=kbid)
60
60
  # if not shards_object:
61
61
  # raise ShardsObjectNotFound()
@@ -38,7 +38,7 @@ async def migrate(context: ExecutionContext) -> None: ...
38
38
 
39
39
 
40
40
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
41
- async with context.kv_driver.transaction() as txn:
41
+ async with context.kv_driver.rw_transaction() as txn:
42
42
  logger.info(f"Overwriting vectorsets key", extra={"kbid": kbid})
43
43
  await datamanagers.vectorsets.initialize(txn, kbid=kbid)
44
44
  await txn.commit()
@@ -28,9 +28,10 @@ import logging
28
28
  from typing import cast
29
29
 
30
30
  from nucliadb.common import datamanagers
31
+ from nucliadb.common.catalog import catalog_update, get_catalog
32
+ from nucliadb.common.catalog.pg import PGCatalog
31
33
  from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
34
  from nucliadb.ingest.orm.index_message import get_resource_index_message
33
- from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
34
35
  from nucliadb.migrator.context import ExecutionContext
35
36
 
36
37
  logger = logging.getLogger(__name__)
@@ -43,8 +44,11 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
43
44
  if not isinstance(context.kv_driver, PGDriver):
44
45
  return
45
46
 
47
+ if not isinstance(get_catalog(), PGCatalog):
48
+ return
49
+
46
50
  BATCH_SIZE = 100
47
- async with context.kv_driver.transaction() as txn:
51
+ async with context.kv_driver.rw_transaction() as txn:
48
52
  txn = cast(PGTransaction, txn)
49
53
  continue_sql = ""
50
54
  while True:
@@ -75,7 +79,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
75
79
  continue
76
80
 
77
81
  index_message = await get_resource_index_message(resource, reindex=False)
78
- await pgcatalog_update(txn, kbid, resource, index_message)
82
+ await catalog_update(txn, kbid, resource, index_message)
79
83
 
80
84
  await txn.commit()
81
85
  continue_sql = f"AND key > '/kbs/{kbid}/r/{rid}'"
@@ -47,7 +47,7 @@ async def migrate(context: ExecutionContext) -> None: ...
47
47
 
48
48
 
49
49
  async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
50
- async with context.kv_driver.transaction(read_only=True) as txn:
50
+ async with context.kv_driver.ro_transaction() as txn:
51
51
  vectorsets_count = len([vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)])
52
52
  if vectorsets_count > 0:
53
53
  logger.info("Skipping KB with vectorsets already populated", extra={"kbid": kbid})
@@ -65,7 +65,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
65
65
  learning_matryoshka_dimensions = learning_model_metadata.matryoshka_dimensions
66
66
  learning_normalize_vectors = len(learning_matryoshka_dimensions) > 0
67
67
 
68
- async with context.kv_driver.transaction(read_only=True) as txn:
68
+ async with context.kv_driver.ro_transaction() as txn:
69
69
  semantic_model = await datamanagers.kb.get_model_metadata(txn, kbid=kbid)
70
70
 
71
71
  maindb_similarity = semantic_model.similarity_function
@@ -103,7 +103,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
103
103
  matryoshka_dimensions=maindb_matryoshka_dimensions,
104
104
  )
105
105
 
106
- async with context.kv_driver.transaction() as txn:
106
+ async with context.kv_driver.rw_transaction() as txn:
107
107
  # Populate KB vectorsets with data from learning. We are skipping KBs
108
108
  # with this key already set, so we can set here safely
109
109
  await datamanagers.vectorsets.set(txn, kbid=kbid, config=default_vectorset)
@@ -49,7 +49,7 @@ async def maybe_fix_vector_dimensions(context: ExecutionContext, kbid: str) -> N
49
49
  logger.warning(f"KB has no learning config", extra={"kbid": kbid})
50
50
  return
51
51
 
52
- async with context.kv_driver.transaction() as txn:
52
+ async with context.kv_driver.rw_transaction() as txn:
53
53
  vectorsets = [vs async for vs in datamanagers.vectorsets.iter(txn, kbid=kbid)]
54
54
  if len(vectorsets) != 1:
55
55
  # If multiple vectorsets, they are new shards created correctly, we can safely skip it
@@ -39,7 +39,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
39
39
  async with datamanagers.with_rw_transaction() as txn:
40
40
  vectorsets = [vs async for (_vid, vs) in datamanagers.vectorsets.iter(txn, kbid=kbid)]
41
41
 
42
- if len(vectorsets) == 0: # pragma: nocover
42
+ if len(vectorsets) == 0: # pragma: no cover
43
43
  # should never happen, everyone should have at least one
44
44
  logger.warning(f"KB has no vectorsets!", extra={"kbid": kbid})
45
45
  return
@@ -45,7 +45,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None: ...
45
45
 
46
46
  async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
47
47
  logger.info(f"Running batch from {start}")
48
- async with context.kv_driver.transaction(read_only=False) as txn:
48
+ async with context.kv_driver.rw_transaction() as txn:
49
49
  async with txn.connection.cursor() as cur: # type: ignore
50
50
  # Retrieve a batch of fields
51
51
  await cur.execute(
@@ -47,7 +47,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
47
47
 
48
48
  async def do_batch(context: ExecutionContext, start: str) -> Optional[str]:
49
49
  logger.info(f"Running batch from {start}")
50
- async with context.kv_driver.transaction(read_only=False) as txn:
50
+ async with context.kv_driver.rw_transaction() as txn:
51
51
  async with txn.connection.cursor() as cur: # type: ignore
52
52
  # Retrieve a batch of fields
53
53
  await cur.execute(
@@ -37,7 +37,7 @@ async def migrate(context: ExecutionContext) -> None:
37
37
  driver = cast(PGDriver, context.kv_driver)
38
38
 
39
39
  BATCH_SIZE = 10_000
40
- async with driver.transaction() as txn:
40
+ async with driver.rw_transaction() as txn:
41
41
  txn = cast(PGTransaction, txn)
42
42
  start_key = ""
43
43
  while True:
@@ -37,7 +37,7 @@ async def migrate(context: ExecutionContext) -> None:
37
37
  driver = cast(PGDriver, context.kv_driver)
38
38
 
39
39
  BATCH_SIZE = 1_000
40
- async with driver.transaction() as txn:
40
+ async with driver.rw_transaction() as txn:
41
41
  txn = cast(PGTransaction, txn)
42
42
  start_kbid = "00000000000000000000000000000000"
43
43
  start_rid = "00000000000000000000000000000000"
@@ -28,9 +28,10 @@ import logging
28
28
  from typing import cast
29
29
 
30
30
  from nucliadb.common import datamanagers
31
+ from nucliadb.common.catalog import catalog_update, get_catalog
32
+ from nucliadb.common.catalog.pg import PGCatalog
31
33
  from nucliadb.common.maindb.pg import PGDriver, PGTransaction
32
34
  from nucliadb.ingest.orm.index_message import get_resource_index_message
33
- from nucliadb.ingest.orm.processor.pgcatalog import pgcatalog_update
34
35
  from nucliadb.migrator.context import ExecutionContext
35
36
  from nucliadb_protos import resources_pb2
36
37
 
@@ -44,8 +45,11 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
44
45
  if not isinstance(context.kv_driver, PGDriver):
45
46
  return
46
47
 
48
+ if not isinstance(get_catalog(), PGCatalog):
49
+ return
50
+
47
51
  BATCH_SIZE = 100
48
- async with context.kv_driver.transaction() as txn:
52
+ async with context.kv_driver.rw_transaction() as txn:
49
53
  txn = cast(PGTransaction, txn)
50
54
  start = ""
51
55
  while True:
@@ -84,7 +88,7 @@ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
84
88
  continue
85
89
 
86
90
  index_message = await get_resource_index_message(resource, reindex=False)
87
- await pgcatalog_update(txn, kbid, resource, index_message)
91
+ await catalog_update(txn, kbid, resource, index_message)
88
92
 
89
93
  if to_index:
90
94
  await txn.commit()
@@ -0,0 +1,106 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #39
22
+
23
+ Backfill splits metadata on conversation fields
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common.maindb.driver import Transaction
31
+ from nucliadb.common.maindb.pg import PGTransaction
32
+ from nucliadb.ingest.fields.conversation import (
33
+ CONVERSATION_SPLITS_METADATA,
34
+ Conversation,
35
+ )
36
+ from nucliadb.ingest.orm.knowledgebox import KnowledgeBox as KnowledgeBoxORM
37
+ from nucliadb.migrator.context import ExecutionContext
38
+ from nucliadb_protos import resources_pb2
39
+ from nucliadb_protos.resources_pb2 import SplitMetadata, SplitsMetadata
40
+ from nucliadb_utils.storages.storage import Storage
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
+ async def migrate(context: ExecutionContext) -> None: ...
46
+
47
+
48
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
49
+ BATCH_SIZE = 100
50
+ start = ""
51
+ while True:
52
+ to_fix: list[tuple[str, str]] = []
53
+ async with context.kv_driver.rw_transaction() as txn:
54
+ txn = cast(PGTransaction, txn)
55
+ async with txn.connection.cursor() as cur:
56
+ # Retrieve a bunch of conversation fields
57
+ await cur.execute(
58
+ """
59
+ SELECT key FROM resources
60
+ WHERE key ~ ('^/kbs/' || %s || '/r/[^/]*/f/c/[^/]*$')
61
+ AND key > %s
62
+ ORDER BY key
63
+ LIMIT %s""",
64
+ (kbid, start, BATCH_SIZE),
65
+ )
66
+ rows = await cur.fetchall()
67
+ if len(rows) == 0:
68
+ return
69
+ for row in rows:
70
+ key = row[0]
71
+ start = key
72
+ rid = key.split("/")[4]
73
+ field_id = key.split("/")[7]
74
+ to_fix.append((rid, field_id))
75
+
76
+ for rid, field_id in to_fix:
77
+ async with context.kv_driver.rw_transaction() as txn2:
78
+ splits_metadata = await build_splits_metadata(
79
+ txn2, context.blob_storage, kbid, rid, field_id
80
+ )
81
+ splits_metadata_key = CONVERSATION_SPLITS_METADATA.format(
82
+ kbid=kbid, uuid=rid, type="c", field=field_id
83
+ )
84
+ await txn2.set(splits_metadata_key, splits_metadata.SerializeToString())
85
+ await txn2.commit()
86
+
87
+
88
+ async def build_splits_metadata(
89
+ txn: Transaction, storage: Storage, kbid: str, rid: str, field_id: str
90
+ ) -> SplitsMetadata:
91
+ splits_metadata = SplitsMetadata()
92
+ kb_orm = KnowledgeBoxORM(txn, storage, kbid)
93
+ resource_obj = await kb_orm.get(rid)
94
+ if resource_obj is None:
95
+ return splits_metadata
96
+ field_obj: Conversation = await resource_obj.get_field(
97
+ field_id, resources_pb2.FieldType.CONVERSATION, load=False
98
+ )
99
+ conv_metadata = await field_obj.get_metadata()
100
+ for i in range(1, conv_metadata.pages + 1):
101
+ page = await field_obj.get_value(page=i)
102
+ if page is None:
103
+ continue
104
+ for message in page.messages:
105
+ splits_metadata.metadata.setdefault(message.ident, SplitMetadata())
106
+ return splits_metadata
@@ -0,0 +1,79 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ """Migration #40
22
+
23
+ Replaces deprecated and removed generative models from search configurations
24
+
25
+ """
26
+
27
+ import logging
28
+ from typing import cast
29
+
30
+ from nucliadb.common import datamanagers
31
+ from nucliadb.migrator.context import ExecutionContext
32
+ from nucliadb_models.configuration import SearchConfiguration
33
+ from nucliadb_models.search import AskRequest, FindRequest
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+ REPLACEMENTS = {
38
+ "claude-3-5-small": "claude-4-5-sonnet",
39
+ "gcp-claude-3-5-sonnet-v2": "gcp-claude-4-5-sonnet",
40
+ }
41
+
42
+
43
+ async def migrate(context: ExecutionContext) -> None: ...
44
+
45
+
46
+ async def migrate_kb(context: ExecutionContext, kbid: str) -> None:
47
+ affected = await get_affected_search_configurations(kbid)
48
+ if not affected:
49
+ return
50
+
51
+ async with datamanagers.with_rw_transaction() as txn:
52
+ for name, config in affected.items():
53
+ logger.info(
54
+ "Migrating search config for kb",
55
+ extra={
56
+ "kbid": kbid,
57
+ "search_config": name,
58
+ "generative_model": config.config.generative_model, # type: ignore
59
+ },
60
+ )
61
+ config.config.generative_model = REPLACEMENTS[config.config.generative_model] # type: ignore
62
+ await datamanagers.search_configurations.set(txn, kbid=kbid, name=name, config=config)
63
+ await txn.commit()
64
+
65
+
66
+ async def get_affected_search_configurations(kbid: str) -> dict[str, SearchConfiguration]:
67
+ result: dict[str, SearchConfiguration] = {}
68
+ async with datamanagers.with_ro_transaction() as txn:
69
+ search_configs = await datamanagers.search_configurations.list(txn, kbid=kbid)
70
+ for name, config in search_configs.items():
71
+ if config.kind == "find":
72
+ find_config = cast(FindRequest, config.config)
73
+ if find_config.generative_model in REPLACEMENTS:
74
+ result[name] = config
75
+ elif config.kind == "ask":
76
+ ask_config = cast(AskRequest, config.config)
77
+ if ask_config.generative_model in REPLACEMENTS:
78
+ result[name] = config
79
+ return result
@@ -0,0 +1,34 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+
21
+ from nucliadb.common.maindb.pg import PGTransaction
22
+
23
+
24
+ async def migrate(txn: PGTransaction) -> None:
25
+ # Concurrent index must be created outside of a transaction but psycopg automatically
26
+ # creates transactions. We temporarily disable this for building indexes.
27
+ await txn.connection.commit()
28
+ try:
29
+ await txn.connection.set_autocommit(True)
30
+ await txn.connection.execute(
31
+ "CREATE INDEX CONCURRENTLY ON resources (key, value) WHERE key ~ '/kbs/[^/]*/r/[^/]*/shard$';"
32
+ )
33
+ finally:
34
+ await txn.connection.set_autocommit(False)
@@ -269,7 +269,7 @@ async def backup_search_configurations(context: ApplicationContext, kbid: str, b
269
269
  async def get_metadata(
270
270
  context: ApplicationContext, kbid: str, backup_id: str
271
271
  ) -> Optional[BackupMetadata]:
272
- async with context.kv_driver.transaction(read_only=True) as txn:
272
+ async with context.kv_driver.ro_transaction() as txn:
273
273
  metadata_raw = await txn.get(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
274
274
  if metadata_raw is None:
275
275
  return None
@@ -277,7 +277,7 @@ async def get_metadata(
277
277
 
278
278
 
279
279
  async def set_metadata(context: ApplicationContext, kbid: str, backup_id: str, metadata: BackupMetadata):
280
- async with context.kv_driver.transaction() as txn:
280
+ async with context.kv_driver.rw_transaction() as txn:
281
281
  await txn.set(
282
282
  MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id),
283
283
  metadata.model_dump_json().encode(),
@@ -286,7 +286,7 @@ async def set_metadata(context: ApplicationContext, kbid: str, backup_id: str, m
286
286
 
287
287
 
288
288
  async def delete_metadata(context: ApplicationContext, kbid: str, backup_id: str):
289
- async with context.kv_driver.transaction() as txn:
289
+ async with context.kv_driver.rw_transaction() as txn:
290
290
  await txn.delete(MaindbKeys.METADATA.format(kbid=kbid, backup_id=backup_id))
291
291
  await txn.commit()
292
292
 
@@ -103,7 +103,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s
103
103
 
104
104
  async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: str) -> Optional[str]:
105
105
  key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
106
- async with context.kv_driver.transaction(read_only=True) as txn:
106
+ async with context.kv_driver.ro_transaction() as txn:
107
107
  raw = await txn.get(key)
108
108
  if raw is None:
109
109
  return None
@@ -112,14 +112,14 @@ async def get_last_restored(context: ApplicationContext, kbid: str, backup_id: s
112
112
 
113
113
  async def set_last_restored(context: ApplicationContext, kbid: str, backup_id: str, resource_id: str):
114
114
  key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
115
- async with context.kv_driver.transaction() as txn:
115
+ async with context.kv_driver.rw_transaction() as txn:
116
116
  await txn.set(key, resource_id.encode())
117
117
  await txn.commit()
118
118
 
119
119
 
120
120
  async def delete_last_restored(context: ApplicationContext, kbid: str, backup_id: str):
121
121
  key = MaindbKeys.LAST_RESTORED.format(kbid=kbid, backup_id=backup_id)
122
- async with context.kv_driver.transaction() as txn:
122
+ async with context.kv_driver.rw_transaction() as txn:
123
123
  await txn.delete(key)
124
124
  await txn.commit()
125
125
 
nucliadb/common/cache.py CHANGED
@@ -90,7 +90,7 @@ class ResourceCache(Cache[[str, str], ResourceORM]):
90
90
  @alru_cache(maxsize=cache_size)
91
91
  async def _get_resource(kbid: str, rid: str) -> Optional[ResourceORM]:
92
92
  storage = await get_storage()
93
- async with get_driver().transaction(read_only=True) as txn:
93
+ async with get_driver().ro_transaction() as txn:
94
94
  kb = KnowledgeBoxORM(txn, storage, kbid)
95
95
  return await kb.get(rid)
96
96
 
@@ -0,0 +1,79 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ # Copyright (C) 2021 Bosutech XXI S.L.
21
+ #
22
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
23
+ # For commercial licensing, contact us at info@nuclia.com.
24
+ #
25
+ # AGPL:
26
+ # This program is free software: you can redistribute it and/or modify
27
+ # it under the terms of the GNU Affero General Public License as
28
+ # published by the Free Software Foundation, either version 3 of the
29
+ # License, or (at your option) any later version.
30
+ #
31
+ # This program is distributed in the hope that it will be useful,
32
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
33
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
34
+ # GNU Affero General Public License for more details.
35
+ #
36
+ # You should have received a copy of the GNU Affero General Public License
37
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
38
+
39
+ from nidx_protos.noderesources_pb2 import Resource as IndexMessage
40
+
41
+ from nucliadb.common.catalog.dummy import DummyCatalog
42
+ from nucliadb.common.catalog.interface import Catalog, CatalogQuery
43
+ from nucliadb.common.catalog.pg import PGCatalog
44
+ from nucliadb.common.catalog.utils import build_catalog_resource_data
45
+ from nucliadb.common.maindb.driver import Transaction
46
+ from nucliadb.ingest.orm.resource import Resource
47
+ from nucliadb.ingest.settings import CatalogConfig, settings
48
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
49
+ from nucliadb_utils.exceptions import ConfigurationError
50
+
51
+
52
+ def get_catalog() -> Catalog:
53
+ if settings.catalog == CatalogConfig.UNSET:
54
+ return DummyCatalog()
55
+ elif settings.catalog == CatalogConfig.PG:
56
+ return PGCatalog()
57
+ else:
58
+ raise ConfigurationError(f"Unknown catalog configuration: {settings.catalog}")
59
+
60
+
61
+ async def catalog_update(txn: Transaction, kbid: str, resource: Resource, index_message: IndexMessage):
62
+ catalog = get_catalog()
63
+ resource_data = build_catalog_resource_data(resource, index_message)
64
+ await catalog.update(txn, kbid, resource.uuid, resource_data)
65
+
66
+
67
+ async def catalog_delete(txn: Transaction, kbid: str, rid: str):
68
+ catalog = get_catalog()
69
+ await catalog.delete(txn, kbid, rid)
70
+
71
+
72
+ async def catalog_search(query: CatalogQuery) -> Resources:
73
+ catalog = get_catalog()
74
+ return await catalog.search(query)
75
+
76
+
77
+ async def catalog_facets(kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
78
+ catalog = get_catalog()
79
+ return await catalog.facets(kbid, request)
@@ -0,0 +1,36 @@
1
+ # Copyright (C) 2021 Bosutech XXI S.L.
2
+ #
3
+ # nucliadb is offered under the AGPL v3.0 and as commercial software.
4
+ # For commercial licensing, contact us at info@nuclia.com.
5
+ #
6
+ # AGPL:
7
+ # This program is free software: you can redistribute it and/or modify
8
+ # it under the terms of the GNU Affero General Public License as
9
+ # published by the Free Software Foundation, either version 3 of the
10
+ # License, or (at your option) any later version.
11
+ #
12
+ # This program is distributed in the hope that it will be useful,
13
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
14
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
+ # GNU Affero General Public License for more details.
16
+ #
17
+ # You should have received a copy of the GNU Affero General Public License
18
+ # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
+ #
20
+ from nucliadb.common.catalog.interface import Catalog, CatalogQuery, CatalogResourceData
21
+ from nucliadb.common.maindb.driver import Transaction
22
+ from nucliadb_models.search import CatalogFacetsRequest, Resources
23
+
24
+
25
+ class DummyCatalog(Catalog):
26
+ async def update(self, txn: Transaction, kbid: str, rid: str, data: CatalogResourceData):
27
+ return
28
+
29
+ async def delete(self, txn: Transaction, kbid: str, rid: str):
30
+ return
31
+
32
+ async def search(self, query: CatalogQuery) -> Resources:
33
+ return Resources(results=[], min_score=0.0)
34
+
35
+ async def facets(self, kbid: str, request: CatalogFacetsRequest) -> dict[str, int]:
36
+ return {}