nucliadb 6.2.0.post2679__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2679.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2679.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
@@ -19,13 +19,10 @@
19
19
 
20
20
  import logging
21
21
  import os
22
- import shutil
23
22
  import uuid
24
- from socket import gethostname
25
23
 
26
24
  from nucliadb.common.cluster.settings import StandaloneNodeRole
27
25
  from nucliadb.common.cluster.settings import settings as cluster_settings
28
- from nucliadb.common.cluster.standalone.index_node import StandaloneIndexNode
29
26
 
30
27
  logger = logging.getLogger(__name__)
31
28
 
@@ -46,46 +43,6 @@ def get_standalone_node_id() -> str:
46
43
  return str(uuid.UUID(bytes=f.read()))
47
44
 
48
45
 
49
- _SELF_INDEX_NODE = None
50
-
51
-
52
- def get_self() -> StandaloneIndexNode:
53
- """
54
- This returns an instance of the standalone index node
55
- so when API requests come into this mode, we don't
56
- make another grpc request since this node can service it directly.
57
- """
58
- if not is_index_node():
59
- raise Exception("This node is not an Index Node. You should not reach this code path.")
60
- global _SELF_INDEX_NODE
61
- node_id = get_standalone_node_id()
62
- if _SELF_INDEX_NODE is None or node_id != _SELF_INDEX_NODE.id:
63
- if "NUCLIADB_SERVICE_HOST" in os.environ:
64
- hn = os.environ["HOSTNAME"]
65
- ns = os.environ.get("NAMESPACE", "nucliadb")
66
- host = f"{hn}.{ns}"
67
- else:
68
- host = gethostname()
69
- _SELF_INDEX_NODE = StandaloneIndexNode(id=node_id, address=host, shard_count=0, available_disk=0)
70
- try:
71
- _, _, available_disk = shutil.disk_usage(cluster_settings.data_path)
72
- _SELF_INDEX_NODE.available_disk = available_disk
73
- except FileNotFoundError: # pragma: no cover
74
- ...
75
- try:
76
- _shards_dir = os.path.join(cluster_settings.data_path, "shards")
77
- _SELF_INDEX_NODE.shard_count = len(
78
- [
79
- shard_dir
80
- for shard_dir in os.listdir(_shards_dir)
81
- if os.path.isdir(os.path.join(_shards_dir, shard_dir))
82
- ]
83
- )
84
- except FileNotFoundError: # pragma: no cover
85
- ...
86
- return _SELF_INDEX_NODE
87
-
88
-
89
46
  def is_index_node() -> bool:
90
47
  return cluster_settings.standalone_node_role in (
91
48
  StandaloneNodeRole.ALL,
@@ -23,20 +23,11 @@ from typing import TYPE_CHECKING, Optional, Union
23
23
  import backoff
24
24
 
25
25
  from nucliadb.common import datamanagers
26
- from nucliadb.common.cluster.discovery.utils import (
27
- setup_cluster_discovery,
28
- teardown_cluster_discovery,
29
- )
30
26
  from nucliadb.common.cluster.manager import (
31
27
  KBShardManager,
32
28
  StandaloneKBShardManager,
33
- clear_index_nodes,
34
29
  )
35
30
  from nucliadb.common.cluster.settings import settings
36
- from nucliadb.common.cluster.standalone.service import (
37
- start_grpc as start_standalone_grpc,
38
- )
39
- from nucliadb.common.cluster.standalone.utils import is_index_node
40
31
  from nucliadb.ingest.orm.resource import Resource
41
32
  from nucliadb_protos import nodereader_pb2, writer_pb2
42
33
  from nucliadb_utils import const
@@ -62,12 +53,8 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
62
53
  # already setup
63
54
  return get_utility(Utility.SHARD_MANAGER)
64
55
 
65
- await setup_cluster_discovery()
66
56
  mng: Union[KBShardManager, StandaloneKBShardManager]
67
57
  if settings.standalone_mode:
68
- if is_index_node():
69
- server = await start_standalone_grpc()
70
- set_utility(_STANDALONE_SERVER, server)
71
58
  mng = StandaloneKBShardManager()
72
59
  else:
73
60
  mng = KBShardManager()
@@ -76,7 +63,6 @@ async def setup_cluster() -> Union[KBShardManager, StandaloneKBShardManager]:
76
63
 
77
64
 
78
65
  async def teardown_cluster():
79
- await teardown_cluster_discovery()
80
66
  if get_utility(Utility.SHARD_MANAGER):
81
67
  clean_utility(Utility.SHARD_MANAGER)
82
68
 
@@ -85,8 +71,6 @@ async def teardown_cluster():
85
71
  await std_server.stop(None)
86
72
  clean_utility(_STANDALONE_SERVER)
87
73
 
88
- clear_index_nodes()
89
-
90
74
 
91
75
  def get_shard_manager() -> KBShardManager:
92
76
  return get_utility(Utility.SHARD_MANAGER) # type: ignore
@@ -26,3 +26,4 @@ class IndexCounts:
26
26
  fields: int
27
27
  paragraphs: int
28
28
  sentences: int
29
+ size_bytes: int
@@ -23,11 +23,13 @@ from typing import Optional
23
23
  from google.protobuf.message import Message
24
24
 
25
25
  from nucliadb.common.datamanagers.utils import get_kv_pb
26
+ from nucliadb.common.ids import FIELD_TYPE_PB_TO_STR
26
27
  from nucliadb.common.maindb.driver import Transaction
27
28
  from nucliadb_protos import writer_pb2
28
29
 
29
30
  KB_RESOURCE_FIELD = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
30
31
  KB_RESOURCE_FIELD_ERROR = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/error"
32
+ KB_RESOURCE_FIELD_STATUS = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/status"
31
33
 
32
34
 
33
35
  async def get_raw(
@@ -52,13 +54,7 @@ async def set(
52
54
 
53
55
  async def delete(txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str):
54
56
  base_key = KB_RESOURCE_FIELD.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
55
- # Make sure we explicitly delete the field and any nested key
56
- keys_to_delete = []
57
- async for key in txn.keys(base_key):
58
- keys_to_delete.append(key)
59
-
60
- for key in keys_to_delete:
61
- await txn.delete(key)
57
+ await txn.delete_by_prefix(base_key)
62
58
 
63
59
 
64
60
  # Error
@@ -82,3 +78,48 @@ async def set_error(
82
78
  ):
83
79
  key = KB_RESOURCE_FIELD_ERROR.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
84
80
  await txn.set(key, error.SerializeToString())
81
+
82
+
83
+ # Status, replaces error
84
+
85
+
86
+ async def get_status(
87
+ txn: Transaction, *, kbid: str, rid: str, field_type: str, field_id: str
88
+ ) -> Optional[writer_pb2.FieldStatus]:
89
+ key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
90
+ return await get_kv_pb(txn, key, writer_pb2.FieldStatus)
91
+
92
+
93
+ async def get_statuses(
94
+ txn: Transaction, *, kbid: str, rid: str, fields: list[writer_pb2.FieldID]
95
+ ) -> list[writer_pb2.FieldStatus]:
96
+ keys = [
97
+ KB_RESOURCE_FIELD_STATUS.format(
98
+ kbid=kbid, uuid=rid, type=FIELD_TYPE_PB_TO_STR[fid.field_type], field=fid.field
99
+ )
100
+ for fid in fields
101
+ ]
102
+ serialized = await txn.batch_get(keys, for_update=False)
103
+ statuses = []
104
+ for serialized_status in serialized:
105
+ pb = writer_pb2.FieldStatus()
106
+ if serialized_status is not None:
107
+ pb.ParseFromString(serialized_status)
108
+ else:
109
+ pb = writer_pb2.FieldStatus()
110
+ statuses.append(pb)
111
+
112
+ return statuses
113
+
114
+
115
+ async def set_status(
116
+ txn: Transaction,
117
+ *,
118
+ kbid: str,
119
+ rid: str,
120
+ field_type: str,
121
+ field_id: str,
122
+ status: writer_pb2.FieldStatus,
123
+ ):
124
+ key = KB_RESOURCE_FIELD_STATUS.format(kbid=kbid, uuid=rid, type=field_type, field=field_id)
125
+ await txn.set(key, status.SerializeToString())
@@ -58,6 +58,11 @@ async def iter(
58
58
  yield config.vectorset_id, config
59
59
 
60
60
 
61
+ async def count(txn: Transaction, *, kbid: str) -> int:
62
+ kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=False)
63
+ return len(kb_vectorsets.vectorsets)
64
+
65
+
61
66
  async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSetConfig):
62
67
  """Create or update a vectorset configuration"""
63
68
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
@@ -73,16 +78,20 @@ async def set(txn: Transaction, *, kbid: str, config: knowledgebox_pb2.VectorSet
73
78
  await txn.set(key, kb_vectorsets.SerializeToString())
74
79
 
75
80
 
76
- async def delete(txn: Transaction, *, kbid: str, vectorset_id: str):
81
+ async def delete(
82
+ txn: Transaction, *, kbid: str, vectorset_id: str
83
+ ) -> Optional[knowledgebox_pb2.VectorSetConfig]:
77
84
  kb_vectorsets = await _get_or_default(txn, kbid=kbid, for_update=True)
78
85
  index = _find_vectorset(kb_vectorsets, vectorset_id)
79
86
  if index is None:
80
87
  # already deleted
81
- return
88
+ return None
82
89
 
90
+ deleted = kb_vectorsets.vectorsets[index]
83
91
  del kb_vectorsets.vectorsets[index]
84
92
  key = KB_VECTORSETS.format(kbid=kbid)
85
93
  await txn.set(key, kb_vectorsets.SerializeToString())
94
+ return deleted
86
95
 
87
96
 
88
97
  # XXX At some point in the vectorset epic, we should make this key mandatory and
@@ -28,7 +28,7 @@ from nucliadb.common.counters import IndexCounts
28
28
  from nucliadb.common.external_index_providers.exceptions import ExternalIndexingError
29
29
  from nucliadb.common.ids import ParagraphId
30
30
  from nucliadb_models.external_index_providers import ExternalIndexProviderType
31
- from nucliadb_models.search import SCORE_TYPE, TextPosition
31
+ from nucliadb_models.search import SCORE_TYPE, Relations, TextPosition
32
32
  from nucliadb_protos.knowledgebox_pb2 import (
33
33
  CreateExternalIndexProviderMetadata,
34
34
  StoredExternalIndexProviderMetadata,
@@ -73,6 +73,7 @@ class TextBlockMatch(BaseModel):
73
73
  paragraph_labels: list[str] = []
74
74
  field_labels: list[str] = []
75
75
  text: Optional[str] = None
76
+ relevant_relations: Optional[Relations] = None
76
77
 
77
78
 
78
79
  class QueryResults(BaseModel):
@@ -441,6 +441,7 @@ class PineconeIndexManager(ExternalIndexManager):
441
441
 
442
442
  def get_prefixes_to_delete(self, index_data: Resource) -> set[str]:
443
443
  prefixes_to_delete = set()
444
+ # TODO: migrate to vector_prefixes_to_delete
444
445
  for field_id in index_data.sentences_to_delete:
445
446
  try:
446
447
  delete_vid = VectorId.from_string(field_id)
@@ -706,11 +707,7 @@ class PineconeIndexManager(ExternalIndexManager):
706
707
  if self.kbid in COUNTERS_CACHE:
707
708
  # Cache hit
708
709
  return COUNTERS_CACHE[self.kbid]
709
- total = IndexCounts(
710
- fields=0,
711
- paragraphs=0,
712
- sentences=0,
713
- )
710
+ total = IndexCounts(fields=0, paragraphs=0, sentences=0, size_bytes=0)
714
711
  tasks = []
715
712
  vectorset_results: dict[str, IndexCounts] = {}
716
713
 
@@ -738,6 +735,7 @@ class PineconeIndexManager(ExternalIndexManager):
738
735
  fields=0,
739
736
  paragraphs=index_stats.totalVectorCount,
740
737
  sentences=index_stats.totalVectorCount,
738
+ size_bytes=0,
741
739
  )
742
740
  except Exception:
743
741
  logger.exception(
nucliadb/common/ids.py CHANGED
@@ -111,13 +111,11 @@ class FieldId:
111
111
  parts = value.split("/")
112
112
  if len(parts) == 3:
113
113
  rid, _type, key = parts
114
- if _type not in FIELD_TYPE_STR_TO_PB:
115
- raise ValueError(f"Invalid FieldId: {value}")
114
+ _type = cls.parse_field_type(_type)
116
115
  return cls(rid=rid, type=_type, key=key)
117
116
  elif len(parts) == 4:
118
117
  rid, _type, key, subfield_id = parts
119
- if _type not in FIELD_TYPE_STR_TO_PB:
120
- raise ValueError(f"Invalid FieldId: {value}")
118
+ _type = cls.parse_field_type(_type)
121
119
  return cls(
122
120
  rid=rid,
123
121
  type=_type,
@@ -127,6 +125,22 @@ class FieldId:
127
125
  else:
128
126
  raise ValueError(f"Invalid FieldId: {value}")
129
127
 
128
+ @classmethod
129
+ def parse_field_type(cls, _type: str) -> str:
130
+ if _type not in FIELD_TYPE_STR_TO_PB:
131
+ # Try to parse the enum value
132
+ # XXX: This is to support field types that are integer values of FieldType
133
+ # Which is how legacy processor relations reported the paragraph_id
134
+ try:
135
+ type_pb = FieldType.ValueType(int(_type))
136
+ except ValueError:
137
+ raise ValueError(f"Invalid FieldId: {_type}")
138
+ if type_pb in FIELD_TYPE_PB_TO_STR:
139
+ return FIELD_TYPE_PB_TO_STR[type_pb]
140
+ else:
141
+ raise ValueError(f"Invalid FieldId: {_type}")
142
+ return _type
143
+
130
144
 
131
145
  @dataclass
132
146
  class ParagraphId: