nucliadb 6.2.0.post2675__py3-none-any.whl → 6.2.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
Files changed (105)
  1. migrations/0028_extracted_vectors_reference.py +61 -0
  2. migrations/0029_backfill_field_status.py +149 -0
  3. migrations/0030_label_deduplication.py +60 -0
  4. nucliadb/common/cluster/manager.py +41 -331
  5. nucliadb/common/cluster/rebalance.py +2 -2
  6. nucliadb/common/cluster/rollover.py +12 -71
  7. nucliadb/common/cluster/settings.py +3 -0
  8. nucliadb/common/cluster/standalone/utils.py +0 -43
  9. nucliadb/common/cluster/utils.py +0 -16
  10. nucliadb/common/counters.py +1 -0
  11. nucliadb/common/datamanagers/fields.py +48 -7
  12. nucliadb/common/datamanagers/vectorsets.py +11 -2
  13. nucliadb/common/external_index_providers/base.py +2 -1
  14. nucliadb/common/external_index_providers/pinecone.py +3 -5
  15. nucliadb/common/ids.py +18 -4
  16. nucliadb/common/models_utils/from_proto.py +479 -0
  17. nucliadb/common/models_utils/to_proto.py +60 -0
  18. nucliadb/common/nidx.py +76 -37
  19. nucliadb/export_import/models.py +3 -3
  20. nucliadb/health.py +0 -7
  21. nucliadb/ingest/app.py +0 -8
  22. nucliadb/ingest/consumer/auditing.py +1 -1
  23. nucliadb/ingest/consumer/shard_creator.py +1 -1
  24. nucliadb/ingest/fields/base.py +83 -21
  25. nucliadb/ingest/orm/brain.py +55 -56
  26. nucliadb/ingest/orm/broker_message.py +12 -2
  27. nucliadb/ingest/orm/entities.py +6 -17
  28. nucliadb/ingest/orm/knowledgebox.py +44 -22
  29. nucliadb/ingest/orm/processor/data_augmentation.py +7 -29
  30. nucliadb/ingest/orm/processor/processor.py +5 -2
  31. nucliadb/ingest/orm/resource.py +222 -413
  32. nucliadb/ingest/processing.py +8 -2
  33. nucliadb/ingest/serialize.py +77 -46
  34. nucliadb/ingest/service/writer.py +2 -56
  35. nucliadb/ingest/settings.py +1 -4
  36. nucliadb/learning_proxy.py +6 -4
  37. nucliadb/purge/__init__.py +102 -12
  38. nucliadb/purge/orphan_shards.py +6 -4
  39. nucliadb/reader/api/models.py +3 -3
  40. nucliadb/reader/api/v1/__init__.py +1 -0
  41. nucliadb/reader/api/v1/download.py +2 -2
  42. nucliadb/reader/api/v1/knowledgebox.py +3 -3
  43. nucliadb/reader/api/v1/resource.py +23 -12
  44. nucliadb/reader/api/v1/services.py +4 -4
  45. nucliadb/reader/api/v1/vectorsets.py +48 -0
  46. nucliadb/search/api/v1/ask.py +11 -1
  47. nucliadb/search/api/v1/feedback.py +3 -3
  48. nucliadb/search/api/v1/knowledgebox.py +8 -13
  49. nucliadb/search/api/v1/search.py +3 -2
  50. nucliadb/search/api/v1/suggest.py +0 -2
  51. nucliadb/search/predict.py +6 -4
  52. nucliadb/search/requesters/utils.py +1 -2
  53. nucliadb/search/search/chat/ask.py +77 -13
  54. nucliadb/search/search/chat/prompt.py +16 -5
  55. nucliadb/search/search/chat/query.py +74 -34
  56. nucliadb/search/search/exceptions.py +2 -7
  57. nucliadb/search/search/find.py +9 -5
  58. nucliadb/search/search/find_merge.py +10 -4
  59. nucliadb/search/search/graph_strategy.py +884 -0
  60. nucliadb/search/search/hydrator.py +6 -0
  61. nucliadb/search/search/merge.py +79 -24
  62. nucliadb/search/search/query.py +74 -245
  63. nucliadb/search/search/query_parser/exceptions.py +11 -1
  64. nucliadb/search/search/query_parser/fetcher.py +405 -0
  65. nucliadb/search/search/query_parser/models.py +0 -3
  66. nucliadb/search/search/query_parser/parser.py +22 -21
  67. nucliadb/search/search/rerankers.py +1 -42
  68. nucliadb/search/search/shards.py +19 -0
  69. nucliadb/standalone/api_router.py +2 -14
  70. nucliadb/standalone/settings.py +4 -0
  71. nucliadb/train/generators/field_streaming.py +7 -3
  72. nucliadb/train/lifecycle.py +3 -6
  73. nucliadb/train/nodes.py +14 -12
  74. nucliadb/train/resource.py +380 -0
  75. nucliadb/writer/api/constants.py +20 -16
  76. nucliadb/writer/api/v1/__init__.py +1 -0
  77. nucliadb/writer/api/v1/export_import.py +1 -1
  78. nucliadb/writer/api/v1/field.py +13 -7
  79. nucliadb/writer/api/v1/knowledgebox.py +3 -46
  80. nucliadb/writer/api/v1/resource.py +20 -13
  81. nucliadb/writer/api/v1/services.py +10 -1
  82. nucliadb/writer/api/v1/upload.py +61 -34
  83. nucliadb/writer/{vectorsets.py → api/v1/vectorsets.py} +99 -47
  84. nucliadb/writer/back_pressure.py +17 -46
  85. nucliadb/writer/resource/basic.py +9 -7
  86. nucliadb/writer/resource/field.py +42 -9
  87. nucliadb/writer/settings.py +2 -2
  88. nucliadb/writer/tus/gcs.py +11 -10
  89. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/METADATA +11 -14
  90. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/RECORD +94 -96
  91. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/WHEEL +1 -1
  92. nucliadb/common/cluster/discovery/base.py +0 -178
  93. nucliadb/common/cluster/discovery/k8s.py +0 -301
  94. nucliadb/common/cluster/discovery/manual.py +0 -57
  95. nucliadb/common/cluster/discovery/single.py +0 -51
  96. nucliadb/common/cluster/discovery/types.py +0 -32
  97. nucliadb/common/cluster/discovery/utils.py +0 -67
  98. nucliadb/common/cluster/standalone/grpc_node_binding.py +0 -349
  99. nucliadb/common/cluster/standalone/index_node.py +0 -123
  100. nucliadb/common/cluster/standalone/service.py +0 -84
  101. nucliadb/standalone/introspect.py +0 -208
  102. nucliadb-6.2.0.post2675.dist-info/zip-safe +0 -1
  103. /nucliadb/common/{cluster/discovery → models_utils}/__init__.py +0 -0
  104. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/entry_points.txt +0 -0
  105. {nucliadb-6.2.0.post2675.dist-info → nucliadb-6.2.1.dist-info}/top_level.txt +0 -0
nucliadb/common/nidx.py CHANGED
@@ -21,7 +21,7 @@
 import os
 from typing import Optional

-from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxSearcherStub
+from nidx_protos.nidx_pb2_grpc import NidxApiStub, NidxIndexerStub, NidxSearcherStub

 from nucliadb.common.cluster.base import AbstractIndexNode
 from nucliadb.common.cluster.settings import settings
@@ -37,12 +37,10 @@ from nucliadb_utils.settings import FileBackendConfig, indexing_settings, storag
 from nucliadb_utils.storages.settings import settings as extended_storage_settings
 from nucliadb_utils.utilities import Utility, clean_utility, get_utility, set_utility

-NIDX_ENABLED = bool(os.environ.get("NIDX_ENABLED"))
-

 class NidxUtility:
-    api_client = None
-    searcher_client = None
+    api_client: NidxApiStub
+    searcher_client: NidxSearcherStub

     async def initialize(self):
         raise NotImplementedError()
@@ -98,6 +96,9 @@ class NidxBindingUtility(NidxUtility):

         self.config = {
             "METADATA__DATABASE_URL": ingest_settings.driver_pg_url,
+            "SEARCHER__METADATA_REFRESH_INTERVAL": str(
+                indexing_settings.index_searcher_refresh_interval
+            ),
             **_storage_config("INDEXER", None),
             **_storage_config("STORAGE", "nidx"),
         }
@@ -123,16 +124,8 @@ class NidxBindingUtility(NidxUtility):
         self.binding.wait_for_sync()


-class NidxServiceUtility(NidxUtility):
-    """Implements Nidx utility connecting to the network service"""
-
+class NidxNatsIndexer:
     def __init__(self):
-        if indexing_settings.index_nidx_subject is None:
-            raise ValueError("INDEX_NIDX_SUBJECT needed for nidx utility")
-
-        if not settings.nidx_api_address or not settings.nidx_searcher_address:
-            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")
-
         self.nats_connection_manager = NatsConnectionManager(
             service_name="NidxIndexer",
             nats_servers=indexing_settings.index_jetstream_servers,
@@ -142,10 +135,6 @@ class NidxServiceUtility(NidxUtility):

     async def initialize(self):
         await self.nats_connection_manager.initialize()
-        self.api_client = NidxApiStub(get_traced_grpc_channel(settings.nidx_api_address, "nidx_api"))
-        self.searcher_client = NidxSearcherStub(
-            get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
-        )

     async def finalize(self):
         await self.nats_connection_manager.finalize()
@@ -158,18 +147,68 @@ class NidxServiceUtility(NidxUtility):
         return res.seq


-async def start_nidx_utility() -> Optional[NidxUtility]:
-    if not NIDX_ENABLED:
-        return None
+class NidxGrpcIndexer:
+    def __init__(self, address):
+        self.address = address

-    nidx = get_nidx()
+    async def initialize(self):
+        self.client = NidxIndexerStub(get_traced_grpc_channel(self.address, "nidx_indexer"))
+
+    async def finalize(self):
+        pass
+
+    async def index(self, writer: IndexMessage) -> int:
+        await self.client.Index(writer)
+        return 0
+
+
+class NidxServiceUtility(NidxUtility):
+    """Implements Nidx utility connecting to the network service"""
+
+    def __init__(self):
+        if not settings.nidx_api_address or not settings.nidx_searcher_address:
+            raise ValueError("NIDX_API_ADDRESS and NIDX_SEARCHER_ADDRESS are required")
+
+        if indexing_settings.index_nidx_subject:
+            self.indexer = NidxNatsIndexer()
+        elif settings.nidx_indexer_address is not None:
+            self.indexer = NidxGrpcIndexer(settings.nidx_indexer_address)
+        else:
+            raise ValueError("NIDX_INDEXER_ADDRESS or INDEX_NIDX_SUBJECT are required")
+
+    async def initialize(self):
+        await self.indexer.initialize()
+        self.api_client = NidxApiStub(get_traced_grpc_channel(settings.nidx_api_address, "nidx_api"))
+        self.searcher_client = NidxSearcherStub(
+            get_traced_grpc_channel(settings.nidx_searcher_address, "nidx_searcher")
+        )
+
+    async def finalize(self):
+        await self.indexer.finalize()
+
+    async def index(self, writer: IndexMessage) -> int:
+        return await self.indexer.index(writer)
+
+
+async def start_nidx_utility() -> Optional[NidxUtility]:
+    nidx = get_utility(Utility.NIDX)
     if nidx:
         return nidx

     nidx_utility: NidxUtility
     if settings.standalone_mode:
-        nidx_utility = NidxBindingUtility()
+        if (
+            settings.nidx_api_address is not None
+            and settings.nidx_searcher_address is not None
+            and settings.nidx_indexer_address is not None
+        ):
+            # Standalone with nidx service (via grpc). This is used in clustered standalone mode
+            nidx_utility = NidxServiceUtility()
+        else:
+            # Normal standalone mode with binding
+            nidx_utility = NidxBindingUtility()
     else:
+        # Component deploy with nidx service via grpc & nats (cloud)
         nidx_utility = NidxServiceUtility()

     await nidx_utility.initialize()
@@ -178,30 +217,33 @@ async def start_nidx_utility() -> Optional[NidxUtility]:


 async def stop_nidx_utility():
-    nidx_utility = get_nidx()
+    nidx_utility = get_utility(Utility.NIDX)
     if nidx_utility:
         clean_utility(Utility.NIDX)
         await nidx_utility.finalize()


-def get_nidx() -> Optional[NidxUtility]:
-    return get_utility(Utility.NIDX)
+def get_nidx() -> NidxUtility:
+    nidx = get_utility(Utility.NIDX)
+    if nidx is None:
+        raise Exception("nidx not initialized")
+    return nidx


-def get_nidx_api_client() -> Optional["NidxApiStub"]:
+def get_nidx_api_client() -> "NidxApiStub":
     nidx = get_nidx()
-    if nidx:
+    if nidx.api_client:
         return nidx.api_client
     else:
-        return None
+        raise Exception("nidx not initialized")


-def get_nidx_searcher_client() -> Optional["NidxSearcherStub"]:
+def get_nidx_searcher_client() -> "NidxSearcherStub":
     nidx = get_nidx()
-    if nidx:
+    if nidx.searcher_client:
         return nidx.searcher_client
     else:
-        return None
+        raise Exception("nidx not initialized")


 # TODO: Remove the index node abstraction
@@ -252,9 +294,6 @@ class FakeNode(AbstractIndexNode):
         return "nidx"


-def get_nidx_fake_node() -> Optional[FakeNode]:
+def get_nidx_fake_node() -> FakeNode:
     nidx = get_nidx()
-    if nidx:
-        return FakeNode(nidx.api_client, nidx.searcher_client)
-    else:
-        return None
+    return FakeNode(nidx.api_client, nidx.searcher_client)
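Taken together, these hunks drop the NIDX_ENABLED flag, split indexing into NATS and gRPC backends, and make the choice between the embedded binding and the network service depend on the configured addresses. A summary sketch (not shipped code; the setting and env names are the ones referenced in the hunks above):

# Summary sketch only: how start_nidx_utility() now picks an implementation.
from typing import Optional

def pick_nidx_utility(standalone_mode: bool,
                      api: Optional[str],
                      searcher: Optional[str],
                      indexer: Optional[str]) -> str:
    if standalone_mode:
        if api and searcher and indexer:
            # clustered standalone: talk to a nidx service over gRPC
            return "NidxServiceUtility"
        # normal standalone: embedded nidx binding
        return "NidxBindingUtility"
    # component deploy (cloud): nidx service; NidxServiceUtility then indexes via
    # NATS when INDEX_NIDX_SUBJECT is set, or via gRPC when NIDX_INDEXER_ADDRESS is set
    return "NidxServiceUtility"

Note also that get_nidx(), get_nidx_api_client() and get_nidx_searcher_client() now raise when the utility has not been initialized instead of returning None, so callers can drop their Optional handling.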
nucliadb/export_import/models.py CHANGED
@@ -17,7 +17,7 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
-from datetime import datetime
+import datetime
 from enum import Enum
 from typing import Any

@@ -57,8 +57,8 @@ class Metadata(BaseModel):
     task: TaskMetadata = TaskMetadata(status=Status.SCHEDULED)
     total: int = 0
     processed: int = 0
-    created: datetime = datetime.utcnow()
-    modified: datetime = datetime.utcnow()
+    created: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)
+    modified: datetime.datetime = datetime.datetime.now(datetime.timezone.utc)


 class ExportMetadata(Metadata):
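The Metadata change above swaps the naive datetime.utcnow() defaults for timezone-aware ones. A minimal illustration, independent of nucliadb, of why that matters:

# Minimal illustration (not nucliadb code): naive and aware datetimes don't mix,
# and datetime.utcnow() is deprecated since Python 3.12.
import datetime

naive = datetime.datetime.utcnow()                     # no tzinfo attached
aware = datetime.datetime.now(datetime.timezone.utc)   # tzinfo=UTC

print(aware.tzinfo)  # UTC
try:
    aware - naive
except TypeError as exc:
    print(exc)  # can't subtract offset-naive and offset-aware datetimes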
nucliadb/health.py CHANGED
@@ -40,13 +40,6 @@ def nats_manager_healthy() -> bool:
     return nats_manager.healthy()


-def nodes_health_check() -> bool:
-    from nucliadb.common.cluster import manager
-    from nucliadb.ingest.settings import DriverConfig, settings
-
-    return len(manager.INDEX_NODES) > 0 or settings.driver == DriverConfig.LOCAL
-
-
 def pubsub_check() -> bool:
     driver: Optional[PubSubDriver] = get_utility(Utility.PUBSUB)
     if driver is None:
nucliadb/ingest/app.py CHANGED
@@ -22,10 +22,6 @@ import importlib.metadata
 from typing import Awaitable, Callable

 from nucliadb import health
-from nucliadb.common.cluster.discovery.utils import (
-    setup_cluster_discovery,
-    teardown_cluster_discovery,
-)
 from nucliadb.common.cluster.settings import settings as cluster_settings
 from nucliadb.common.cluster.utils import setup_cluster, teardown_cluster
 from nucliadb.common.context import ApplicationContext
@@ -89,13 +85,9 @@ async def initialize() -> list[Callable[[], Awaitable[None]]]:
     )
     finalizers.append(stop_nats_manager)

-    await setup_cluster_discovery()
-    finalizers.append(teardown_cluster_discovery)
-
     health.register_health_checks(
         [
             health.nats_manager_healthy,
-            health.nodes_health_check,
             health.pubsub_check,
         ]
     )
nucliadb/ingest/consumer/auditing.py CHANGED
@@ -113,7 +113,7 @@ class IndexAuditHandler:

         for shard_obj in shard_groups:
            # TODO: Uses node for auditing, don't want to suddenly change metrics
-            node, shard_id = choose_node(shard_obj, use_nidx=False)
+            node, shard_id = choose_node(shard_obj)
             shard: nodereader_pb2.Shard = await node.reader.GetShard(
                 nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
             )
nucliadb/ingest/consumer/shard_creator.py CHANGED
@@ -103,7 +103,7 @@ class ShardCreatorHandler:
         async with locking.distributed_lock(locking.NEW_SHARD_LOCK.format(kbid=kbid)):
             # remember, a lock will do at least 1+ reads and 1 write.
             # with heavy writes, this adds some simple k/v pressure
-            node, shard_id = choose_node(current_shard, use_nidx=True)
+            node, shard_id = choose_node(current_shard)
             shard: nodereader_pb2.Shard = await node.reader.GetShard(
                 nodereader_pb2.GetShardRequest(shard_id=noderesources_pb2.ShardId(id=shard_id))  # type: ignore
             )
nucliadb/ingest/fields/base.py CHANGED
@@ -21,12 +21,13 @@ from __future__ import annotations

 import enum
 from datetime import datetime
-from typing import Any, Generic, Optional, Type, TypeVar
+from typing import TYPE_CHECKING, Any, Generic, Optional, Type, TypeVar

 from google.protobuf.message import DecodeError, Message

 from nucliadb.common import datamanagers
 from nucliadb.ingest.fields.exceptions import InvalidFieldClass, InvalidPBClass
+from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
     CloudFile,
     ExtractedTextWrapper,
@@ -41,12 +42,19 @@ from nucliadb_protos.resources_pb2 import (
     QuestionAnswers,
 )
 from nucliadb_protos.utils_pb2 import ExtractedText, VectorObject
-from nucliadb_protos.writer_pb2 import Error
+from nucliadb_protos.writer_pb2 import Error, FieldStatus
+from nucliadb_utils.storages.exceptions import CouldNotCopyNotFound
 from nucliadb_utils.storages.storage import Storage, StorageField

+if TYPE_CHECKING:  # pragma: no cover
+    from nucliadb.ingest.orm.resource import Resource
+
+
 SUBFIELDFIELDS = ("c",)


+# NOTE extracted vectors key is no longer a static key, it is stored in each
+# vectorset
 class FieldTypes(str, enum.Enum):
     FIELD_TEXT = "extracted_text"
     FIELD_VECTORS = "extracted_vectors"
@@ -73,7 +81,7 @@ class Field(Generic[PbType]):
     def __init__(
         self,
         id: str,
-        resource: Any,
+        resource: Resource,
         pb: Optional[Any] = None,
         value: Optional[Any] = None,
     ):
@@ -88,7 +96,7 @@
         self.question_answers = None

         self.id: str = id
-        self.resource: Any = resource
+        self.resource = resource

         if value is not None:
             newpb = self.pbklass()
@@ -119,11 +127,20 @@
     def get_storage_field(self, field_type: FieldTypes) -> StorageField:
         return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, field_type.value)

-    def _get_extracted_vectors_storage_field(self, vectorset: Optional[str] = None) -> StorageField:
-        if vectorset:
+    def _get_extracted_vectors_storage_field(
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> StorageField:
+        if storage_key_kind == VectorSetConfig.StorageKeyKind.LEGACY:
+            key = FieldTypes.FIELD_VECTORS.value
+        elif storage_key_kind == VectorSetConfig.StorageKeyKind.VECTORSET_PREFIX:
             key = FieldTypes.FIELD_VECTORSET.value.format(vectorset=vectorset)
         else:
-            key = FieldTypes.FIELD_VECTORS.value
+            raise ValueError(
+                f"Can't do anything with UNSET or unknown vectorset storage key kind: {storage_key_kind}"
+            )
+
         return self.storage.file_extracted(self.kbid, self.uuid, self.type, self.id, key)

     async def db_get_value(self) -> Optional[PbType]:
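The storage key for extracted vectors is now derived from the vectorset's StorageKeyKind: LEGACY keeps the old un-prefixed key, while VECTORSET_PREFIX adds the vectorset to the path. A rough sketch of the mapping (not nucliadb code; the real FIELD_VECTORSET template lives in the FieldTypes enum and is not shown in this hunk, so the prefixed key shape below is an assumption):

# Rough sketch; StorageKeyKind stands in for VectorSetConfig.StorageKeyKind.
from enum import Enum

class StorageKeyKind(Enum):
    LEGACY = 1
    VECTORSET_PREFIX = 2

def extracted_vectors_key(vectorset: str, kind: StorageKeyKind) -> str:
    if kind is StorageKeyKind.LEGACY:
        return "extracted_vectors"               # old single-vectorset layout
    if kind is StorageKeyKind.VECTORSET_PREFIX:
        return f"{vectorset}/extracted_vectors"  # assumed shape of the prefixed key
    raise ValueError(f"unknown storage key kind: {kind}")

print(extracted_vectors_key("my-vectorset", StorageKeyKind.VECTORSET_PREFIX))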
@@ -163,7 +180,8 @@
             field_id=self.id,
         )
         await self.delete_extracted_text()
-        await self.delete_vectors()
+        async for vectorset_id, vs in datamanagers.vectorsets.iter(self.resource.txn, kbid=self.kbid):
+            await self.delete_vectors(vectorset_id, vs.storage_key_kind)
         await self.delete_metadata()
         await self.delete_question_answers()

@@ -181,9 +199,13 @@
         except KeyError:
             pass

-    async def delete_vectors(self, vectorset: Optional[str] = None) -> None:
+    async def delete_vectors(
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> None:
         # Try delete vectors
-        sf = self._get_extracted_vectors_storage_field(vectorset)
+        sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
         try:
             await self.storage.delete_upload(sf.key, sf.bucket)
         except KeyError:
@@ -215,6 +237,25 @@
             error=error,
         )

+    async def get_status(self) -> Optional[FieldStatus]:
+        return await datamanagers.fields.get_status(
+            self.resource.txn,
+            kbid=self.kbid,
+            rid=self.uuid,
+            field_type=self.type,
+            field_id=self.id,
+        )
+
+    async def set_status(self, status: FieldStatus) -> None:
+        await datamanagers.fields.set_status(
+            self.resource.txn,
+            kbid=self.kbid,
+            rid=self.uuid,
+            field_type=self.type,
+            field_id=self.id,
+            status=status,
+        )
+
     async def get_question_answers(self, force=False) -> Optional[FieldQuestionAnswers]:
         if self.question_answers is None or force:
             sf = self.get_storage_field(FieldTypes.QUESTION_ANSWERS)
@@ -309,12 +350,17 @@
         self.extracted_text = payload
         return self.extracted_text

-    async def set_vectors(self, payload: ExtractedVectorsWrapper) -> Optional[VectorObject]:
-        vectorset = payload.vectorset_id or None
+    async def set_vectors(
+        self,
+        payload: ExtractedVectorsWrapper,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+    ) -> Optional[VectorObject]:
         if self.type in SUBFIELDFIELDS:
             try:
                 actual_payload: Optional[VectorObject] = await self.get_vectors(
                     vectorset=vectorset,
+                    storage_key_kind=storage_key_kind,
                     force=True,
                 )
             except KeyError:
@@ -322,12 +368,29 @@
         else:
             actual_payload = None

-        sf = self._get_extracted_vectors_storage_field(vectorset)
+        sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
         vo: Optional[VectorObject] = None
         if actual_payload is None:
-            # Its first extracted text
+            # Its first extracted vectors
             if payload.HasField("file"):
-                await self.storage.normalize_binary(payload.file, sf)
+                # When we receive vectors in a cloud file, it points to our
+                # storage but paths are different, we may want to move it. This
+                # can happen, for example, with LEGACY KBs where processing
+                # sends us the extracted vectors prefixed by vectorset but, to
+                # maintain bw/c, we move those to the original not prefixed
+                # path.
+                try:
+                    await self.storage.normalize_binary(payload.file, sf)
+                except CouldNotCopyNotFound:
+                    # A failure here could mean the payload has already been
+                    # moved and we're retrying due to a redelivery or another
+                    # retry mechanism
+                    already_moved = await sf.exists()
+                    if already_moved:
+                        # We assume is the correct one and do nothing else
+                        pass
+                    else:
+                        raise
                 vo = await self.storage.download_pb(sf, VectorObject)
             else:
                 await self.storage.upload_pb(sf, payload.vectors)
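The try/except added around normalize_binary() makes the move idempotent across redeliveries: if the source object is gone but the destination already exists, the move is treated as already done. The same pattern in isolation, as a sketch using local files rather than the nucliadb storage layer:

# Sketch of the idempotent-move pattern with local files (not nucliadb code).
import os
import shutil

def move_once(src: str, dst: str) -> None:
    try:
        shutil.move(src, dst)
    except FileNotFoundError:
        if os.path.exists(dst):
            # source already consumed by an earlier retry; nothing left to do
            return
        raise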
@@ -354,14 +417,13 @@
         return vo

     async def get_vectors(
-        self, vectorset: Optional[str] = None, force: bool = False
+        self,
+        vectorset: str,
+        storage_key_kind: VectorSetConfig.StorageKeyKind.ValueType,
+        force: bool = False,
     ) -> Optional[VectorObject]:
-        # compat with vectorsets coming from protobuffers where no value is
-        # empty string instead of None. This shouldn't be handled here but we
-        # have to make sure it gets the correct vectorset
-        vectorset = vectorset or None
         if self.extracted_vectors.get(vectorset, None) is None or force:
-            sf = self._get_extracted_vectors_storage_field(vectorset)
+            sf = self._get_extracted_vectors_storage_field(vectorset, storage_key_kind)
             payload = await self.storage.download_pb(sf, VectorObject)
             if payload is not None:
                 self.extracted_vectors[vectorset] = payload
nucliadb/ingest/orm/brain.py CHANGED
@@ -100,6 +100,8 @@ class ResourceBrain:
         page_positions: Optional[FilePagePositions],
         extracted_text: Optional[ExtractedText],
         basic_user_field_metadata: Optional[UserFieldMetadata] = None,
+        *,
+        replace_field: bool = False,
     ):
         # To check for duplicate paragraphs
         unique_paragraphs: set[str] = set()
@@ -224,6 +226,11 @@

             self.brain.paragraphs[field_key].paragraphs[key].CopyFrom(p)

+        if replace_field:
+            field_type, field_name = field_key.split("/")
+            full_field_id = ids.FieldId(rid=self.rid, type=field_type, key=field_name).full()
+            self.brain.paragraphs_to_delete.append(full_field_id)
+
         for relations in metadata.metadata.relations:
             for relation in relations.relations:
                 self.brain.relations.append(relation)
@@ -239,9 +246,10 @@
         field_id: str,
         vo: utils_pb2.VectorObject,
         *,
-        vectorset: Optional[str] = None,
+        vectorset: str,
         replace_field: bool = False,
-        matryoshka_vector_dimension: Optional[int] = None,
+        # cut to specific dimension if specified
+        vector_dimension: Optional[int] = None,
     ):
         fid = ids.FieldId.from_string(f"{self.rid}/{field_id}")
         for subfield, vectors in vo.split_vectors.items():
@@ -270,7 +278,7 @@
                     sentence_key,
                     vector,
                     vectorset=vectorset,
-                    matryoshka_vector_dimension=matryoshka_vector_dimension,
+                    vector_dimension=vector_dimension,
                 )

         _field_id = ids.FieldId(
@@ -296,13 +304,12 @@
                     sentence_key,
                     vector,
                     vectorset=vectorset,
-                    matryoshka_vector_dimension=matryoshka_vector_dimension,
+                    vector_dimension=vector_dimension,
                 )

         if replace_field:
             full_field_id = ids.FieldId(rid=self.rid, type=fid.type, key=fid.key).full()
-            self.brain.sentences_to_delete.append(full_field_id)
-            self.brain.paragraphs_to_delete.append(full_field_id)
+            self.brain.vector_prefixes_to_delete[vectorset].items.append(full_field_id)

     def _apply_field_vector(
         self,
@@ -311,22 +318,15 @@
         sentence_key: ids.VectorId,
         vector: utils_pb2.Vector,
         *,
-        vectorset: Optional[str],
-        matryoshka_vector_dimension: Optional[int] = None,
+        vectorset: str,
+        # cut vectors if a specific dimension is specified
+        vector_dimension: Optional[int] = None,
     ):
         paragraph_pb = self.brain.paragraphs[field_id].paragraphs[paragraph_key.full()]
-        if vectorset:
-            sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]
-        else:
-            sentence_pb = paragraph_pb.sentences[sentence_key.full()]
+        sentence_pb = paragraph_pb.vectorsets_sentences[vectorset].sentences[sentence_key.full()]

         sentence_pb.ClearField("vector")  # clear first to prevent duplicates
-
-        # cut vectors if a specific dimension is specified
-        if matryoshka_vector_dimension is not None:
-            sentence_pb.vector.extend(vector.vector[:matryoshka_vector_dimension])
-        else:
-            sentence_pb.vector.extend(vector.vector)
+        sentence_pb.vector.extend(vector.vector[:vector_dimension])

         # we only care about start/stop position of the paragraph for a given sentence here
         # the key has the sentence position
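The simplification in _apply_field_vector relies on Python slice semantics: slicing with None is a no-op, so one expression covers both the matryoshka cut and the full vector.

# Why vector.vector[:vector_dimension] replaces the old if/else:
values = [0.1, 0.2, 0.3, 0.4]
assert values[:None] == values   # vector_dimension is None: keep the whole vector
assert values[:2] == [0.1, 0.2]  # vector_dimension set: matryoshka cut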
@@ -490,22 +490,29 @@
     ):
         if metadata.mime_type != "":
             labels["mt"].add(metadata.mime_type)
+
+        base_classification_relation = Relation(
+            relation=Relation.ABOUT,
+            source=relation_node_document,
+            to=RelationNode(
+                ntype=RelationNode.NodeType.LABEL,
+            ),
+        )
         for classification in metadata.classifications:
             label = f"{classification.labelset}/{classification.label}"
             if label not in user_canceled_labels:
                 labels["l"].add(label)
-                relation_node_label = RelationNode(
-                    value=label,
-                    ntype=RelationNode.NodeType.LABEL,
-                )
-                self.brain.relations.append(
-                    Relation(
-                        relation=Relation.ABOUT,
-                        source=relation_node_document,
-                        to=relation_node_label,
-                    )
-                )
+                relation = Relation()
+                relation.CopyFrom(base_classification_relation)
+                relation.to.value = label
+                self.brain.relations.append(relation)
+
         # Data Augmentation + Processor entities
+        base_entity_relation = Relation(
+            relation=Relation.ENTITY,
+            source=relation_node_document,
+            to=RelationNode(ntype=RelationNode.NodeType.ENTITY),
+        )
         use_legacy_entities = True
         for data_augmentation_task_id, entities in metadata.entities.items():
             # If we recieved the entities from the processor here, we don't want to use the legacy entities
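The refactor above builds one template Relation per kind and clones it per item with CopyFrom. A small sketch of why that is safe (assuming Relation and RelationNode come from nucliadb_protos.utils_pb2, as elsewhere in this module): CopyFrom yields an independent message, so filling in the per-item fields never mutates the shared template.

# Sketch; assumes the protobuf types used by brain.py.
from nucliadb_protos.utils_pb2 import Relation, RelationNode

base = Relation(
    relation=Relation.ABOUT,
    to=RelationNode(ntype=RelationNode.NodeType.LABEL),
)

first = Relation()
first.CopyFrom(base)
first.to.value = "labelset/label-a"

assert base.to.value == ""  # the template is untouched
assert first.to.value == "labelset/label-a"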
@@ -521,38 +528,30 @@
                 labels["e"].add(
                     f"{entity_label}/{entity_text}"
                 )  # Add data_augmentation_task_id as a prefix?
-                relation_node_entity = RelationNode(
-                    value=entity_text,
-                    ntype=RelationNode.NodeType.ENTITY,
-                    subtype=entity_label,
-                )
-                rel = Relation(
-                    relation=Relation.ENTITY,
-                    source=relation_node_document,
-                    to=relation_node_entity,
-                )
-                self.brain.relations.append(rel)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity_text
+                relation.to.subtype = entity_label
+                self.brain.relations.append(relation)

         # Legacy processor entities
         # TODO: Remove once processor doesn't use this anymore and remove the positions and ner fields from the message
+        def _parse_entity(klass_entity: str) -> tuple[str, str]:
+            try:
+                klass, entity = klass_entity.split("/", 1)
+                return klass, entity
+            except ValueError:
+                raise AttributeError(f"Entity should be with type {klass_entity}")
+
         if use_legacy_entities:
-            for klass_entity, _ in metadata.positions.items():
+            for klass_entity in metadata.positions.keys():
                 labels["e"].add(klass_entity)
-                entity_array = klass_entity.split("/")
-                if len(entity_array) == 1:
-                    raise AttributeError(f"Entity should be with type {klass_entity}")
-                elif len(entity_array) > 1:
-                    klass = entity_array[0]
-                    entity = "/".join(entity_array[1:])
-                    relation_node_entity = RelationNode(
-                        value=entity, ntype=RelationNode.NodeType.ENTITY, subtype=klass
-                    )
-                    rel = Relation(
-                        relation=Relation.ENTITY,
-                        source=relation_node_document,
-                        to=relation_node_entity,
-                    )
-                    self.brain.relations.append(rel)
+                klass, entity = _parse_entity(klass_entity)
+                relation = Relation()
+                relation.CopyFrom(base_entity_relation)
+                relation.to.value = entity
+                relation.to.subtype = klass
+                self.brain.relations.append(relation)

     def apply_field_labels(
         self,
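For reference, the split("/", 1) used by the new _parse_entity helper only splits on the first slash, so entity values containing slashes keep their tail intact; a value with no slash fails to unpack into two names, which the helper reports via the AttributeError shown above.

# split("/", 1) behaviour relied on by _parse_entity:
print("PERSON/John Smith".split("/", 1))  # ['PERSON', 'John Smith']
print("DATE/2024/12/01".split("/", 1))    # ['DATE', '2024/12/01']
# "PERSON".split("/", 1) == ['PERSON']  -> unpacking into (klass, entity) raises ValueError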