nucliadb-6.3.4.post3645-py3-none-any.whl → nucliadb-6.3.4.post3663-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
nucliadb/backups/const.py CHANGED
@@ -37,6 +37,8 @@ class StorageKeys:
     RESOURCE = "backups/{backup_id}/resources/{resource_id}.tar"
     ENTITIES = "backups/{backup_id}/entities.pb"
     LABELS = "backups/{backup_id}/labels.pb"
+    SYNONYMS = "backups/{backup_id}/synonyms.pb"
+    SEARCH_CONFIGURATIONS = "backups/{backup_id}/search_configurations.pb"


 class BackupFinishedStream:
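The two new keys follow the same `str.format` templates as the existing ones, so synonyms and search configurations land next to the other knowledge-box-level backup artifacts. Note that `str.format` silently ignores unused keyword arguments, which is why the stray `kbid=kbid` arguments removed in later hunks were harmless but misleading. A minimal sketch of the resulting object keys (the backup id is a made-up example):

```python
class StorageKeys:
    SYNONYMS = "backups/{backup_id}/synonyms.pb"
    SEARCH_CONFIGURATIONS = "backups/{backup_id}/search_configurations.pb"

print(StorageKeys.SYNONYMS.format(backup_id="b1"))
# backups/b1/synonyms.pb
print(StorageKeys.SEARCH_CONFIGURATIONS.format(backup_id="b1"))
# backups/b1/search_configurations.pb
```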
nucliadb/backups/create.py CHANGED
@@ -18,6 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import asyncio
+import json
 import logging
 import tarfile
 from datetime import datetime, timezone
@@ -38,6 +39,8 @@ from nucliadb.export_import.utils import (
     get_cloud_files,
     get_entities,
     get_labels,
+    get_search_configurations,
+    get_synonyms,
 )
 from nucliadb.tasks.retries import TaskRetryHandler
 from nucliadb_protos import backups_pb2, resources_pb2, writer_pb2
@@ -74,6 +77,8 @@ async def backup_kb(context: ApplicationContext, kbid: str, backup_id: str):
     await backup_resources(context, kbid, backup_id)
     await backup_labels(context, kbid, backup_id)
     await backup_entities(context, kbid, backup_id)
+    await backup_synonyms(context, kbid, backup_id)
+    await backup_search_configurations(context, kbid, backup_id)
     await notify_backup_completed(context, kbid, backup_id)
     await delete_metadata(context, kbid, backup_id)

@@ -216,7 +221,7 @@ async def backup_resource_with_binaries(
     await upload_to_bucket(
         context,
         resource_data_iterator(),
-        key=StorageKeys.RESOURCE.format(kbid=kbid, backup_id=backup_id, resource_id=rid),
+        key=StorageKeys.RESOURCE.format(backup_id=backup_id, resource_id=rid),
     )
     return total_size

@@ -225,7 +230,7 @@ async def backup_labels(context: ApplicationContext, kbid: str, backup_id: str):
     labels = await get_labels(context, kbid)
     await context.blob_storage.upload_object(
         bucket=settings.backups_bucket,
-        key=StorageKeys.LABELS.format(kbid=kbid, backup_id=backup_id),
+        key=StorageKeys.LABELS.format(backup_id=backup_id),
         data=labels.SerializeToString(),
     )

@@ -234,11 +239,33 @@ async def backup_entities(context: ApplicationContext, kbid: str, backup_id: str
     entities = await get_entities(context, kbid)
     await context.blob_storage.upload_object(
         bucket=settings.backups_bucket,
-        key=StorageKeys.ENTITIES.format(kbid=kbid, backup_id=backup_id),
+        key=StorageKeys.ENTITIES.format(backup_id=backup_id),
         data=entities.SerializeToString(),
     )


+async def backup_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
+    synonyms = await get_synonyms(context, kbid)
+    await context.blob_storage.upload_object(
+        bucket=settings.backups_bucket,
+        key=StorageKeys.SYNONYMS.format(backup_id=backup_id),
+        data=synonyms.SerializeToString(),
+    )
+
+
+async def backup_search_configurations(context: ApplicationContext, kbid: str, backup_id: str):
+    search_configurations = await get_search_configurations(context, kbid=kbid)
+    serialized_search_configs = {
+        config_id: config.model_dump(mode="python", exclude_unset=True)
+        for config_id, config in search_configurations.items()
+    }
+    await context.blob_storage.upload_object(
+        bucket=settings.backups_bucket,
+        key=StorageKeys.SEARCH_CONFIGURATIONS.format(backup_id=backup_id),
+        data=json.dumps(serialized_search_configs).encode(),
+    )
+
+
 async def get_metadata(
     context: ApplicationContext, kbid: str, backup_id: str
 ) -> Optional[BackupMetadata]:
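Synonyms reuse the protobuf path (`SerializeToString`), while search configurations are pydantic models and are serialized as a single JSON document keyed by configuration id. A sketch of the bytes written to `search_configurations.pb`, assuming two hypothetical configurations (the inner field names are illustrative only, not the real `SearchConfiguration` schema):

```python
import json

# Illustrative output of model_dump(mode="python", exclude_unset=True) per config
serialized_search_configs = {
    "default": {"kind": "find", "top_k": 20},
    "fast": {"kind": "find", "top_k": 5},
}
data = json.dumps(serialized_search_configs).encode()  # payload handed to upload_object()
```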
nucliadb/backups/restore.py CHANGED
@@ -21,9 +21,12 @@
 
 import asyncio
 import functools
+import json
 import logging
 import tarfile
-from typing import AsyncIterator, Callable, Optional, Union
+from typing import Any, AsyncIterator, Callable, Optional, Union
+
+from pydantic import TypeAdapter
 
 from nucliadb.backups.const import MaindbKeys, StorageKeys
 from nucliadb.backups.models import RestoreBackupRequest
@@ -34,8 +37,11 @@ from nucliadb.export_import.utils import (
     restore_broker_message,
     set_entities_groups,
     set_labels,
+    set_search_configurations,
+    set_synonyms,
 )
 from nucliadb.tasks.retries import TaskRetryHandler
+from nucliadb_models.configuration import SearchConfiguration
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
 from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_protos.writer_pb2 import BrokerMessage
@@ -69,6 +75,8 @@ async def restore_kb(context: ApplicationContext, kbid: str, backup_id: str):
     await restore_resources(context, kbid, backup_id)
     await restore_labels(context, kbid, backup_id)
     await restore_entities(context, kbid, backup_id)
+    await restore_synonyms(context, kbid, backup_id)
+    await restore_search_configurations(context, kbid, backup_id)
     await delete_last_restored(context, kbid, backup_id)


@@ -77,7 +85,7 @@ async def restore_resources(context: ApplicationContext, kbid: str, backup_id: s
     tasks = []
     async for object_info in context.blob_storage.iterate_objects(
         bucket=settings.backups_bucket,
-        prefix=StorageKeys.RESOURCES_PREFIX.format(kbid=kbid, backup_id=backup_id),
+        prefix=StorageKeys.RESOURCES_PREFIX.format(backup_id=backup_id),
         start=last_restored,
     ):
         key = object_info.name
@@ -210,7 +218,7 @@ class ResourceBackupReader:
 async def restore_resource(context: ApplicationContext, kbid: str, backup_id: str, resource_id: str):
     download_stream = context.blob_storage.download(
         bucket=settings.backups_bucket,
-        key=StorageKeys.RESOURCE.format(kbid=kbid, backup_id=backup_id, resource_id=resource_id),
+        key=StorageKeys.RESOURCE.format(backup_id=backup_id, resource_id=resource_id),
     )
     reader = ResourceBackupReader(download_stream)
     bm = None
@@ -242,7 +250,7 @@ async def restore_resource(context: ApplicationContext, kbid: str, backup_id: st
 async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str):
     raw = await context.blob_storage.downloadbytes(
         bucket=settings.backups_bucket,
-        key=StorageKeys.LABELS.format(kbid=kbid, backup_id=backup_id),
+        key=StorageKeys.LABELS.format(backup_id=backup_id),
     )
     labels = kb_pb2.Labels()
     labels.ParseFromString(raw.getvalue())
@@ -252,8 +260,31 @@ async def restore_labels(context: ApplicationContext, kbid: str, backup_id: str)
 async def restore_entities(context: ApplicationContext, kbid: str, backup_id: str):
     raw = await context.blob_storage.downloadbytes(
         bucket=settings.backups_bucket,
-        key=StorageKeys.ENTITIES.format(kbid=kbid, backup_id=backup_id),
+        key=StorageKeys.ENTITIES.format(backup_id=backup_id),
     )
     entities = kb_pb2.EntitiesGroups()
     entities.ParseFromString(raw.getvalue())
     await set_entities_groups(context, kbid, entities)
+
+
+async def restore_synonyms(context: ApplicationContext, kbid: str, backup_id: str):
+    raw = await context.blob_storage.downloadbytes(
+        bucket=settings.backups_bucket,
+        key=StorageKeys.SYNONYMS.format(backup_id=backup_id),
+    )
+    synonyms = kb_pb2.Synonyms()
+    synonyms.ParseFromString(raw.getvalue())
+    await set_synonyms(context, kbid, synonyms)
+
+
+async def restore_search_configurations(context: ApplicationContext, kbid: str, backup_id: str):
+    raw = await context.blob_storage.downloadbytes(
+        bucket=settings.backups_bucket,
+        key=StorageKeys.SEARCH_CONFIGURATIONS.format(backup_id=backup_id),
+    )
+    as_dict: dict[str, dict[str, Any]] = json.loads(raw.getvalue())
+    search_configurations: dict[str, SearchConfiguration] = {}
+    for name, data in as_dict.items():
+        config: SearchConfiguration = TypeAdapter(SearchConfiguration).validate_python(data)
+        search_configurations[name] = config
+    await set_search_configurations(context, kbid, search_configurations)
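`TypeAdapter` is used on restore because `SearchConfiguration` is a plain type annotation rather than a `BaseModel` subclass, and `TypeAdapter(...).validate_python(...)` is pydantic v2's way to validate a dict into such a type (for example a discriminated union). A self-contained sketch of the pattern with stand-in models (`FindConfig`/`AskConfig` are hypothetical):

```python
from typing import Annotated, Literal, Union

from pydantic import BaseModel, Field, TypeAdapter

class FindConfig(BaseModel):
    kind: Literal["find"]
    top_k: int = 20

class AskConfig(BaseModel):
    kind: Literal["ask"]
    model: str = "default"

# Stand-in for the SearchConfiguration annotation in nucliadb_models
Config = Annotated[Union[FindConfig, AskConfig], Field(discriminator="kind")]

config = TypeAdapter(Config).validate_python({"kind": "find", "top_k": 5})
assert isinstance(config, FindConfig)
```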
nucliadb/export_import/utils.py CHANGED
@@ -34,6 +34,7 @@ from nucliadb.export_import.exceptions import (
 )
 from nucliadb.export_import.models import ExportedItemType, ExportItem, Metadata
 from nucliadb.ingest.orm.broker_message import generate_broker_message
+from nucliadb_models.configuration import SearchConfiguration
 from nucliadb_models.export_import import Status
 from nucliadb_protos import knowledgebox_pb2 as kb_pb2
 from nucliadb_protos import resources_pb2, writer_pb2
@@ -45,23 +46,61 @@ BinaryStream = AsyncIterator[bytes]
 BinaryStreamGenerator = Callable[[int], BinaryStream]


-# Broker message fields that are populated by the processing pipeline
-PROCESSING_BM_FIELDS = [
-    "link_extracted_data",
-    "file_extracted_data",
-    "extracted_text",
-    "field_metadata",
-    "field_vectors",
-    "field_large_metadata",
-]
-
-# Broker message fields that are populated by the nucliadb writer component
-WRITER_BM_FIELDS = [
-    "links",
-    "files",
-    "texts",
-    "conversations",
-]
+# Map that indicates which fields are written by the writer
+# and which are written by the processor.
+BM_FIELDS = {
+    "common": [
+        "kbid",
+        "uuid",
+        "type",
+        "source",
+        "reindex",
+    ],
+    "writer": [
+        "slug",
+        "basic",
+        "origin",
+        "user_relations",
+        "conversations",
+        "texts",
+        "links",
+        "files",
+        "extra",
+        "security",
+    ],
+    "processor": [
+        "link_extracted_data",
+        "file_extracted_data",
+        "extracted_text",
+        "field_metadata",
+        "field_vectors",
+        "field_large_metadata",
+        "question_answers",
+        "relations",
+        "field_statuses",
+    ],
+    # These fields are mostly used for internal purposes and they are not part of
+    # the representation of the exported resource as broker message.
+    "ignored": [
+        "audit",
+        "multiid",
+        "origin_seq",
+        "slow_processing_time",
+        "pre_processing_time",
+        "done_time",
+        "processing_id",
+        "account_seq",
+        "delete_fields",
+        "delete_question_answers",
+        "errors",
+        "generated_by",
+    ],
+    # No longer used fields
+    "deprecated": [
+        "txseqid",
+        "user_vectors",
+    ],
+}


 async def import_broker_message(
@@ -125,7 +164,7 @@ async def transaction_commit(
 def get_writer_bm(bm: writer_pb2.BrokerMessage) -> writer_pb2.BrokerMessage:
     wbm = writer_pb2.BrokerMessage()
     wbm.CopyFrom(bm)
-    for field in PROCESSING_BM_FIELDS:
+    for field in BM_FIELDS["processor"]:
         wbm.ClearField(field)  # type: ignore
     wbm.type = writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT
     wbm.source = writer_pb2.BrokerMessage.MessageSource.WRITER
@@ -135,7 +174,7 @@ def get_writer_bm(bm: writer_pb2.BrokerMessage) -> writer_pb2.BrokerMessage:
 def get_processor_bm(bm: writer_pb2.BrokerMessage) -> writer_pb2.BrokerMessage:
     pbm = writer_pb2.BrokerMessage()
     pbm.CopyFrom(bm)
-    for field in WRITER_BM_FIELDS:
+    for field in BM_FIELDS["writer"]:
         pbm.ClearField(field)  # type: ignore
     pbm.type = writer_pb2.BrokerMessage.MessageType.AUTOCOMMIT
     pbm.source = writer_pb2.BrokerMessage.MessageSource.PROCESSOR
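With the richer `BM_FIELDS` map, the import path still replays each exported resource as two broker messages: a writer-sourced copy with the processor-owned fields cleared, and a processor-sourced copy with the writer-owned fields cleared. A quick sketch of the split (field names as in `nucliadb_protos.writer_pb2`):

```python
from nucliadb_protos import writer_pb2

bm = writer_pb2.BrokerMessage()
bm.texts["text1"].body = "hello"               # writer-owned field
bm.extracted_text.add().field.field = "text1"  # processor-owned field

wbm = get_writer_bm(bm)     # clears every field in BM_FIELDS["processor"]
pbm = get_processor_bm(bm)  # clears every field in BM_FIELDS["writer"]
assert "text1" in wbm.texts and len(wbm.extracted_text) == 0
assert len(pbm.texts) == 0 and len(pbm.extracted_text) == 1
```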
@@ -173,6 +212,21 @@ async def set_entities_groups(
         await txn.commit()


+async def set_synonyms(context: ApplicationContext, kbid: str, synonyms: kb_pb2.Synonyms) -> None:
+    async with datamanagers.with_transaction() as txn:
+        await datamanagers.synonyms.set(txn, kbid=kbid, synonyms=synonyms)
+        await txn.commit()
+
+
+async def set_search_configurations(
+    context: ApplicationContext, kbid: str, search_configurations: dict[str, SearchConfiguration]
+) -> None:
+    async with datamanagers.with_transaction() as txn:
+        for name, config in search_configurations.items():
+            await datamanagers.search_configurations.set(txn, kbid=kbid, name=name, config=config)
+        await txn.commit()
+
+
 async def set_labels(context: ApplicationContext, kbid: str, labels: kb_pb2.Labels) -> None:
     async with datamanagers.with_transaction() as txn:
         await datamanagers.labels.set_labels(txn, kbid=kbid, labels=labels)
@@ -273,6 +327,18 @@ async def get_labels(context: ApplicationContext, kbid: str) -> kb_pb2.Labels:
         return await datamanagers.labels.get_labels(txn, kbid=kbid)


+async def get_synonyms(context: ApplicationContext, kbid: str) -> kb_pb2.Synonyms:
+    async with datamanagers.with_ro_transaction() as txn:
+        return await datamanagers.synonyms.get(txn, kbid=kbid) or kb_pb2.Synonyms()
+
+
+async def get_search_configurations(
+    context: ApplicationContext, kbid: str
+) -> dict[str, SearchConfiguration]:
+    async with datamanagers.with_ro_transaction() as txn:
+        return await datamanagers.search_configurations.list(txn, kbid=kbid)
+
+
 class EndOfStream(Exception): ...


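The new getters pair with the setters above through the standard protobuf byte round trip, which is what `backup_synonyms()`/`restore_synonyms()` rely on. A sketch (message fields as defined in `knowledgebox.proto`; treat the exact field names as an assumption):

```python
from nucliadb_protos import knowledgebox_pb2 as kb_pb2

synonyms = kb_pb2.Synonyms()
synonyms.terms["car"].synonyms.extend(["auto", "vehicle"])

data = synonyms.SerializeToString()  # bytes uploaded to the backups bucket
restored = kb_pb2.Synonyms()
restored.ParseFromString(data)       # what restore_synonyms() does with the download
assert restored == synonyms
```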
nucliadb/ingest/orm/broker_message.py CHANGED
@@ -26,6 +26,7 @@ from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.fields.file import File
 from nucliadb.ingest.fields.link import Link
 from nucliadb.ingest.orm.resource import Resource
+from nucliadb_protos import writer_pb2
 from nucliadb_protos.knowledgebox_pb2 import VectorSetConfig
 from nucliadb_protos.resources_pb2 import (
     ExtractedTextWrapper,
@@ -34,7 +35,7 @@ from nucliadb_protos.resources_pb2 import (
     FieldType,
     LargeComputedMetadataWrapper,
 )
-from nucliadb_protos.writer_pb2 import BrokerMessage
+from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus


 async def generate_broker_message(resource: Resource) -> BrokerMessage:
@@ -102,8 +103,25 @@ class _BrokerMessageBuilder:
         # Large metadata
         await self.generate_field_large_computed_metadata(type_id, field_id, field)

+        # Field status
+        await self.generate_field_status(type_id, field_id, field)
+
         return self.bm

+    async def generate_field_status(
+        self,
+        type_id: FieldType.ValueType,
+        field_id: str,
+        field: Field,
+    ):
+        fid = writer_pb2.FieldID(field_type=type_id, field=field_id)
+        status = await field.get_status()
+        if status is not None:
+            field_id_status = FieldIDStatus()
+            field_id_status.id.CopyFrom(fid)
+            field_id_status.status = status.status
+            self.bm.field_statuses.append(field_id_status)
+
     async def generate_field(
         self,
         type_id: FieldType.ValueType,
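Generated broker messages now carry one `FieldIDStatus` per field, so exports and backups preserve each field's processing state instead of resetting it. Reading the statuses back is the straightforward proto iteration (a sketch):

```python
from nucliadb_protos import writer_pb2

bm = writer_pb2.BrokerMessage()  # assume generate_broker_message() populated it
for fid_status in bm.field_statuses:
    status_name = writer_pb2.FieldStatus.Status.Name(fid_status.status)
    print(fid_status.id.field, status_name)
```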
nucliadb/ingest/processing.py CHANGED
@@ -19,6 +19,7 @@
 #
 import base64
 import datetime
+import json
 import logging
 import uuid
 from collections import defaultdict
@@ -32,6 +33,7 @@ import jwt
 from pydantic import BaseModel, Field

 import nucliadb_models as models
+from nucliadb_models.labels import ClassificationLabel
 from nucliadb_models.resource import QueueType
 from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
@@ -93,7 +95,10 @@ class PushPayload(BaseModel):
     genericfield: dict[str, models.Text] = {}

     # New File
-    filefield: dict[str, str] = {}
+    filefield: dict[str, str] = Field(
+        default={},
+        description="Map of each file field to the jwt token computed in ProcessingEngine methods",
+    )

     # New Link
     linkfield: dict[str, models.LinkUpload] = {}
@@ -238,7 +243,9 @@ class ProcessingEngine:
         }
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

-    def generate_file_token_from_fieldfile(self, file: FieldFilePB) -> str:
+    def generate_file_token_from_fieldfile(
+        self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
         now = datetime.datetime.now(tz=datetime.timezone.utc)
@@ -263,6 +270,8 @@ class ProcessingEngine:
             "language": file.language,
             "extract_strategy": file.extract_strategy,
         }
+        if classif_labels:
+            payload["classification_labels"] = self.encode_classif_labels(classif_labels)
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

     @backoff.on_exception(
@@ -272,7 +281,9 @@ class ProcessingEngine:
         max_tries=MAX_TRIES,
     )
     @processing_observer.wrap({"type": "file_field_upload"})
-    async def convert_filefield_to_str(self, file: models.FileField) -> str:
+    async def convert_filefield_to_str(
+        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         # Upload file without storing on Nuclia DB
         headers = {}
         headers["X-PASSWORD"] = file.password
@@ -281,6 +292,8 @@ class ProcessingEngine:
             headers["X-MD5"] = file.file.md5
         if file.extract_strategy is not None:
             headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
+        if classif_labels:
+            headers["X-CLASSIFICATION-LABELS"] = self.encode_classif_labels(classif_labels)
         headers["CONTENT_TYPE"] = file.file.content_type
         headers["CONTENT-LENGTH"] = str(len(file.file.payload))  # type: ignore
         headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
@@ -299,7 +312,14 @@ class ProcessingEngine:
             text = await resp.text()
             raise Exception(f"STATUS: {resp.status} - {text}")

-    def convert_external_filefield_to_str(self, file_field: models.FileField) -> str:
+    def encode_classif_labels(self, classif_labels: list[ClassificationLabel]) -> str:
+        return base64.b64encode(
+            json.dumps([label.model_dump(mode="python") for label in classif_labels]).encode()
+        ).decode()
+
+    def convert_external_filefield_to_str(
+        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")

@@ -322,6 +342,8 @@ class ProcessingEngine:
             "password": file_field.password,
             "extract_strategy": file_field.extract_strategy,
         }
+        if classif_labels:
+            payload["classification_labels"] = self.encode_classif_labels(classif_labels)
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")

     @backoff.on_exception(
@@ -331,11 +353,16 @@ class ProcessingEngine:
         max_tries=MAX_TRIES,
     )
     @processing_observer.wrap({"type": "file_field_upload_internal"})
-    async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
+    async def convert_internal_filefield_to_str(
+        self,
+        file: FieldFilePB,
+        storage: Storage,
+        classif_labels: Optional[list[ClassificationLabel]] = None,
+    ) -> str:
         """It's already an internal file that needs to be uploaded"""
         if self.onprem is False:
             # Upload the file to processing upload
-            jwttoken = self.generate_file_token_from_fieldfile(file)
+            jwttoken = self.generate_file_token_from_fieldfile(file, classif_labels)
         else:
             headers = {}
             headers["X-PASSWORD"] = file.password
@@ -347,6 +374,8 @@ class ProcessingEngine:
             headers["CONTENT-LENGTH"] = str(file.file.size)
             if file.extract_strategy != "":
                 headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
+            if classif_labels:
+                headers["X-CLASSIFICATION-LABELS"] = self.encode_classif_labels(classif_labels)
             headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"

             iterator = storage.downloadbytescf_iterator(file.file)
@@ -488,22 +517,31 @@ class DummyProcessingEngine(ProcessingEngine):
     async def finalize(self):
         pass

-    async def convert_filefield_to_str(self, file: models.FileField) -> str:
+    async def convert_filefield_to_str(
+        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         self.calls.append([file])
         index = len(self.values["convert_filefield_to_str"])
-        self.values["convert_filefield_to_str"].append(file)
+        self.values["convert_filefield_to_str"].append((file, classif_labels))
         return f"convert_filefield_to_str,{index}"

-    def convert_external_filefield_to_str(self, file_field: models.FileField) -> str:
+    def convert_external_filefield_to_str(
+        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         self.calls.append([file_field])
         index = len(self.values["convert_external_filefield_to_str"])
-        self.values["convert_external_filefield_to_str"].append(file_field)
+        self.values["convert_external_filefield_to_str"].append((file_field, classif_labels))
         return f"convert_external_filefield_to_str,{index}"

-    async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
+    async def convert_internal_filefield_to_str(
+        self,
+        file: FieldFilePB,
+        storage: Storage,
+        classif_labels: Optional[list[ClassificationLabel]] = None,
+    ) -> str:
         self.calls.append([file, storage])
         index = len(self.values["convert_internal_filefield_to_str"])
-        self.values["convert_internal_filefield_to_str"].append([file, storage])
+        self.values["convert_internal_filefield_to_str"].append((file, storage, classif_labels))
         return f"convert_internal_filefield_to_str,{index}"

     async def convert_internal_cf_to_str(self, cf: CloudFile, storage: Storage) -> str:
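`encode_classif_labels()` packs the labels as base64-encoded JSON so they can ride along either as a JWT claim (`classification_labels`) or as the `X-CLASSIFICATION-LABELS` header. A receiving side would reverse it like this (a sketch; the decoder shown here is hypothetical, not part of this package):

```python
import base64
import json

def decode_classif_labels(encoded: str) -> list[dict]:
    """Inverse of ProcessingEngine.encode_classif_labels (illustration only)."""
    return json.loads(base64.b64decode(encoded.encode()).decode())

# e.g. decode_classif_labels(token) -> [{"labelset": "topics", "label": "sports"}]
```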
nucliadb/writer/api/v1/field.py CHANGED
@@ -43,7 +43,10 @@ from nucliadb.writer.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREF
 from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.field import (
+    ResourceClassifications,
+    atomic_get_stored_resource_classifications,
     extract_file_field,
+    get_stored_resource_classifications,
     parse_conversation_field,
     parse_file_field,
     parse_link_field,
@@ -114,11 +117,31 @@ async def add_field_to_resource(
 
     parse_audit(writer.audit, request)
 
+    resource_classifications = await atomic_get_stored_resource_classifications(kbid=kbid, rid=rid)
+
     parse_field = FIELD_PARSERS_MAP[type(field_payload)]
     if iscoroutinefunction(parse_field):
-        await parse_field(kbid, rid, field_id, field_payload, writer, toprocess, **parser_kwargs)
+        await parse_field(
+            kbid,
+            rid,
+            field_id,
+            field_payload,
+            writer,
+            toprocess,
+            resource_classifications,
+            **parser_kwargs,
+        )
     else:
-        parse_field(kbid, rid, field_id, field_payload, writer, toprocess, **parser_kwargs)
+        parse_field(
+            kbid,
+            rid,
+            field_id,
+            field_payload,
+            writer,
+            toprocess,
+            resource_classifications,
+            **parser_kwargs,
+        )
 
     processing = get_processing()
     await transaction.commit(writer, partition)
@@ -200,8 +223,9 @@ def parse_text_field_adapter(
     field_payload: models.TextField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ):
-    return parse_text_field(field_id, field_payload, writer, toprocess)
+    return parse_text_field(field_id, field_payload, writer, toprocess, resource_classifications)


 def parse_link_field_adapter(
@@ -211,8 +235,9 @@ def parse_link_field_adapter(
     field_payload: models.LinkField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ):
-    return parse_link_field(field_id, field_payload, writer, toprocess)
+    return parse_link_field(field_id, field_payload, writer, toprocess, resource_classifications)


 async def parse_conversation_field_adapter(
@@ -222,8 +247,11 @@ async def parse_conversation_field_adapter(
     field_payload: models.InputConversationField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ):
-    return await parse_conversation_field(field_id, field_payload, writer, toprocess, kbid, rid)
+    return await parse_conversation_field(
+        field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
+    )


 async def parse_file_field_adapter(
@@ -233,14 +261,22 @@ async def parse_file_field_adapter(
     field_payload: models.FileField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
     skip_store: bool,
 ):
     return await parse_file_field(
-        field_id, field_payload, writer, toprocess, kbid, rid, skip_store=skip_store
+        field_id,
+        field_payload,
+        writer,
+        toprocess,
+        kbid,
+        rid,
+        resource_classifications,
+        skip_store=skip_store,
     )


-FIELD_PARSERS_MAP: dict[Type, Union[Callable]] = {
+FIELD_PARSERS_MAP: dict[Type, Callable] = {
     models.TextField: parse_text_field_adapter,
     models.LinkField: parse_link_field_adapter,
     models.InputConversationField: parse_conversation_field_adapter,
@@ -537,12 +573,15 @@ async def reprocess_file_field(
     if resource.basic is not None:
         toprocess.title = resource.basic.title

+    rclassif = await get_stored_resource_classifications(txn, kbid=kbid, rid=rid)
+
     try:
         await extract_file_field(
             field_id,
             resource=resource,
             toprocess=toprocess,
             password=x_file_password,
+            resource_classifications=rclassif,
         )
     except KeyError:
         raise HTTPException(status_code=404, detail="Field does not exist")
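`FIELD_PARSERS_MAP` dispatches on the exact payload type, and `add_field_to_resource` awaits only the parsers that are coroutine functions (the file and conversation adapters). The pattern in isolation, with hypothetical parsers:

```python
from inspect import iscoroutinefunction

def parse_text(payload: str) -> None: ...
async def parse_file(payload: bytes) -> None: ...

PARSERS = {str: parse_text, bytes: parse_file}

async def dispatch(payload):
    parser = PARSERS[type(payload)]
    if iscoroutinefunction(parser):
        await parser(payload)
    else:
        parser(payload)
```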
nucliadb/writer/api/v1/resource.py CHANGED
@@ -50,10 +50,16 @@ from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.basic import (
     parse_basic_creation,
     parse_basic_modify,
+    parse_user_classifications,
     set_status,
     set_status_modify,
 )
-from nucliadb.writer.resource.field import extract_fields, parse_fields
+from nucliadb.writer.resource.field import (
+    ResourceClassifications,
+    atomic_get_stored_resource_classifications,
+    extract_fields,
+    parse_fields,
+)
 from nucliadb.writer.resource.origin import parse_extra, parse_origin
 from nucliadb.writer.utilities import get_processing
 from nucliadb_models.resource import NucliaDBRoles
@@ -139,6 +145,11 @@ async def create_resource(
     if item.extra is not None:
         parse_extra(writer.extra, item.extra)

+    # Since this is a resource creation, we need to care only about the user-provided
+    # classifications in the request.
+    resource_classifications = ResourceClassifications(
+        resource_level=set(parse_user_classifications(item))
+    )
     await parse_fields(
         writer=writer,
         item=item,
@@ -146,6 +157,7 @@ async def create_resource(
         kbid=kbid,
         uuid=uuid,
         x_skip_store=x_skip_store,
+        resource_classifications=resource_classifications,
     )

     set_status(writer.basic, item)
@@ -296,6 +308,15 @@ async def modify_resource(
     if item.extra is not None:
         parse_extra(writer.extra, item.extra)

+    if item.usermetadata is not None:
+        # If usermetadata is set in the request payload, this means that stored resource classifications
+        # are not valid and we need to use the ones provided by the user in the request
+        resource_classifications = ResourceClassifications(
+            resource_level=set(parse_user_classifications(item))
+        )
+    else:
+        resource_classifications = await atomic_get_stored_resource_classifications(kbid, rid)
+
     await parse_fields(
         writer=writer,
         item=item,
@@ -303,6 +324,7 @@ async def modify_resource(
         kbid=kbid,
         uuid=rid,
         x_skip_store=x_skip_store,
+        resource_classifications=resource_classifications,
     )
     set_status_modify(writer.basic, item)

nucliadb/writer/api/v1/upload.py CHANGED
@@ -46,7 +46,10 @@ from nucliadb.writer.api.v1.slug import ensure_slug_uniqueness, noop_context_man
 from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.basic import parse_basic_creation
-from nucliadb.writer.resource.field import parse_fields
+from nucliadb.writer.resource.field import (
+    atomic_get_stored_resource_classifications,
+    parse_fields,
+)
 from nucliadb.writer.resource.origin import parse_extra, parse_origin
 from nucliadb.writer.tus import TUSUPLOAD, UPLOAD, get_dm, get_storage_manager
 from nucliadb.writer.tus.exceptions import (
@@ -64,6 +67,7 @@ from nucliadb_models import content_types
 from nucliadb_models.resource import NucliaDBRoles
 from nucliadb_models.utils import FieldIdString
 from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
+from nucliadb_protos import resources_pb2
 from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FieldID, FieldType, Metadata
 from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus
 from nucliadb_utils.authentication import requires_one
@@ -864,7 +868,6 @@ async def store_file_on_nuclia_db(
     partitioning = get_partitioning()
     processing = get_processing()
     storage = await get_storage(service_name=SERVICE_NAME)
-
     partition = partitioning.generate_partition(kbid, rid)

     writer = BrokerMessage()
@@ -901,6 +904,7 @@ async def store_file_on_nuclia_db(
 
         toprocess.title = writer.basic.title
 
+        resource_classifications = await atomic_get_stored_resource_classifications(kbid, rid)
         await parse_fields(
             writer=writer,
             item=item,
@@ -908,6 +912,7 @@ async def store_file_on_nuclia_db(
             kbid=kbid,
             uuid=rid,
             x_skip_store=False,
+            resource_classifications=resource_classifications,
         )
     else:
         # Use defaults for everything, but don't forget hidden which depends on KB config
@@ -953,8 +958,10 @@ async def store_file_on_nuclia_db(
         )
     )

+    rclassif = await atomic_get_stored_resource_classifications(kbid, rid)
+    classif_labels = rclassif.for_field(field, resources_pb2.FieldType.FILE)
     toprocess.filefield[field] = await processing.convert_internal_filefield_to_str(
-        file_field, storage=storage
+        file_field, storage=storage, classif_labels=classif_labels
     )

     writer.source = BrokerMessage.MessageSource.WRITER
nucliadb/writer/resource/basic.py CHANGED
@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 from datetime import datetime
-from typing import Optional
+from typing import Optional, Union

 from fastapi import HTTPException

@@ -31,6 +31,7 @@ from nucliadb.ingest.orm.utils import set_title
 from nucliadb.ingest.processing import PushPayload
 from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_models.file import FileField
+from nucliadb_models.labels import ClassificationLabel
 from nucliadb_models.link import LinkField
 from nucliadb_models.metadata import (
     ParagraphAnnotation,
@@ -290,3 +291,20 @@ def build_question_answer_annotation_pb(
         answer.ids_paragraphs.extend(answer_annotation.ids_paragraphs)
         pb.question_answer.answers.append(answer)
     return pb
+
+
+def parse_user_classifications(
+    item: Union[CreateResourcePayload, UpdateResourcePayload],
+) -> list[ClassificationLabel]:
+    return (
+        [
+            ClassificationLabel(
+                labelset=classification.labelset,
+                label=classification.label,
+            )
+            for classification in item.usermetadata.classifications
+            if classification.cancelled_by_user is False
+        ]
+        if item.usermetadata is not None
+        else []
+    )
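Classifications that the user has cancelled are dropped before labels propagate to fields. A quick behavioural sketch using stand-in objects that only carry the attributes the function reads (illustration, not the real payload models):

```python
from types import SimpleNamespace

item = SimpleNamespace(
    usermetadata=SimpleNamespace(
        classifications=[
            SimpleNamespace(labelset="topics", label="sports", cancelled_by_user=False),
            SimpleNamespace(labelset="topics", label="politics", cancelled_by_user=True),
        ]
    )
)
labels = parse_user_classifications(item)  # type: ignore[arg-type]
assert [(lbl.labelset, lbl.label) for lbl in labels] == [("topics", "sports")]
```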
nucliadb/writer/resource/field.py CHANGED
@@ -17,12 +17,15 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+import dataclasses
 from datetime import datetime
 from typing import Optional, Union

 from google.protobuf.json_format import MessageToDict

 import nucliadb_models as models
+from nucliadb.common import datamanagers
+from nucliadb.common.maindb.driver import Transaction
 from nucliadb.common.models_utils import from_proto, to_proto
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.orm.resource import Resource as ORMResource
@@ -32,6 +35,7 @@ from nucliadb.writer.utilities import get_processing
 from nucliadb_models.common import FieldTypeName
 from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_models.conversation import PushConversation
+from nucliadb_models.labels import ClassificationLabel
 from nucliadb_models.writer import (
     CreateResourcePayload,
     UpdateResourcePayload,
@@ -42,9 +46,29 @@ from nucliadb_utils.storages.storage import StorageField
 from nucliadb_utils.utilities import get_storage


-async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
-    processing = get_processing()
+@dataclasses.dataclass
+class ResourceClassifications:
+    resource_level: set[ClassificationLabel] = dataclasses.field(default_factory=set)
+    field_level: dict[tuple[resources_pb2.FieldType.ValueType, str], set[ClassificationLabel]] = (
+        dataclasses.field(default_factory=dict)
+    )

+    def for_field(
+        self, field_key: str, field_type: resources_pb2.FieldType.ValueType
+    ) -> list[ClassificationLabel]:
+        """
+        Returns a list of unique classification labels for a given field, including those inherited from the resource.
+        """
+        field_id = (field_type, field_key)
+        resource_level = self.resource_level
+        field_level = self.field_level.get(field_id, set())
+        return list(resource_level.union(field_level))
+
+
+async def extract_file_field_from_pb(
+    field_pb: resources_pb2.FieldFile, classif_labels: list[ClassificationLabel]
+) -> str:
+    processing = get_processing()
     if field_pb.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
         file_field = models.FileField(
             language=field_pb.language,
@@ -52,16 +76,17 @@ async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
             file=models.File(payload=None, uri=field_pb.file.uri),
             extract_strategy=field_pb.extract_strategy,
         )
-        return processing.convert_external_filefield_to_str(file_field)
+        return processing.convert_external_filefield_to_str(file_field, classif_labels)
     else:
         storage = await get_storage(service_name=SERVICE_NAME)
-        return await processing.convert_internal_filefield_to_str(field_pb, storage)
+        return await processing.convert_internal_filefield_to_str(field_pb, storage, classif_labels)


 async def extract_file_field(
     field_id: str,
     resource: ORMResource,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
     password: Optional[str] = None,
 ):
     field_type = resources_pb2.FieldType.FILE
@@ -73,13 +98,19 @@ async def extract_file_field(
     if password is not None:
         field_pb.password = password

-    toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
+    classif_labels = resource_classifications.for_field(field_id, resources_pb2.FieldType.FILE)
+    toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb, classif_labels)


 async def extract_fields(resource: ORMResource, toprocess: PushPayload):
     processing = get_processing()
     storage = await get_storage(service_name=SERVICE_NAME)
     await resource.get_fields()
+
+    resource_classifications = await atomic_get_stored_resource_classifications(
+        kbid=toprocess.kbid,
+        rid=toprocess.uuid,
+    )
     for (field_type, field_id), field in resource.fields.items():
         field_type_name = from_proto.field_type_name(field_type)

@@ -92,9 +123,9 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
             continue

         field_pb = await field.get_value()
-
+        classif_labels = resource_classifications.for_field(field_id, field_type)
         if field_type_name is FieldTypeName.FILE:
-            toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
+            toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb, classif_labels)

         if field_type_name is FieldTypeName.LINK:
             parsed_link = MessageToDict(
@@ -104,6 +135,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
             )
             parsed_link["link"] = parsed_link.pop("uri", None)
             toprocess.linkfield[field_id] = models.LinkUpload(**parsed_link)
+            toprocess.linkfield[field_id].classification_labels = classif_labels

         if field_type_name is FieldTypeName.TEXT:
             parsed_text = MessageToDict(
@@ -113,6 +145,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
             )
             parsed_text["format"] = models.PushTextFormat[parsed_text["format"]]
             toprocess.textfield[field_id] = models.Text(**parsed_text)
+            toprocess.textfield[field_id].classification_labels = classif_labels

         if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
             metadata = await field.get_metadata()
@@ -143,6 +176,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
                 )
                 full_conversation.messages.append(models.PushMessage(**parsed_message))
             toprocess.conversationfield[field_id] = full_conversation
+            toprocess.conversationfield[field_id].classification_labels = classif_labels


 async def parse_fields(
@@ -152,18 +186,48 @@ async def parse_fields(
     kbid: str,
     uuid: str,
     x_skip_store: bool,
+    resource_classifications: ResourceClassifications,
 ):
     for key, file_field in item.files.items():
-        await parse_file_field(key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store)
+        await parse_file_field(
+            key,
+            file_field,
+            writer,
+            toprocess,
+            kbid,
+            uuid,
+            resource_classifications,
+            skip_store=x_skip_store,
+        )

     for key, link_field in item.links.items():
-        parse_link_field(key, link_field, writer, toprocess)
+        parse_link_field(
+            key,
+            link_field,
+            writer,
+            toprocess,
+            resource_classifications,
+        )

     for key, text_field in item.texts.items():
-        parse_text_field(key, text_field, writer, toprocess)
+        parse_text_field(
+            key,
+            text_field,
+            writer,
+            toprocess,
+            resource_classifications,
+        )

     for key, conversation_field in item.conversations.items():
-        await parse_conversation_field(key, conversation_field, writer, toprocess, kbid, uuid)
+        await parse_conversation_field(
+            key,
+            conversation_field,
+            writer,
+            toprocess,
+            kbid,
+            uuid,
+            resource_classifications,
+        )


 def parse_text_field(
@@ -171,7 +235,9 @@ def parse_text_field(
     text_field: models.TextField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.TEXT)
     if text_field.extract_strategy is not None:
         writer.texts[key].extract_strategy = text_field.extract_strategy
     writer.texts[key].body = text_field.body
@@ -185,6 +251,7 @@ def parse_text_field(
         body=text_field.body,
         format=getattr(models.PushTextFormat, text_field.format.value),
         extract_strategy=text_field.extract_strategy,
+        classification_labels=classif_labels,
     )
     writer.field_statuses.append(
         FieldIDStatus(
@@ -201,13 +268,21 @@ async def parse_file_field(
     toprocess: PushPayload,
     kbid: str,
     uuid: str,
+    resource_classifications: ResourceClassifications,
     skip_store: bool = False,
 ):
     if file_field.file.is_external:
-        parse_external_file_field(key, file_field, writer, toprocess)
+        parse_external_file_field(key, file_field, writer, toprocess, resource_classifications)
     else:
         await parse_internal_file_field(
-            key, file_field, writer, toprocess, kbid, uuid, skip_store=skip_store
+            key,
+            file_field,
+            writer,
+            toprocess,
+            kbid,
+            uuid,
+            resource_classifications,
+            skip_store=skip_store,
         )

     writer.field_statuses.append(
@@ -225,8 +300,10 @@ async def parse_internal_file_field(
     toprocess: PushPayload,
     kbid: str,
     uuid: str,
+    resource_classifications: ResourceClassifications,
     skip_store: bool = False,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.FILE)
     writer.files[key].added.FromDatetime(datetime.now())
     if file_field.language:
         writer.files[key].language = file_field.language
@@ -234,10 +311,9 @@ async def parse_internal_file_field(
         writer.files[key].extract_strategy = file_field.extract_strategy

     processing = get_processing()
-
     if skip_store:
         # Does not store file on nuclia's blob storage. Only sends it to process
-        toprocess.filefield[key] = await processing.convert_filefield_to_str(file_field)
+        toprocess.filefield[key] = await processing.convert_filefield_to_str(file_field, classif_labels)

     else:
         # Store file on nuclia's blob storage
@@ -254,7 +330,7 @@ async def parse_internal_file_field(
         )
         # Send the pointer of the new blob to processing
         toprocess.filefield[key] = await processing.convert_internal_filefield_to_str(
-            writer.files[key], storage
+            writer.files[key], storage, classif_labels
         )


@@ -263,7 +339,9 @@ def parse_external_file_field(
     file_field: models.FileField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.FILE)
     writer.files[key].added.FromDatetime(datetime.now())
     if file_field.language:
         writer.files[key].language = file_field.language
@@ -276,9 +354,8 @@ def parse_external_file_field(
         writer.files[key].file.content_type = file_field.file.content_type
     if file_field.file.content_type and writer.basic.icon == GENERIC_MIME_TYPE:
         writer.basic.icon = file_field.file.content_type
-
     processing = get_processing()
-    toprocess.filefield[key] = processing.convert_external_filefield_to_str(file_field)
+    toprocess.filefield[key] = processing.convert_external_filefield_to_str(file_field, classif_labels)


 def parse_link_field(
@@ -286,7 +363,9 @@ def parse_link_field(
     link_field: models.LinkField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.LINK)
     writer.links[key].added.FromDatetime(datetime.now())

     writer.links[key].uri = link_field.uri
@@ -322,6 +401,7 @@ def parse_link_field(
         css_selector=link_field.css_selector,
         xpath=link_field.xpath,
         extract_strategy=link_field.extract_strategy,
+        classification_labels=classif_labels,
     )
     writer.field_statuses.append(
         FieldIDStatus(
@@ -338,7 +418,9 @@ async def parse_conversation_field(
     toprocess: PushPayload,
     kbid: str,
     uuid: str,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.CONVERSATION)
     storage = await get_storage(service_name=SERVICE_NAME)
     processing = get_processing()
     field_value = resources_pb2.Conversation()
@@ -401,7 +483,7 @@ async def parse_conversation_field(
             processing_message.to.append(to)
         convs.messages.append(processing_message)
         field_value.messages.append(cm)
-
+    convs.classification_labels = classif_labels
     toprocess.conversationfield[key] = convs
     writer.conversations[key].CopyFrom(field_value)
     writer.field_statuses.append(
@@ -410,3 +492,37 @@ async def parse_conversation_field(
             status=FieldStatus.Status.PENDING,
         )
     )
+
+
+async def atomic_get_stored_resource_classifications(
+    kbid: str,
+    rid: str,
+) -> ResourceClassifications:
+    async with datamanagers.with_ro_transaction() as txn:
+        return await get_stored_resource_classifications(txn, kbid=kbid, rid=rid)
+
+
+async def get_stored_resource_classifications(
+    txn: Transaction,
+    *,
+    kbid: str,
+    rid: str,
+) -> ResourceClassifications:
+    rc = ResourceClassifications()
+    basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
+    if basic is None:
+        # Resource not found
+        return rc
+
+    # User resource-level classifications
+    for u_classif in basic.usermetadata.classifications:
+        classif = ClassificationLabel(labelset=u_classif.labelset, label=u_classif.label)
+        rc.resource_level.add(classif)
+
+    # Processor-computed field-level classifications. These are not user-defined and are immutable.
+    for field_classif in basic.computedmetadata.field_classifications:
+        fid = (field_classif.field.field_type, field_classif.field.field)
+        for f_classif in field_classif.classifications:
+            classif = ClassificationLabel(labelset=f_classif.labelset, label=f_classif.label)
+            rc.field_level.setdefault(fid, set()).add(classif)
+    return rc
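`ResourceClassifications` keeps user-set resource-level labels separate from processor-computed field-level ones, and `for_field()` returns their union for a given field, so every field inherits the resource labels. A quick sketch:

```python
from nucliadb_models.labels import ClassificationLabel
from nucliadb_protos import resources_pb2

rc = ResourceClassifications(
    resource_level={ClassificationLabel(labelset="topics", label="sports")},
    field_level={
        (resources_pb2.FieldType.FILE, "f1"): {ClassificationLabel(labelset="lang", label="en")}
    },
)
assert len(rc.for_field("f1", resources_pb2.FieldType.FILE)) == 2  # own label + inherited
assert len(rc.for_field("t1", resources_pb2.FieldType.TEXT)) == 1  # inherited only
```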
nucliadb-6.3.4.post3663.dist-info/METADATA CHANGED (renamed from nucliadb-6.3.4.post3645.dist-info/METADATA)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: nucliadb
-Version: 6.3.4.post3645
+Version: 6.3.4.post3663
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3645
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3645
-Requires-Dist: nucliadb-protos>=6.3.4.post3645
-Requires-Dist: nucliadb-models>=6.3.4.post3645
-Requires-Dist: nidx-protos>=6.3.4.post3645
+Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3663
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3663
+Requires-Dist: nucliadb-protos>=6.3.4.post3663
+Requires-Dist: nucliadb-models>=6.3.4.post3663
+Requires-Dist: nidx-protos>=6.3.4.post3663
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn
nucliadb-6.3.4.post3663.dist-info/RECORD CHANGED (renamed from nucliadb-6.3.4.post3645.dist-info/RECORD)
@@ -40,11 +40,11 @@ nucliadb/metrics_exporter.py,sha256=6u0geEYFxgE5I2Fhl_sxsvGN-ZkaFZNGutSXwrzrsVs,
 nucliadb/openapi.py,sha256=wDiw0dVEvTpJvbatkJ0JZLkKm9RItZT5PWRHjqRfqTA,2272
 nucliadb/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb/backups/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/backups/const.py,sha256=9vPAhLxQO_gNAjSdPxWuv3V66s9WcdpjOQ89CZlfmuk,1894
-nucliadb/backups/create.py,sha256=TJtYewhD0jkYV_h3rNUhKzhqB2QHAhLWYOgRVlGysGs,11450
+nucliadb/backups/const.py,sha256=AaIsBB04WerR-V6t8NoCUScxO1ojMYJzfHgdkF2qh4M,2018
+nucliadb/backups/create.py,sha256=D0MEpIYu74AhbZXeybinQo8cm_A2-T9JxDGLrp5lavA,12507
 nucliadb/backups/delete.py,sha256=AAs-WP-ujejj6c1LJgiMn7Ht67N_j0a1sKQlUepHpEA,2659
 nucliadb/backups/models.py,sha256=-hITU4Mv6AxePu12toBu_fjpEv6vVGcwNVxV22O9jQA,1273
-nucliadb/backups/restore.py,sha256=YD3Bbo9ry4YLMM6imB-DXbOAMXfGxVzJtTAAUFDvB0I,10153
+nucliadb/backups/restore.py,sha256=KAly9iTXUP32mFFnW6neJm7qyNSZJ7fO5LGbC46vSAM,11416
 nucliadb/backups/settings.py,sha256=SyzsInj1BRbBI0atg5IXWbMbOZ_eVg4eSQ3IcnUhCxQ,1357
 nucliadb/backups/tasks.py,sha256=WkL1LgdYBHbV_A5ilyYv5p3zmXwxH68TDudytN5f7zk,4225
 nucliadb/backups/utils.py,sha256=_Vogjqcru5oqNZM-bZ0q7Ju79Bv1PD-LVFEa7Z-Q13I,1261
@@ -110,11 +110,11 @@ nucliadb/export_import/exporter.py,sha256=k2QVx1EjqFlDYiggriWiEJzwtMXzHbldsqWdpG
 nucliadb/export_import/importer.py,sha256=v5cq9Nn8c2zrY_K_00mydR52f8mdFxR7tLdtNLQ0qvk,4229
 nucliadb/export_import/models.py,sha256=dbjScNkiMRv4X3Ktudy1JRliD25bfoDTy3JmEZgQSCc,2121
 nucliadb/export_import/tasks.py,sha256=DWbdqY97ffoyfipelGXz3Jqz1iam6JCjQSh367Fc3NA,2947
-nucliadb/export_import/utils.py,sha256=DlGUHaqT43b3jG9U-vZ48GpC4O2OgD2WSP_0-hrYW9k,20774
+nucliadb/export_import/utils.py,sha256=aBBB7p05GfKknpb9LQa8Krtz0LlFoP5NUTiPy7PwPBY,22840
 nucliadb/ingest/__init__.py,sha256=fsw3C38VP50km3R-nHL775LNGPpJ4JxqXJ2Ib1f5SqE,1011
 nucliadb/ingest/app.py,sha256=TaVgh5B2riFVmcsrbPb7a5YCzmnybjx-NK0BXgTwGAY,7535
 nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
-nucliadb/ingest/processing.py,sha256=8OggvuxNzktTTKDTUwsIuazhDParEWhn46CBZaMYAy8,20659
+nucliadb/ingest/processing.py,sha256=7NNoVxbSwsRdbo5goqVSrUc_QXZRVfOT_jZPzrmbxJQ,22207
 nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb/ingest/serialize.py,sha256=42zNKu-O6g9EmLnQOXwhfagD76FSmWD6xRf69LrJxfA,16415
 nucliadb/ingest/settings.py,sha256=0B-wQNa8FLqtNcQgRzh-fuIuGptM816XHcbH1NQKfmE,3050
@@ -138,7 +138,7 @@ nucliadb/ingest/fields/link.py,sha256=kN_gjRUEEj5cy8K_BwPijYg3TiWhedc24apXYlTbRJ
 nucliadb/ingest/fields/text.py,sha256=tFvSQJAe0W7ePpp2_WDfLiE2yglR1OTU0Zht9acvOFw,1594
 nucliadb/ingest/orm/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/ingest/orm/brain.py,sha256=JwHRneyE2sAo6PwYJnnyUMXKiGBWXLv4JI_aemodB3U,28479
-nucliadb/ingest/orm/broker_message.py,sha256=ZEMueoGuuRKO4tHgzc0P0AM1Ls1TTYey_4UvRQf0BpY,6915
+nucliadb/ingest/orm/broker_message.py,sha256=vFDdfo_kz_GRai2MGq_3qKRynRXOGR7lawogNx6ZsfA,7553
 nucliadb/ingest/orm/entities.py,sha256=3_n6lKhBy2GsdmNmkh0_mvxP8md20OZsbtTNEmfJ8Hg,14888
 nucliadb/ingest/orm/exceptions.py,sha256=k4Esv4NtL4TrGTcsQpwrSfDhPQpiYcRbB1SpYmBX5MY,1432
 nucliadb/ingest/orm/knowledgebox.py,sha256=IGOPvBR1qXqDxE5DeiOdYCLdPgjzOVVpsASJ2zYvWwQ,23651
@@ -328,20 +328,20 @@ nucliadb/writer/api/constants.py,sha256=qWEDjFUycrEZnSJyLnNK4PQNodU2oVmkO4NycaEZ
 nucliadb/writer/api/utils.py,sha256=wIQHlU8RQiIGVLI72suvyVIKlCU44Unh0Ae0IiN6Qwo,1313
 nucliadb/writer/api/v1/__init__.py,sha256=akI9A_jloNLb0dU4T5zjfdyvmSAiDeIdjAlzNx74FlU,1128
 nucliadb/writer/api/v1/export_import.py,sha256=elf-EQY5DD3mhw8kWb9tQpDcbrF9sY6VFYqxQOjuVP0,8201
-nucliadb/writer/api/v1/field.py,sha256=OsWOYA0WQ6onE5Rkl20QIEdtrSi7Jgnu62fUt90Ziy8,17503
+nucliadb/writer/api/v1/field.py,sha256=FySCMpcruSAKGeepeAlOihjwxyUPcDO73Uilq5VDWRk,18514
 nucliadb/writer/api/v1/knowledgebox.py,sha256=MLeIuym4jPrJgfy1NTcN9CpUGwuBiqDHMcx0hY9DR7g,9530
 nucliadb/writer/api/v1/learning_config.py,sha256=CKBjqcbewkfPwGUPLDWzZSpro6XkmCaVppe5Qtpu5Go,3117
-nucliadb/writer/api/v1/resource.py,sha256=A8fAHlN5XFsg6XFYKhfWJS8czgNH6yXr-PsnUqz2WUE,18757
+nucliadb/writer/api/v1/resource.py,sha256=jV9HM-ID1PPYypfy4Sl4_9aSPF87v7gSJZUSzHjHcQ4,19740
 nucliadb/writer/api/v1/router.py,sha256=RjuoWLpZer6Kl2BW_wznpNo6XL3BOpdTGqXZCn3QrrQ,1034
 nucliadb/writer/api/v1/services.py,sha256=HLQW18AEC5zQp5azpeAtRi7ZTzQSwG6SbmkHlmjTIFA,13165
 nucliadb/writer/api/v1/slug.py,sha256=xlVBDBpRi9bNulpBHZwhyftVvulfE0zFm1XZIWl-AKY,2389
 nucliadb/writer/api/v1/transaction.py,sha256=d2Vbgnkk_-FLGSTt3vfldwiJIUf0XoyD0wP1jQNz_DY,2430
-nucliadb/writer/api/v1/upload.py,sha256=VOeqNTrZx1_z8iaKjM7p8fVlVcIYMtnQNK1dm72ct6k,33161
+nucliadb/writer/api/v1/upload.py,sha256=iFHD3nB1ZXiSNyoOzIdzJ0PNLBnnlfOWPba1qWieT4o,33608
 nucliadb/writer/api/v1/vectorsets.py,sha256=mESaXkkI9f-jWWMW61ZZgv7E5YWXKemyc6vwT0lFXns,6747
 nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
-nucliadb/writer/resource/basic.py,sha256=cHhh5hQRHFIoKd-6fEteHuGWW6fGN56ornIWPBuSpHg,11214
-nucliadb/writer/resource/field.py,sha256=HsOERELyAsb9e0dx2IkSQ9lk0SThALFRcDKCVBw8ifU,15478
+nucliadb/writer/resource/basic.py,sha256=_zdAr110C7rtEzOKoBRMzPjAnQ0pAtRfGjB8qCzodvI,11767
+nucliadb/writer/resource/field.py,sha256=qnj31lM9F0AFlj3QhPcPj90vHg7SMbbYW098fMtYt9o,20053
 nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
 nucliadb/writer/tus/__init__.py,sha256=huWpKnDnjsrKlBBJk30ta5vamlA-4x0TbPs_2Up8hyM,5443
 nucliadb/writer/tus/azure.py,sha256=XhWAlWTM0vmXcXtuEPYjjeEhuZjiZXZu8q9WsJ7omFE,4107
@@ -352,8 +352,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.3.4.post3645.dist-info/METADATA,sha256=xz2Jc-1oMxDuyL7MTPcAu4jfc1gbBWL7-yoP6Vt2onw,4291
-nucliadb-6.3.4.post3645.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-nucliadb-6.3.4.post3645.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.3.4.post3645.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.3.4.post3645.dist-info/RECORD,,
+nucliadb-6.3.4.post3663.dist-info/METADATA,sha256=Olqrr9vrS1hKJrDcrbrYwfm62xUA3i4aj8WVRLxeePc,4291
+nucliadb-6.3.4.post3663.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+nucliadb-6.3.4.post3663.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.3.4.post3663.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.3.4.post3663.dist-info/RECORD,,