nucliadb 6.3.4.post3656__py3-none-any.whl → 6.3.4.post3675__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,6 +19,7 @@
19
19
  #
20
20
  import base64
21
21
  import datetime
22
+ import json
22
23
  import logging
23
24
  import uuid
24
25
  from collections import defaultdict
@@ -32,6 +33,7 @@ import jwt
32
33
  from pydantic import BaseModel, Field
33
34
 
34
35
  import nucliadb_models as models
36
+ from nucliadb_models.labels import ClassificationLabel
35
37
  from nucliadb_models.resource import QueueType
36
38
  from nucliadb_protos.resources_pb2 import CloudFile
37
39
  from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
@@ -93,7 +95,10 @@ class PushPayload(BaseModel):
93
95
  genericfield: dict[str, models.Text] = {}
94
96
 
95
97
  # New File
96
- filefield: dict[str, str] = {}
98
+ filefield: dict[str, str] = Field(
99
+ default={},
100
+ description="Map of each file field to the jwt token computed in ProcessingEngine methods",
101
+ )
97
102
 
98
103
  # New Link
99
104
  linkfield: dict[str, models.LinkUpload] = {}
@@ -238,7 +243,9 @@ class ProcessingEngine:
238
243
  }
239
244
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
240
245
 
241
- def generate_file_token_from_fieldfile(self, file: FieldFilePB) -> str:
246
+ def generate_file_token_from_fieldfile(
247
+ self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
248
+ ) -> str:
242
249
  if self.nuclia_jwt_key is None:
243
250
  raise AttributeError("Nuclia JWT key not set")
244
251
  now = datetime.datetime.now(tz=datetime.timezone.utc)
@@ -263,6 +270,8 @@ class ProcessingEngine:
263
270
  "language": file.language,
264
271
  "extract_strategy": file.extract_strategy,
265
272
  }
273
+ if classif_labels:
274
+ payload["classification_labels"] = self.encode_classif_labels(classif_labels)
266
275
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
267
276
 
268
277
  @backoff.on_exception(
@@ -272,7 +281,9 @@ class ProcessingEngine:
272
281
  max_tries=MAX_TRIES,
273
282
  )
274
283
  @processing_observer.wrap({"type": "file_field_upload"})
275
- async def convert_filefield_to_str(self, file: models.FileField) -> str:
284
+ async def convert_filefield_to_str(
285
+ self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
286
+ ) -> str:
276
287
  # Upload file without storing on Nuclia DB
277
288
  headers = {}
278
289
  headers["X-PASSWORD"] = file.password
@@ -281,6 +292,8 @@ class ProcessingEngine:
281
292
  headers["X-MD5"] = file.file.md5
282
293
  if file.extract_strategy is not None:
283
294
  headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
295
+ if classif_labels:
296
+ headers["X-CLASSIFICATION-LABELS"] = self.encode_classif_labels(classif_labels)
284
297
  headers["CONTENT_TYPE"] = file.file.content_type
285
298
  headers["CONTENT-LENGTH"] = str(len(file.file.payload)) # type: ignore
286
299
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
@@ -299,7 +312,14 @@ class ProcessingEngine:
299
312
  text = await resp.text()
300
313
  raise Exception(f"STATUS: {resp.status} - {text}")
301
314
 
302
- def convert_external_filefield_to_str(self, file_field: models.FileField) -> str:
315
+ def encode_classif_labels(self, classif_labels: list[ClassificationLabel]) -> str:
316
+ return base64.b64encode(
317
+ json.dumps([label.model_dump(mode="python") for label in classif_labels]).encode()
318
+ ).decode()
319
+
320
+ def convert_external_filefield_to_str(
321
+ self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
322
+ ) -> str:
303
323
  if self.nuclia_jwt_key is None:
304
324
  raise AttributeError("Nuclia JWT key not set")
305
325
 
@@ -322,6 +342,8 @@ class ProcessingEngine:
322
342
  "password": file_field.password,
323
343
  "extract_strategy": file_field.extract_strategy,
324
344
  }
345
+ if classif_labels:
346
+ payload["classification_labels"] = self.encode_classif_labels(classif_labels)
325
347
  return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
326
348
 
327
349
  @backoff.on_exception(
@@ -331,11 +353,16 @@ class ProcessingEngine:
331
353
  max_tries=MAX_TRIES,
332
354
  )
333
355
  @processing_observer.wrap({"type": "file_field_upload_internal"})
334
- async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
356
+ async def convert_internal_filefield_to_str(
357
+ self,
358
+ file: FieldFilePB,
359
+ storage: Storage,
360
+ classif_labels: Optional[list[ClassificationLabel]] = None,
361
+ ) -> str:
335
362
  """It's already an internal file that needs to be uploaded"""
336
363
  if self.onprem is False:
337
364
  # Upload the file to processing upload
338
- jwttoken = self.generate_file_token_from_fieldfile(file)
365
+ jwttoken = self.generate_file_token_from_fieldfile(file, classif_labels)
339
366
  else:
340
367
  headers = {}
341
368
  headers["X-PASSWORD"] = file.password
@@ -347,6 +374,8 @@ class ProcessingEngine:
347
374
  headers["CONTENT-LENGTH"] = str(file.file.size)
348
375
  if file.extract_strategy != "":
349
376
  headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
377
+ if classif_labels:
378
+ headers["X-CLASSIFICATION-LABELS"] = self.encode_classif_labels(classif_labels)
350
379
  headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
351
380
 
352
381
  iterator = storage.downloadbytescf_iterator(file.file)
@@ -488,22 +517,31 @@ class DummyProcessingEngine(ProcessingEngine):
488
517
  async def finalize(self):
489
518
  pass
490
519
 
491
- async def convert_filefield_to_str(self, file: models.FileField) -> str:
520
+ async def convert_filefield_to_str(
521
+ self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
522
+ ) -> str:
492
523
  self.calls.append([file])
493
524
  index = len(self.values["convert_filefield_to_str"])
494
- self.values["convert_filefield_to_str"].append(file)
525
+ self.values["convert_filefield_to_str"].append((file, classif_labels))
495
526
  return f"convert_filefield_to_str,{index}"
496
527
 
497
- def convert_external_filefield_to_str(self, file_field: models.FileField) -> str:
528
+ def convert_external_filefield_to_str(
529
+ self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
530
+ ) -> str:
498
531
  self.calls.append([file_field])
499
532
  index = len(self.values["convert_external_filefield_to_str"])
500
- self.values["convert_external_filefield_to_str"].append(file_field)
533
+ self.values["convert_external_filefield_to_str"].append((file_field, classif_labels))
501
534
  return f"convert_external_filefield_to_str,{index}"
502
535
 
503
- async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
536
+ async def convert_internal_filefield_to_str(
537
+ self,
538
+ file: FieldFilePB,
539
+ storage: Storage,
540
+ classif_labels: Optional[list[ClassificationLabel]] = None,
541
+ ) -> str:
504
542
  self.calls.append([file, storage])
505
543
  index = len(self.values["convert_internal_filefield_to_str"])
506
- self.values["convert_internal_filefield_to_str"].append([file, storage])
544
+ self.values["convert_internal_filefield_to_str"].append((file, storage, classif_labels))
507
545
  return f"convert_internal_filefield_to_str,{index}"
508
546
 
509
547
  async def convert_internal_cf_to_str(self, cf: CloudFile, storage: Storage) -> str:
@@ -43,7 +43,10 @@ from nucliadb.writer.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREF
43
43
  from nucliadb.writer.back_pressure import maybe_back_pressure
44
44
  from nucliadb.writer.resource.audit import parse_audit
45
45
  from nucliadb.writer.resource.field import (
46
+ ResourceClassifications,
47
+ atomic_get_stored_resource_classifications,
46
48
  extract_file_field,
49
+ get_stored_resource_classifications,
47
50
  parse_conversation_field,
48
51
  parse_file_field,
49
52
  parse_link_field,
@@ -114,11 +117,31 @@ async def add_field_to_resource(
114
117
 
115
118
  parse_audit(writer.audit, request)
116
119
 
120
+ resource_classifications = await atomic_get_stored_resource_classifications(kbid=kbid, rid=rid)
121
+
117
122
  parse_field = FIELD_PARSERS_MAP[type(field_payload)]
118
123
  if iscoroutinefunction(parse_field):
119
- await parse_field(kbid, rid, field_id, field_payload, writer, toprocess, **parser_kwargs)
124
+ await parse_field(
125
+ kbid,
126
+ rid,
127
+ field_id,
128
+ field_payload,
129
+ writer,
130
+ toprocess,
131
+ resource_classifications,
132
+ **parser_kwargs,
133
+ )
120
134
  else:
121
- parse_field(kbid, rid, field_id, field_payload, writer, toprocess, **parser_kwargs)
135
+ parse_field(
136
+ kbid,
137
+ rid,
138
+ field_id,
139
+ field_payload,
140
+ writer,
141
+ toprocess,
142
+ resource_classifications,
143
+ **parser_kwargs,
144
+ )
122
145
 
123
146
  processing = get_processing()
124
147
  await transaction.commit(writer, partition)
@@ -200,8 +223,9 @@ def parse_text_field_adapter(
200
223
  field_payload: models.TextField,
201
224
  writer: BrokerMessage,
202
225
  toprocess: PushPayload,
226
+ resource_classifications: ResourceClassifications,
203
227
  ):
204
- return parse_text_field(field_id, field_payload, writer, toprocess)
228
+ return parse_text_field(field_id, field_payload, writer, toprocess, resource_classifications)
205
229
 
206
230
 
207
231
  def parse_link_field_adapter(
@@ -211,8 +235,9 @@ def parse_link_field_adapter(
211
235
  field_payload: models.LinkField,
212
236
  writer: BrokerMessage,
213
237
  toprocess: PushPayload,
238
+ resource_classifications: ResourceClassifications,
214
239
  ):
215
- return parse_link_field(field_id, field_payload, writer, toprocess)
240
+ return parse_link_field(field_id, field_payload, writer, toprocess, resource_classifications)
216
241
 
217
242
 
218
243
  async def parse_conversation_field_adapter(
@@ -222,8 +247,11 @@ async def parse_conversation_field_adapter(
222
247
  field_payload: models.InputConversationField,
223
248
  writer: BrokerMessage,
224
249
  toprocess: PushPayload,
250
+ resource_classifications: ResourceClassifications,
225
251
  ):
226
- return await parse_conversation_field(field_id, field_payload, writer, toprocess, kbid, rid)
252
+ return await parse_conversation_field(
253
+ field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
254
+ )
227
255
 
228
256
 
229
257
  async def parse_file_field_adapter(
@@ -233,14 +261,22 @@ async def parse_file_field_adapter(
233
261
  field_payload: models.FileField,
234
262
  writer: BrokerMessage,
235
263
  toprocess: PushPayload,
264
+ resource_classifications: ResourceClassifications,
236
265
  skip_store: bool,
237
266
  ):
238
267
  return await parse_file_field(
239
- field_id, field_payload, writer, toprocess, kbid, rid, skip_store=skip_store
268
+ field_id,
269
+ field_payload,
270
+ writer,
271
+ toprocess,
272
+ kbid,
273
+ rid,
274
+ resource_classifications,
275
+ skip_store=skip_store,
240
276
  )
241
277
 
242
278
 
243
- FIELD_PARSERS_MAP: dict[Type, Union[Callable]] = {
279
+ FIELD_PARSERS_MAP: dict[Type, Callable] = {
244
280
  models.TextField: parse_text_field_adapter,
245
281
  models.LinkField: parse_link_field_adapter,
246
282
  models.InputConversationField: parse_conversation_field_adapter,
@@ -537,12 +573,15 @@ async def reprocess_file_field(
537
573
  if resource.basic is not None:
538
574
  toprocess.title = resource.basic.title
539
575
 
576
+ rclassif = await get_stored_resource_classifications(txn, kbid=kbid, rid=rid)
577
+
540
578
  try:
541
579
  await extract_file_field(
542
580
  field_id,
543
581
  resource=resource,
544
582
  toprocess=toprocess,
545
583
  password=x_file_password,
584
+ resource_classifications=rclassif,
546
585
  )
547
586
  except KeyError:
548
587
  raise HTTPException(status_code=404, detail="Field does not exist")
@@ -50,10 +50,16 @@ from nucliadb.writer.resource.audit import parse_audit
50
50
  from nucliadb.writer.resource.basic import (
51
51
  parse_basic_creation,
52
52
  parse_basic_modify,
53
+ parse_user_classifications,
53
54
  set_status,
54
55
  set_status_modify,
55
56
  )
56
- from nucliadb.writer.resource.field import extract_fields, parse_fields
57
+ from nucliadb.writer.resource.field import (
58
+ ResourceClassifications,
59
+ atomic_get_stored_resource_classifications,
60
+ extract_fields,
61
+ parse_fields,
62
+ )
57
63
  from nucliadb.writer.resource.origin import parse_extra, parse_origin
58
64
  from nucliadb.writer.utilities import get_processing
59
65
  from nucliadb_models.resource import NucliaDBRoles
@@ -139,6 +145,11 @@ async def create_resource(
139
145
  if item.extra is not None:
140
146
  parse_extra(writer.extra, item.extra)
141
147
 
148
+ # Since this is a resource creation, we need to care only about the user-provided
149
+ # classifications in the request.
150
+ resource_classifications = ResourceClassifications(
151
+ resource_level=set(parse_user_classifications(item))
152
+ )
142
153
  await parse_fields(
143
154
  writer=writer,
144
155
  item=item,
@@ -146,6 +157,7 @@ async def create_resource(
146
157
  kbid=kbid,
147
158
  uuid=uuid,
148
159
  x_skip_store=x_skip_store,
160
+ resource_classifications=resource_classifications,
149
161
  )
150
162
 
151
163
  set_status(writer.basic, item)
@@ -296,6 +308,15 @@ async def modify_resource(
296
308
  if item.extra is not None:
297
309
  parse_extra(writer.extra, item.extra)
298
310
 
311
+ if item.usermetadata is not None:
312
+ # If usermetadata is set in the request payload, this means that stored resource classifications
313
+ # are not valid and we need to use the ones provided by the user in the request
314
+ resource_classifications = ResourceClassifications(
315
+ resource_level=set(parse_user_classifications(item))
316
+ )
317
+ else:
318
+ resource_classifications = await atomic_get_stored_resource_classifications(kbid, rid)
319
+
299
320
  await parse_fields(
300
321
  writer=writer,
301
322
  item=item,
@@ -303,6 +324,7 @@ async def modify_resource(
303
324
  kbid=kbid,
304
325
  uuid=rid,
305
326
  x_skip_store=x_skip_store,
327
+ resource_classifications=resource_classifications,
306
328
  )
307
329
  set_status_modify(writer.basic, item)
308
330
 
@@ -45,8 +45,11 @@ from nucliadb.writer.api.v1.resource import (
45
45
  from nucliadb.writer.api.v1.slug import ensure_slug_uniqueness, noop_context_manager
46
46
  from nucliadb.writer.back_pressure import maybe_back_pressure
47
47
  from nucliadb.writer.resource.audit import parse_audit
48
- from nucliadb.writer.resource.basic import parse_basic_creation
49
- from nucliadb.writer.resource.field import parse_fields
48
+ from nucliadb.writer.resource.basic import parse_basic_creation, parse_user_classifications
49
+ from nucliadb.writer.resource.field import (
50
+ atomic_get_stored_resource_classifications,
51
+ parse_fields,
52
+ )
50
53
  from nucliadb.writer.resource.origin import parse_extra, parse_origin
51
54
  from nucliadb.writer.tus import TUSUPLOAD, UPLOAD, get_dm, get_storage_manager
52
55
  from nucliadb.writer.tus.exceptions import (
@@ -64,6 +67,7 @@ from nucliadb_models import content_types
64
67
  from nucliadb_models.resource import NucliaDBRoles
65
68
  from nucliadb_models.utils import FieldIdString
66
69
  from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
70
+ from nucliadb_protos import resources_pb2
67
71
  from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FieldID, FieldType, Metadata
68
72
  from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus
69
73
  from nucliadb_utils.authentication import requires_one
@@ -864,7 +868,6 @@ async def store_file_on_nuclia_db(
864
868
  partitioning = get_partitioning()
865
869
  processing = get_processing()
866
870
  storage = await get_storage(service_name=SERVICE_NAME)
867
-
868
871
  partition = partitioning.generate_partition(kbid, rid)
869
872
 
870
873
  writer = BrokerMessage()
@@ -884,6 +887,9 @@ async def store_file_on_nuclia_db(
884
887
  parse_audit(writer.audit, request)
885
888
 
886
889
  unique_slug_context_manager = noop_context_manager()
890
+
891
+ resource_classifications = await atomic_get_stored_resource_classifications(kbid, rid)
892
+
887
893
  if item is not None:
888
894
  if item.slug:
889
895
  unique_slug_context_manager = ensure_slug_uniqueness(kbid, item.slug)
@@ -900,7 +906,9 @@ async def store_file_on_nuclia_db(
900
906
  parse_extra(writer.extra, item.extra)
901
907
 
902
908
  toprocess.title = writer.basic.title
903
-
909
+ if item.usermetadata:
910
+ # Any resource level classification that comes on the request payload overrides the stored ones
911
+ resource_classifications.resource_level = set(parse_user_classifications(item))
904
912
  await parse_fields(
905
913
  writer=writer,
906
914
  item=item,
@@ -908,6 +916,7 @@ async def store_file_on_nuclia_db(
908
916
  kbid=kbid,
909
917
  uuid=rid,
910
918
  x_skip_store=False,
919
+ resource_classifications=resource_classifications,
911
920
  )
912
921
  else:
913
922
  # Use defaults for everything, but don't forget hidden which depends on KB config
@@ -953,8 +962,9 @@ async def store_file_on_nuclia_db(
953
962
  )
954
963
  )
955
964
 
965
+ classif_labels = resource_classifications.for_field(field, resources_pb2.FieldType.FILE)
956
966
  toprocess.filefield[field] = await processing.convert_internal_filefield_to_str(
957
- file_field, storage=storage
967
+ file_field, storage=storage, classif_labels=classif_labels
958
968
  )
959
969
 
960
970
  writer.source = BrokerMessage.MessageSource.WRITER
@@ -18,7 +18,7 @@
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
20
  from datetime import datetime
21
- from typing import Optional
21
+ from typing import Optional, Union
22
22
 
23
23
  from fastapi import HTTPException
24
24
 
@@ -31,6 +31,7 @@ from nucliadb.ingest.orm.utils import set_title
31
31
  from nucliadb.ingest.processing import PushPayload
32
32
  from nucliadb_models.content_types import GENERIC_MIME_TYPE
33
33
  from nucliadb_models.file import FileField
34
+ from nucliadb_models.labels import ClassificationLabel
34
35
  from nucliadb_models.link import LinkField
35
36
  from nucliadb_models.metadata import (
36
37
  ParagraphAnnotation,
@@ -290,3 +291,20 @@ def build_question_answer_annotation_pb(
290
291
  answer.ids_paragraphs.extend(answer_annotation.ids_paragraphs)
291
292
  pb.question_answer.answers.append(answer)
292
293
  return pb
294
+
295
+
296
+ def parse_user_classifications(
297
+ item: Union[CreateResourcePayload, UpdateResourcePayload],
298
+ ) -> list[ClassificationLabel]:
299
+ return (
300
+ [
301
+ ClassificationLabel(
302
+ labelset=classification.labelset,
303
+ label=classification.label,
304
+ )
305
+ for classification in item.usermetadata.classifications
306
+ if classification.cancelled_by_user is False
307
+ ]
308
+ if item.usermetadata is not None
309
+ else []
310
+ )
@@ -17,12 +17,15 @@
17
17
  # You should have received a copy of the GNU Affero General Public License
18
18
  # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19
  #
20
+ import dataclasses
20
21
  from datetime import datetime
21
22
  from typing import Optional, Union
22
23
 
23
24
  from google.protobuf.json_format import MessageToDict
24
25
 
25
26
  import nucliadb_models as models
27
+ from nucliadb.common import datamanagers
28
+ from nucliadb.common.maindb.driver import Transaction
26
29
  from nucliadb.common.models_utils import from_proto, to_proto
27
30
  from nucliadb.ingest.fields.conversation import Conversation
28
31
  from nucliadb.ingest.orm.resource import Resource as ORMResource
@@ -32,6 +35,7 @@ from nucliadb.writer.utilities import get_processing
32
35
  from nucliadb_models.common import FieldTypeName
33
36
  from nucliadb_models.content_types import GENERIC_MIME_TYPE
34
37
  from nucliadb_models.conversation import PushConversation
38
+ from nucliadb_models.labels import ClassificationLabel
35
39
  from nucliadb_models.writer import (
36
40
  CreateResourcePayload,
37
41
  UpdateResourcePayload,
@@ -42,9 +46,29 @@ from nucliadb_utils.storages.storage import StorageField
42
46
  from nucliadb_utils.utilities import get_storage
43
47
 
44
48
 
45
- async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
46
- processing = get_processing()
49
+ @dataclasses.dataclass
50
+ class ResourceClassifications:
51
+ resource_level: set[ClassificationLabel] = dataclasses.field(default_factory=set)
52
+ field_level: dict[tuple[resources_pb2.FieldType.ValueType, str], set[ClassificationLabel]] = (
53
+ dataclasses.field(default_factory=dict)
54
+ )
47
55
 
56
+ def for_field(
57
+ self, field_key: str, field_type: resources_pb2.FieldType.ValueType
58
+ ) -> list[ClassificationLabel]:
59
+ """
60
+ Returns a list of unique classification labels for a given field, including those inherited from the resource.
61
+ """
62
+ field_id = (field_type, field_key)
63
+ resource_level = self.resource_level
64
+ field_level = self.field_level.get(field_id, set())
65
+ return list(resource_level.union(field_level))
66
+
67
+
68
+ async def extract_file_field_from_pb(
69
+ field_pb: resources_pb2.FieldFile, classif_labels: list[ClassificationLabel]
70
+ ) -> str:
71
+ processing = get_processing()
48
72
  if field_pb.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
49
73
  file_field = models.FileField(
50
74
  language=field_pb.language,
@@ -52,16 +76,17 @@ async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
52
76
  file=models.File(payload=None, uri=field_pb.file.uri),
53
77
  extract_strategy=field_pb.extract_strategy,
54
78
  )
55
- return processing.convert_external_filefield_to_str(file_field)
79
+ return processing.convert_external_filefield_to_str(file_field, classif_labels)
56
80
  else:
57
81
  storage = await get_storage(service_name=SERVICE_NAME)
58
- return await processing.convert_internal_filefield_to_str(field_pb, storage)
82
+ return await processing.convert_internal_filefield_to_str(field_pb, storage, classif_labels)
59
83
 
60
84
 
61
85
  async def extract_file_field(
62
86
  field_id: str,
63
87
  resource: ORMResource,
64
88
  toprocess: PushPayload,
89
+ resource_classifications: ResourceClassifications,
65
90
  password: Optional[str] = None,
66
91
  ):
67
92
  field_type = resources_pb2.FieldType.FILE
@@ -73,13 +98,19 @@ async def extract_file_field(
73
98
  if password is not None:
74
99
  field_pb.password = password
75
100
 
76
- toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
101
+ classif_labels = resource_classifications.for_field(field_id, resources_pb2.FieldType.FILE)
102
+ toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb, classif_labels)
77
103
 
78
104
 
79
105
  async def extract_fields(resource: ORMResource, toprocess: PushPayload):
80
106
  processing = get_processing()
81
107
  storage = await get_storage(service_name=SERVICE_NAME)
82
108
  await resource.get_fields()
109
+
110
+ resource_classifications = await atomic_get_stored_resource_classifications(
111
+ kbid=toprocess.kbid,
112
+ rid=toprocess.uuid,
113
+ )
83
114
  for (field_type, field_id), field in resource.fields.items():
84
115
  field_type_name = from_proto.field_type_name(field_type)
85
116
 
@@ -92,9 +123,9 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
92
123
  continue
93
124
 
94
125
  field_pb = await field.get_value()
95
-
126
+ classif_labels = resource_classifications.for_field(field_id, field_type)
96
127
  if field_type_name is FieldTypeName.FILE:
97
- toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
128
+ toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb, classif_labels)
98
129
 
99
130
  if field_type_name is FieldTypeName.LINK:
100
131
  parsed_link = MessageToDict(
@@ -104,6 +135,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
104
135
  )
105
136
  parsed_link["link"] = parsed_link.pop("uri", None)
106
137
  toprocess.linkfield[field_id] = models.LinkUpload(**parsed_link)
138
+ toprocess.linkfield[field_id].classification_labels = classif_labels
107
139
 
108
140
  if field_type_name is FieldTypeName.TEXT:
109
141
  parsed_text = MessageToDict(
@@ -113,6 +145,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
113
145
  )
114
146
  parsed_text["format"] = models.PushTextFormat[parsed_text["format"]]
115
147
  toprocess.textfield[field_id] = models.Text(**parsed_text)
148
+ toprocess.textfield[field_id].classification_labels = classif_labels
116
149
 
117
150
  if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
118
151
  metadata = await field.get_metadata()
@@ -143,6 +176,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
143
176
  )
144
177
  full_conversation.messages.append(models.PushMessage(**parsed_message))
145
178
  toprocess.conversationfield[field_id] = full_conversation
179
+ toprocess.conversationfield[field_id].classification_labels = classif_labels
146
180
 
147
181
 
148
182
  async def parse_fields(
@@ -152,18 +186,48 @@ async def parse_fields(
152
186
  kbid: str,
153
187
  uuid: str,
154
188
  x_skip_store: bool,
189
+ resource_classifications: ResourceClassifications,
155
190
  ):
156
191
  for key, file_field in item.files.items():
157
- await parse_file_field(key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store)
192
+ await parse_file_field(
193
+ key,
194
+ file_field,
195
+ writer,
196
+ toprocess,
197
+ kbid,
198
+ uuid,
199
+ resource_classifications,
200
+ skip_store=x_skip_store,
201
+ )
158
202
 
159
203
  for key, link_field in item.links.items():
160
- parse_link_field(key, link_field, writer, toprocess)
204
+ parse_link_field(
205
+ key,
206
+ link_field,
207
+ writer,
208
+ toprocess,
209
+ resource_classifications,
210
+ )
161
211
 
162
212
  for key, text_field in item.texts.items():
163
- parse_text_field(key, text_field, writer, toprocess)
213
+ parse_text_field(
214
+ key,
215
+ text_field,
216
+ writer,
217
+ toprocess,
218
+ resource_classifications,
219
+ )
164
220
 
165
221
  for key, conversation_field in item.conversations.items():
166
- await parse_conversation_field(key, conversation_field, writer, toprocess, kbid, uuid)
222
+ await parse_conversation_field(
223
+ key,
224
+ conversation_field,
225
+ writer,
226
+ toprocess,
227
+ kbid,
228
+ uuid,
229
+ resource_classifications,
230
+ )
167
231
 
168
232
 
169
233
  def parse_text_field(
@@ -171,7 +235,9 @@ def parse_text_field(
171
235
  text_field: models.TextField,
172
236
  writer: BrokerMessage,
173
237
  toprocess: PushPayload,
238
+ resource_classifications: ResourceClassifications,
174
239
  ) -> None:
240
+ classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.TEXT)
175
241
  if text_field.extract_strategy is not None:
176
242
  writer.texts[key].extract_strategy = text_field.extract_strategy
177
243
  writer.texts[key].body = text_field.body
@@ -185,6 +251,7 @@ def parse_text_field(
185
251
  body=text_field.body,
186
252
  format=getattr(models.PushTextFormat, text_field.format.value),
187
253
  extract_strategy=text_field.extract_strategy,
254
+ classification_labels=classif_labels,
188
255
  )
189
256
  writer.field_statuses.append(
190
257
  FieldIDStatus(
@@ -201,13 +268,21 @@ async def parse_file_field(
201
268
  toprocess: PushPayload,
202
269
  kbid: str,
203
270
  uuid: str,
271
+ resource_classifications: ResourceClassifications,
204
272
  skip_store: bool = False,
205
273
  ):
206
274
  if file_field.file.is_external:
207
- parse_external_file_field(key, file_field, writer, toprocess)
275
+ parse_external_file_field(key, file_field, writer, toprocess, resource_classifications)
208
276
  else:
209
277
  await parse_internal_file_field(
210
- key, file_field, writer, toprocess, kbid, uuid, skip_store=skip_store
278
+ key,
279
+ file_field,
280
+ writer,
281
+ toprocess,
282
+ kbid,
283
+ uuid,
284
+ resource_classifications,
285
+ skip_store=skip_store,
211
286
  )
212
287
 
213
288
  writer.field_statuses.append(
@@ -225,8 +300,10 @@ async def parse_internal_file_field(
225
300
  toprocess: PushPayload,
226
301
  kbid: str,
227
302
  uuid: str,
303
+ resource_classifications: ResourceClassifications,
228
304
  skip_store: bool = False,
229
305
  ) -> None:
306
+ classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.FILE)
230
307
  writer.files[key].added.FromDatetime(datetime.now())
231
308
  if file_field.language:
232
309
  writer.files[key].language = file_field.language
@@ -234,10 +311,9 @@ async def parse_internal_file_field(
234
311
  writer.files[key].extract_strategy = file_field.extract_strategy
235
312
 
236
313
  processing = get_processing()
237
-
238
314
  if skip_store:
239
315
  # Does not store file on nuclia's blob storage. Only sends it to process
240
- toprocess.filefield[key] = await processing.convert_filefield_to_str(file_field)
316
+ toprocess.filefield[key] = await processing.convert_filefield_to_str(file_field, classif_labels)
241
317
 
242
318
  else:
243
319
  # Store file on nuclia's blob storage
@@ -254,7 +330,7 @@ async def parse_internal_file_field(
254
330
  )
255
331
  # Send the pointer of the new blob to processing
256
332
  toprocess.filefield[key] = await processing.convert_internal_filefield_to_str(
257
- writer.files[key], storage
333
+ writer.files[key], storage, classif_labels
258
334
  )
259
335
 
260
336
 
@@ -263,7 +339,9 @@ def parse_external_file_field(
263
339
  file_field: models.FileField,
264
340
  writer: BrokerMessage,
265
341
  toprocess: PushPayload,
342
+ resource_classifications: ResourceClassifications,
266
343
  ) -> None:
344
+ classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.FILE)
267
345
  writer.files[key].added.FromDatetime(datetime.now())
268
346
  if file_field.language:
269
347
  writer.files[key].language = file_field.language
@@ -276,9 +354,8 @@ def parse_external_file_field(
276
354
  writer.files[key].file.content_type = file_field.file.content_type
277
355
  if file_field.file.content_type and writer.basic.icon == GENERIC_MIME_TYPE:
278
356
  writer.basic.icon = file_field.file.content_type
279
-
280
357
  processing = get_processing()
281
- toprocess.filefield[key] = processing.convert_external_filefield_to_str(file_field)
358
+ toprocess.filefield[key] = processing.convert_external_filefield_to_str(file_field, classif_labels)
282
359
 
283
360
 
284
361
  def parse_link_field(
@@ -286,7 +363,9 @@ def parse_link_field(
286
363
  link_field: models.LinkField,
287
364
  writer: BrokerMessage,
288
365
  toprocess: PushPayload,
366
+ resource_classifications: ResourceClassifications,
289
367
  ) -> None:
368
+ classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.LINK)
290
369
  writer.links[key].added.FromDatetime(datetime.now())
291
370
 
292
371
  writer.links[key].uri = link_field.uri
@@ -322,6 +401,7 @@ def parse_link_field(
322
401
  css_selector=link_field.css_selector,
323
402
  xpath=link_field.xpath,
324
403
  extract_strategy=link_field.extract_strategy,
404
+ classification_labels=classif_labels,
325
405
  )
326
406
  writer.field_statuses.append(
327
407
  FieldIDStatus(
@@ -338,7 +418,9 @@ async def parse_conversation_field(
338
418
  toprocess: PushPayload,
339
419
  kbid: str,
340
420
  uuid: str,
421
+ resource_classifications: ResourceClassifications,
341
422
  ) -> None:
423
+ classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.CONVERSATION)
342
424
  storage = await get_storage(service_name=SERVICE_NAME)
343
425
  processing = get_processing()
344
426
  field_value = resources_pb2.Conversation()
@@ -401,7 +483,7 @@ async def parse_conversation_field(
401
483
  processing_message.to.append(to)
402
484
  convs.messages.append(processing_message)
403
485
  field_value.messages.append(cm)
404
-
486
+ convs.classification_labels = classif_labels
405
487
  toprocess.conversationfield[key] = convs
406
488
  writer.conversations[key].CopyFrom(field_value)
407
489
  writer.field_statuses.append(
@@ -410,3 +492,37 @@ async def parse_conversation_field(
410
492
  status=FieldStatus.Status.PENDING,
411
493
  )
412
494
  )
495
+
496
+
497
+ async def atomic_get_stored_resource_classifications(
498
+ kbid: str,
499
+ rid: str,
500
+ ) -> ResourceClassifications:
501
+ async with datamanagers.with_ro_transaction() as txn:
502
+ return await get_stored_resource_classifications(txn, kbid=kbid, rid=rid)
503
+
504
+
505
+ async def get_stored_resource_classifications(
506
+ txn: Transaction,
507
+ *,
508
+ kbid: str,
509
+ rid: str,
510
+ ) -> ResourceClassifications:
511
+ rc = ResourceClassifications()
512
+ basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
513
+ if basic is None:
514
+ # Resource not found
515
+ return rc
516
+
517
+ # User resource-level classifications
518
+ for u_classif in basic.usermetadata.classifications:
519
+ classif = ClassificationLabel(labelset=u_classif.labelset, label=u_classif.label)
520
+ rc.resource_level.add(classif)
521
+
522
+ # Processor-computed field-level classifications. These are not user-defined and are immutable.
523
+ for field_classif in basic.computedmetadata.field_classifications:
524
+ fid = (field_classif.field.field_type, field_classif.field.field)
525
+ for f_classif in field_classif.classifications:
526
+ classif = ClassificationLabel(labelset=f_classif.labelset, label=f_classif.label)
527
+ rc.field_level.setdefault(fid, set()).add(classif)
528
+ return rc
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: nucliadb
3
- Version: 6.3.4.post3656
3
+ Version: 6.3.4.post3675
4
4
  Summary: NucliaDB
5
5
  Author-email: Nuclia <nucliadb@nuclia.com>
6
6
  License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: <4,>=3.9
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3656
24
- Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3656
25
- Requires-Dist: nucliadb-protos>=6.3.4.post3656
26
- Requires-Dist: nucliadb-models>=6.3.4.post3656
27
- Requires-Dist: nidx-protos>=6.3.4.post3656
23
+ Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3675
24
+ Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3675
25
+ Requires-Dist: nucliadb-protos>=6.3.4.post3675
26
+ Requires-Dist: nucliadb-models>=6.3.4.post3675
27
+ Requires-Dist: nidx-protos>=6.3.4.post3675
28
28
  Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
29
29
  Requires-Dist: nuclia-models>=0.24.2
30
30
  Requires-Dist: uvicorn
@@ -114,7 +114,7 @@ nucliadb/export_import/utils.py,sha256=aBBB7p05GfKknpb9LQa8Krtz0LlFoP5NUTiPy7PwP
114
114
  nucliadb/ingest/__init__.py,sha256=fsw3C38VP50km3R-nHL775LNGPpJ4JxqXJ2Ib1f5SqE,1011
115
115
  nucliadb/ingest/app.py,sha256=TaVgh5B2riFVmcsrbPb7a5YCzmnybjx-NK0BXgTwGAY,7535
116
116
  nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
117
- nucliadb/ingest/processing.py,sha256=8OggvuxNzktTTKDTUwsIuazhDParEWhn46CBZaMYAy8,20659
117
+ nucliadb/ingest/processing.py,sha256=7NNoVxbSwsRdbo5goqVSrUc_QXZRVfOT_jZPzrmbxJQ,22207
118
118
  nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
119
  nucliadb/ingest/serialize.py,sha256=42zNKu-O6g9EmLnQOXwhfagD76FSmWD6xRf69LrJxfA,16415
120
120
  nucliadb/ingest/settings.py,sha256=0B-wQNa8FLqtNcQgRzh-fuIuGptM816XHcbH1NQKfmE,3050
@@ -328,20 +328,20 @@ nucliadb/writer/api/constants.py,sha256=qWEDjFUycrEZnSJyLnNK4PQNodU2oVmkO4NycaEZ
328
328
  nucliadb/writer/api/utils.py,sha256=wIQHlU8RQiIGVLI72suvyVIKlCU44Unh0Ae0IiN6Qwo,1313
329
329
  nucliadb/writer/api/v1/__init__.py,sha256=akI9A_jloNLb0dU4T5zjfdyvmSAiDeIdjAlzNx74FlU,1128
330
330
  nucliadb/writer/api/v1/export_import.py,sha256=elf-EQY5DD3mhw8kWb9tQpDcbrF9sY6VFYqxQOjuVP0,8201
331
- nucliadb/writer/api/v1/field.py,sha256=OsWOYA0WQ6onE5Rkl20QIEdtrSi7Jgnu62fUt90Ziy8,17503
331
+ nucliadb/writer/api/v1/field.py,sha256=FySCMpcruSAKGeepeAlOihjwxyUPcDO73Uilq5VDWRk,18514
332
332
  nucliadb/writer/api/v1/knowledgebox.py,sha256=MLeIuym4jPrJgfy1NTcN9CpUGwuBiqDHMcx0hY9DR7g,9530
333
333
  nucliadb/writer/api/v1/learning_config.py,sha256=CKBjqcbewkfPwGUPLDWzZSpro6XkmCaVppe5Qtpu5Go,3117
334
- nucliadb/writer/api/v1/resource.py,sha256=A8fAHlN5XFsg6XFYKhfWJS8czgNH6yXr-PsnUqz2WUE,18757
334
+ nucliadb/writer/api/v1/resource.py,sha256=jV9HM-ID1PPYypfy4Sl4_9aSPF87v7gSJZUSzHjHcQ4,19740
335
335
  nucliadb/writer/api/v1/router.py,sha256=RjuoWLpZer6Kl2BW_wznpNo6XL3BOpdTGqXZCn3QrrQ,1034
336
336
  nucliadb/writer/api/v1/services.py,sha256=HLQW18AEC5zQp5azpeAtRi7ZTzQSwG6SbmkHlmjTIFA,13165
337
337
  nucliadb/writer/api/v1/slug.py,sha256=xlVBDBpRi9bNulpBHZwhyftVvulfE0zFm1XZIWl-AKY,2389
338
338
  nucliadb/writer/api/v1/transaction.py,sha256=d2Vbgnkk_-FLGSTt3vfldwiJIUf0XoyD0wP1jQNz_DY,2430
339
- nucliadb/writer/api/v1/upload.py,sha256=VOeqNTrZx1_z8iaKjM7p8fVlVcIYMtnQNK1dm72ct6k,33161
339
+ nucliadb/writer/api/v1/upload.py,sha256=hLMHXSaqEOE-vjKjhIupgdx8klJc3mVQp_oMwx5N-7o,33800
340
340
  nucliadb/writer/api/v1/vectorsets.py,sha256=mESaXkkI9f-jWWMW61ZZgv7E5YWXKemyc6vwT0lFXns,6747
341
341
  nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
342
342
  nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
343
- nucliadb/writer/resource/basic.py,sha256=cHhh5hQRHFIoKd-6fEteHuGWW6fGN56ornIWPBuSpHg,11214
344
- nucliadb/writer/resource/field.py,sha256=HsOERELyAsb9e0dx2IkSQ9lk0SThALFRcDKCVBw8ifU,15478
343
+ nucliadb/writer/resource/basic.py,sha256=_zdAr110C7rtEzOKoBRMzPjAnQ0pAtRfGjB8qCzodvI,11767
344
+ nucliadb/writer/resource/field.py,sha256=qnj31lM9F0AFlj3QhPcPj90vHg7SMbbYW098fMtYt9o,20053
345
345
  nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
346
346
  nucliadb/writer/tus/__init__.py,sha256=huWpKnDnjsrKlBBJk30ta5vamlA-4x0TbPs_2Up8hyM,5443
347
347
  nucliadb/writer/tus/azure.py,sha256=XhWAlWTM0vmXcXtuEPYjjeEhuZjiZXZu8q9WsJ7omFE,4107
@@ -352,8 +352,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
352
352
  nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
353
353
  nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
354
354
  nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
355
- nucliadb-6.3.4.post3656.dist-info/METADATA,sha256=spsdbVapfZnp-QuN7s9NBMX7yMWkHmsKvcgPAwKrckk,4291
356
- nucliadb-6.3.4.post3656.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
357
- nucliadb-6.3.4.post3656.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
358
- nucliadb-6.3.4.post3656.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
359
- nucliadb-6.3.4.post3656.dist-info/RECORD,,
355
+ nucliadb-6.3.4.post3675.dist-info/METADATA,sha256=WfRjwWmeEZALPK3kwAm8Yh2VuYhd3KvyxNzEcrs5IWs,4291
356
+ nucliadb-6.3.4.post3675.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
357
+ nucliadb-6.3.4.post3675.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
358
+ nucliadb-6.3.4.post3675.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
359
+ nucliadb-6.3.4.post3675.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (76.0.0)
2
+ Generator: setuptools (76.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5