PyPI - nucliadb - Versions diffs - 6.3.4.post3656__py3-none-any.whl → 6.3.4.post3675__py3-none-any.whl - Mend

nucliadb 6.3.4.post3656-py3-none-any.whl → 6.3.4.post3675-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

nucliadb/ingest/processing.py CHANGED Viewed

@@ -19,6 +19,7 @@
 #
 import base64
 import datetime
+import json
 import logging
 import uuid
 from collections import defaultdict
@@ -32,6 +33,7 @@ import jwt
 from pydantic import BaseModel, Field
 import nucliadb_models as models
+from nucliadb_models.labels import ClassificationLabel
 from nucliadb_models.resource import QueueType
 from nucliadb_protos.resources_pb2 import CloudFile
 from nucliadb_protos.resources_pb2 import FieldFile as FieldFilePB
@@ -93,7 +95,10 @@ class PushPayload(BaseModel):
     genericfield: dict[str, models.Text] = {}
     # New File
-    filefield: dict[str, str] = {}
+    filefield: dict[str, str] = Field(
+        default={},
+        description="Map of each file field to the jwt token computed in ProcessingEngine methods",
+    )
     # New Link
     linkfield: dict[str, models.LinkUpload] = {}
@@ -238,7 +243,9 @@ class ProcessingEngine:
         }
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
-    def generate_file_token_from_fieldfile(self, file: FieldFilePB) -> str:
+    def generate_file_token_from_fieldfile(
+        self, file: FieldFilePB, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
         now = datetime.datetime.now(tz=datetime.timezone.utc)
@@ -263,6 +270,8 @@ class ProcessingEngine:
             "language": file.language,
             "extract_strategy": file.extract_strategy,
         }
+        if classif_labels:
+            payload["classification_labels"] = self.encode_classif_labels(classif_labels)
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
     @backoff.on_exception(
@@ -272,7 +281,9 @@ class ProcessingEngine:
         max_tries=MAX_TRIES,
     )
     @processing_observer.wrap({"type": "file_field_upload"})
-    async def convert_filefield_to_str(self, file: models.FileField) -> str:
+    async def convert_filefield_to_str(
+        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         # Upload file without storing on Nuclia DB
         headers = {}
         headers["X-PASSWORD"] = file.password
@@ -281,6 +292,8 @@ class ProcessingEngine:
         headers["X-MD5"] = file.file.md5
         if file.extract_strategy is not None:
             headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
+        if classif_labels:
+            headers["X-CLASSIFICATION-LABELS"] = self.encode_classif_labels(classif_labels)
         headers["CONTENT_TYPE"] = file.file.content_type
         headers["CONTENT-LENGTH"] = str(len(file.file.payload))  # type: ignore
         headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
@@ -299,7 +312,14 @@ class ProcessingEngine:
                 text = await resp.text()
                 raise Exception(f"STATUS: {resp.status} - {text}")
-    def convert_external_filefield_to_str(self, file_field: models.FileField) -> str:
+    def encode_classif_labels(self, classif_labels: list[ClassificationLabel]) -> str:
+        return base64.b64encode(
+            json.dumps([label.model_dump(mode="python") for label in classif_labels]).encode()
+        ).decode()
+    def convert_external_filefield_to_str(
+        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         if self.nuclia_jwt_key is None:
             raise AttributeError("Nuclia JWT key not set")
@@ -322,6 +342,8 @@ class ProcessingEngine:
             "password": file_field.password,
             "extract_strategy": file_field.extract_strategy,
         }
+        if classif_labels:
+            payload["classification_labels"] = self.encode_classif_labels(classif_labels)
         return jwt.encode(payload, self.nuclia_jwt_key, algorithm="HS256")
     @backoff.on_exception(
@@ -331,11 +353,16 @@ class ProcessingEngine:
         max_tries=MAX_TRIES,
     )
     @processing_observer.wrap({"type": "file_field_upload_internal"})
-    async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
+    async def convert_internal_filefield_to_str(
+        self,
+        file: FieldFilePB,
+        storage: Storage,
+        classif_labels: Optional[list[ClassificationLabel]] = None,
+    ) -> str:
         """It's already an internal file that needs to be uploaded"""
         if self.onprem is False:
             # Upload the file to processing upload
-            jwttoken = self.generate_file_token_from_fieldfile(file)
+            jwttoken = self.generate_file_token_from_fieldfile(file, classif_labels)
         else:
             headers = {}
             headers["X-PASSWORD"] = file.password
@@ -347,6 +374,8 @@ class ProcessingEngine:
                 headers["CONTENT-LENGTH"] = str(file.file.size)
             if file.extract_strategy != "":
                 headers["X-EXTRACT-STRATEGY"] = file.extract_strategy
+            if classif_labels:
+                headers["X-CLASSIFICATION-LABELS"] = self.encode_classif_labels(classif_labels)
             headers["X-STF-NUAKEY"] = f"Bearer {self.nuclia_service_account}"
             iterator = storage.downloadbytescf_iterator(file.file)
@@ -488,22 +517,31 @@ class DummyProcessingEngine(ProcessingEngine):
     async def finalize(self):
         pass
-    async def convert_filefield_to_str(self, file: models.FileField) -> str:
+    async def convert_filefield_to_str(
+        self, file: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         self.calls.append([file])
         index = len(self.values["convert_filefield_to_str"])
-        self.values["convert_filefield_to_str"].append(file)
+        self.values["convert_filefield_to_str"].append((file, classif_labels))
         return f"convert_filefield_to_str,{index}"
-    def convert_external_filefield_to_str(self, file_field: models.FileField) -> str:
+    def convert_external_filefield_to_str(
+        self, file_field: models.FileField, classif_labels: Optional[list[ClassificationLabel]] = None
+    ) -> str:
         self.calls.append([file_field])
         index = len(self.values["convert_external_filefield_to_str"])
-        self.values["convert_external_filefield_to_str"].append(file_field)
+        self.values["convert_external_filefield_to_str"].append((file_field, classif_labels))
         return f"convert_external_filefield_to_str,{index}"
-    async def convert_internal_filefield_to_str(self, file: FieldFilePB, storage: Storage) -> str:
+    async def convert_internal_filefield_to_str(
+        self,
+        file: FieldFilePB,
+        storage: Storage,
+        classif_labels: Optional[list[ClassificationLabel]] = None,
+    ) -> str:
         self.calls.append([file, storage])
         index = len(self.values["convert_internal_filefield_to_str"])
-        self.values["convert_internal_filefield_to_str"].append([file, storage])
+        self.values["convert_internal_filefield_to_str"].append((file, storage, classif_labels))
         return f"convert_internal_filefield_to_str,{index}"
     async def convert_internal_cf_to_str(self, cf: CloudFile, storage: Storage) -> str:

nucliadb/writer/api/v1/field.py CHANGED Viewed

@@ -43,7 +43,10 @@ from nucliadb.writer.api.v1.router import KB_PREFIX, RESOURCE_PREFIX, RSLUG_PREF
 from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.field import (
+    ResourceClassifications,
+    atomic_get_stored_resource_classifications,
     extract_file_field,
+    get_stored_resource_classifications,
     parse_conversation_field,
     parse_file_field,
     parse_link_field,
@@ -114,11 +117,31 @@ async def add_field_to_resource(
     parse_audit(writer.audit, request)
+    resource_classifications = await atomic_get_stored_resource_classifications(kbid=kbid, rid=rid)
     parse_field = FIELD_PARSERS_MAP[type(field_payload)]
     if iscoroutinefunction(parse_field):
-        await parse_field(kbid, rid, field_id, field_payload, writer, toprocess, **parser_kwargs)
+        await parse_field(
+            kbid,
+            rid,
+            field_id,
+            field_payload,
+            writer,
+            toprocess,
+            resource_classifications,
+            **parser_kwargs,
+        )
     else:
-        parse_field(kbid, rid, field_id, field_payload, writer, toprocess, **parser_kwargs)
+        parse_field(
+            kbid,
+            rid,
+            field_id,
+            field_payload,
+            writer,
+            toprocess,
+            resource_classifications,
+            **parser_kwargs,
+        )
     processing = get_processing()
     await transaction.commit(writer, partition)
@@ -200,8 +223,9 @@ def parse_text_field_adapter(
     field_payload: models.TextField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ):
-    return parse_text_field(field_id, field_payload, writer, toprocess)
+    return parse_text_field(field_id, field_payload, writer, toprocess, resource_classifications)
 def parse_link_field_adapter(
@@ -211,8 +235,9 @@ def parse_link_field_adapter(
     field_payload: models.LinkField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ):
-    return parse_link_field(field_id, field_payload, writer, toprocess)
+    return parse_link_field(field_id, field_payload, writer, toprocess, resource_classifications)
 async def parse_conversation_field_adapter(
@@ -222,8 +247,11 @@ async def parse_conversation_field_adapter(
     field_payload: models.InputConversationField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ):
-    return await parse_conversation_field(field_id, field_payload, writer, toprocess, kbid, rid)
+    return await parse_conversation_field(
+        field_id, field_payload, writer, toprocess, kbid, rid, resource_classifications
+    )
 async def parse_file_field_adapter(
@@ -233,14 +261,22 @@ async def parse_file_field_adapter(
     field_payload: models.FileField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
     skip_store: bool,
 ):
     return await parse_file_field(
-        field_id, field_payload, writer, toprocess, kbid, rid, skip_store=skip_store
+        field_id,
+        field_payload,
+        writer,
+        toprocess,
+        kbid,
+        rid,
+        resource_classifications,
+        skip_store=skip_store,
     )
-FIELD_PARSERS_MAP: dict[Type, Union[Callable]] = {
+FIELD_PARSERS_MAP: dict[Type, Callable] = {
     models.TextField: parse_text_field_adapter,
     models.LinkField: parse_link_field_adapter,
     models.InputConversationField: parse_conversation_field_adapter,
@@ -537,12 +573,15 @@ async def reprocess_file_field(
         if resource.basic is not None:
             toprocess.title = resource.basic.title
+        rclassif = await get_stored_resource_classifications(txn, kbid=kbid, rid=rid)
         try:
             await extract_file_field(
                 field_id,
                 resource=resource,
                 toprocess=toprocess,
                 password=x_file_password,
+                resource_classifications=rclassif,
             )
         except KeyError:
             raise HTTPException(status_code=404, detail="Field does not exist")

nucliadb/writer/api/v1/resource.py CHANGED Viewed

@@ -50,10 +50,16 @@ from nucliadb.writer.resource.audit import parse_audit
 from nucliadb.writer.resource.basic import (
     parse_basic_creation,
     parse_basic_modify,
+    parse_user_classifications,
     set_status,
     set_status_modify,
 )
-from nucliadb.writer.resource.field import extract_fields, parse_fields
+from nucliadb.writer.resource.field import (
+    ResourceClassifications,
+    atomic_get_stored_resource_classifications,
+    extract_fields,
+    parse_fields,
+)
 from nucliadb.writer.resource.origin import parse_extra, parse_origin
 from nucliadb.writer.utilities import get_processing
 from nucliadb_models.resource import NucliaDBRoles
@@ -139,6 +145,11 @@ async def create_resource(
         if item.extra is not None:
             parse_extra(writer.extra, item.extra)
+        # Since this is a resource creation, we need to care only about the user-provided
+        # classifications in the request.
+        resource_classifications = ResourceClassifications(
+            resource_level=set(parse_user_classifications(item))
+        )
         await parse_fields(
             writer=writer,
             item=item,
@@ -146,6 +157,7 @@ async def create_resource(
             kbid=kbid,
             uuid=uuid,
             x_skip_store=x_skip_store,
+            resource_classifications=resource_classifications,
         )
         set_status(writer.basic, item)
@@ -296,6 +308,15 @@ async def modify_resource(
     if item.extra is not None:
         parse_extra(writer.extra, item.extra)
+    if item.usermetadata is not None:
+        # If usermetadata is set in the request payload, this means that stored resource classifications
+        # are not valid and we need to use the ones provided by the user in the request
+        resource_classifications = ResourceClassifications(
+            resource_level=set(parse_user_classifications(item))
+        )
+    else:
+        resource_classifications = await atomic_get_stored_resource_classifications(kbid, rid)
     await parse_fields(
         writer=writer,
         item=item,
@@ -303,6 +324,7 @@ async def modify_resource(
         kbid=kbid,
         uuid=rid,
         x_skip_store=x_skip_store,
+        resource_classifications=resource_classifications,
     )
     set_status_modify(writer.basic, item)

nucliadb/writer/api/v1/upload.py CHANGED Viewed

@@ -45,8 +45,11 @@ from nucliadb.writer.api.v1.resource import (
 from nucliadb.writer.api.v1.slug import ensure_slug_uniqueness, noop_context_manager
 from nucliadb.writer.back_pressure import maybe_back_pressure
 from nucliadb.writer.resource.audit import parse_audit
-from nucliadb.writer.resource.basic import parse_basic_creation
-from nucliadb.writer.resource.field import parse_fields
+from nucliadb.writer.resource.basic import parse_basic_creation, parse_user_classifications
+from nucliadb.writer.resource.field import (
+    atomic_get_stored_resource_classifications,
+    parse_fields,
+)
 from nucliadb.writer.resource.origin import parse_extra, parse_origin
 from nucliadb.writer.tus import TUSUPLOAD, UPLOAD, get_dm, get_storage_manager
 from nucliadb.writer.tus.exceptions import (
@@ -64,6 +67,7 @@ from nucliadb_models import content_types
 from nucliadb_models.resource import NucliaDBRoles
 from nucliadb_models.utils import FieldIdString
 from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
+from nucliadb_protos import resources_pb2
 from nucliadb_protos.resources_pb2 import CloudFile, FieldFile, FieldID, FieldType, Metadata
 from nucliadb_protos.writer_pb2 import BrokerMessage, FieldIDStatus, FieldStatus
 from nucliadb_utils.authentication import requires_one
@@ -864,7 +868,6 @@ async def store_file_on_nuclia_db(
     partitioning = get_partitioning()
     processing = get_processing()
     storage = await get_storage(service_name=SERVICE_NAME)
     partition = partitioning.generate_partition(kbid, rid)
     writer = BrokerMessage()
@@ -884,6 +887,9 @@ async def store_file_on_nuclia_db(
     parse_audit(writer.audit, request)
     unique_slug_context_manager = noop_context_manager()
+    resource_classifications = await atomic_get_stored_resource_classifications(kbid, rid)
     if item is not None:
         if item.slug:
             unique_slug_context_manager = ensure_slug_uniqueness(kbid, item.slug)
@@ -900,7 +906,9 @@ async def store_file_on_nuclia_db(
             parse_extra(writer.extra, item.extra)
         toprocess.title = writer.basic.title
+        if item.usermetadata:
+            # Any resource level classification that comes on the request payload overrides the stored ones
+            resource_classifications.resource_level = set(parse_user_classifications(item))
         await parse_fields(
             writer=writer,
             item=item,
@@ -908,6 +916,7 @@ async def store_file_on_nuclia_db(
             kbid=kbid,
             uuid=rid,
             x_skip_store=False,
+            resource_classifications=resource_classifications,
         )
     else:
         # Use defaults for everything, but don't forget hidden which depends on KB config
@@ -953,8 +962,9 @@ async def store_file_on_nuclia_db(
             )
         )
+        classif_labels = resource_classifications.for_field(field, resources_pb2.FieldType.FILE)
         toprocess.filefield[field] = await processing.convert_internal_filefield_to_str(
-            file_field, storage=storage
+            file_field, storage=storage, classif_labels=classif_labels
         )
         writer.source = BrokerMessage.MessageSource.WRITER

nucliadb/writer/resource/basic.py CHANGED Viewed

@@ -18,7 +18,7 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 from datetime import datetime
-from typing import Optional
+from typing import Optional, Union
 from fastapi import HTTPException
@@ -31,6 +31,7 @@ from nucliadb.ingest.orm.utils import set_title
 from nucliadb.ingest.processing import PushPayload
 from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_models.file import FileField
+from nucliadb_models.labels import ClassificationLabel
 from nucliadb_models.link import LinkField
 from nucliadb_models.metadata import (
     ParagraphAnnotation,
@@ -290,3 +291,20 @@ def build_question_answer_annotation_pb(
         answer.ids_paragraphs.extend(answer_annotation.ids_paragraphs)
         pb.question_answer.answers.append(answer)
     return pb
+def parse_user_classifications(
+    item: Union[CreateResourcePayload, UpdateResourcePayload],
+) -> list[ClassificationLabel]:
+    return (
+        [
+            ClassificationLabel(
+                labelset=classification.labelset,
+                label=classification.label,
+            )
+            for classification in item.usermetadata.classifications
+            if classification.cancelled_by_user is False
+        ]
+        if item.usermetadata is not None
+        else []
+    )

nucliadb/writer/resource/field.py CHANGED Viewed

@@ -17,12 +17,15 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
+import dataclasses
 from datetime import datetime
 from typing import Optional, Union
 from google.protobuf.json_format import MessageToDict
 import nucliadb_models as models
+from nucliadb.common import datamanagers
+from nucliadb.common.maindb.driver import Transaction
 from nucliadb.common.models_utils import from_proto, to_proto
 from nucliadb.ingest.fields.conversation import Conversation
 from nucliadb.ingest.orm.resource import Resource as ORMResource
@@ -32,6 +35,7 @@ from nucliadb.writer.utilities import get_processing
 from nucliadb_models.common import FieldTypeName
 from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_models.conversation import PushConversation
+from nucliadb_models.labels import ClassificationLabel
 from nucliadb_models.writer import (
     CreateResourcePayload,
     UpdateResourcePayload,
@@ -42,9 +46,29 @@ from nucliadb_utils.storages.storage import StorageField
 from nucliadb_utils.utilities import get_storage
-async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
-    processing = get_processing()
+@dataclasses.dataclass
+class ResourceClassifications:
+    resource_level: set[ClassificationLabel] = dataclasses.field(default_factory=set)
+    field_level: dict[tuple[resources_pb2.FieldType.ValueType, str], set[ClassificationLabel]] = (
+        dataclasses.field(default_factory=dict)
+    )
+    def for_field(
+        self, field_key: str, field_type: resources_pb2.FieldType.ValueType
+    ) -> list[ClassificationLabel]:
+        """
+        Returns a list of unique classification labels for a given field, including those inherited from the resource.
+        """
+        field_id = (field_type, field_key)
+        resource_level = self.resource_level
+        field_level = self.field_level.get(field_id, set())
+        return list(resource_level.union(field_level))
+async def extract_file_field_from_pb(
+    field_pb: resources_pb2.FieldFile, classif_labels: list[ClassificationLabel]
+) -> str:
+    processing = get_processing()
     if field_pb.file.source == resources_pb2.CloudFile.Source.EXTERNAL:
         file_field = models.FileField(
             language=field_pb.language,
@@ -52,16 +76,17 @@ async def extract_file_field_from_pb(field_pb: resources_pb2.FieldFile) -> str:
             file=models.File(payload=None, uri=field_pb.file.uri),
             extract_strategy=field_pb.extract_strategy,
         )
-        return processing.convert_external_filefield_to_str(file_field)
+        return processing.convert_external_filefield_to_str(file_field, classif_labels)
     else:
         storage = await get_storage(service_name=SERVICE_NAME)
-        return await processing.convert_internal_filefield_to_str(field_pb, storage)
+        return await processing.convert_internal_filefield_to_str(field_pb, storage, classif_labels)
 async def extract_file_field(
     field_id: str,
     resource: ORMResource,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
     password: Optional[str] = None,
 ):
     field_type = resources_pb2.FieldType.FILE
@@ -73,13 +98,19 @@ async def extract_file_field(
     if password is not None:
         field_pb.password = password
-    toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
+    classif_labels = resource_classifications.for_field(field_id, resources_pb2.FieldType.FILE)
+    toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb, classif_labels)
 async def extract_fields(resource: ORMResource, toprocess: PushPayload):
     processing = get_processing()
     storage = await get_storage(service_name=SERVICE_NAME)
     await resource.get_fields()
+    resource_classifications = await atomic_get_stored_resource_classifications(
+        kbid=toprocess.kbid,
+        rid=toprocess.uuid,
+    )
     for (field_type, field_id), field in resource.fields.items():
         field_type_name = from_proto.field_type_name(field_type)
@@ -92,9 +123,9 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
             continue
         field_pb = await field.get_value()
+        classif_labels = resource_classifications.for_field(field_id, field_type)
         if field_type_name is FieldTypeName.FILE:
-            toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb)
+            toprocess.filefield[field_id] = await extract_file_field_from_pb(field_pb, classif_labels)
         if field_type_name is FieldTypeName.LINK:
             parsed_link = MessageToDict(
@@ -104,6 +135,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
             )
             parsed_link["link"] = parsed_link.pop("uri", None)
             toprocess.linkfield[field_id] = models.LinkUpload(**parsed_link)
+            toprocess.linkfield[field_id].classification_labels = classif_labels
         if field_type_name is FieldTypeName.TEXT:
             parsed_text = MessageToDict(
@@ -113,6 +145,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
             )
             parsed_text["format"] = models.PushTextFormat[parsed_text["format"]]
             toprocess.textfield[field_id] = models.Text(**parsed_text)
+            toprocess.textfield[field_id].classification_labels = classif_labels
         if field_type_name is FieldTypeName.CONVERSATION and isinstance(field, Conversation):
             metadata = await field.get_metadata()
@@ -143,6 +176,7 @@ async def extract_fields(resource: ORMResource, toprocess: PushPayload):
                     )
                     full_conversation.messages.append(models.PushMessage(**parsed_message))
             toprocess.conversationfield[field_id] = full_conversation
+            toprocess.conversationfield[field_id].classification_labels = classif_labels
 async def parse_fields(
@@ -152,18 +186,48 @@ async def parse_fields(
     kbid: str,
     uuid: str,
     x_skip_store: bool,
+    resource_classifications: ResourceClassifications,
 ):
     for key, file_field in item.files.items():
-        await parse_file_field(key, file_field, writer, toprocess, kbid, uuid, skip_store=x_skip_store)
+        await parse_file_field(
+            key,
+            file_field,
+            writer,
+            toprocess,
+            kbid,
+            uuid,
+            resource_classifications,
+            skip_store=x_skip_store,
+        )
     for key, link_field in item.links.items():
-        parse_link_field(key, link_field, writer, toprocess)
+        parse_link_field(
+            key,
+            link_field,
+            writer,
+            toprocess,
+            resource_classifications,
+        )
     for key, text_field in item.texts.items():
-        parse_text_field(key, text_field, writer, toprocess)
+        parse_text_field(
+            key,
+            text_field,
+            writer,
+            toprocess,
+            resource_classifications,
+        )
     for key, conversation_field in item.conversations.items():
-        await parse_conversation_field(key, conversation_field, writer, toprocess, kbid, uuid)
+        await parse_conversation_field(
+            key,
+            conversation_field,
+            writer,
+            toprocess,
+            kbid,
+            uuid,
+            resource_classifications,
+        )
 def parse_text_field(
@@ -171,7 +235,9 @@ def parse_text_field(
     text_field: models.TextField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.TEXT)
     if text_field.extract_strategy is not None:
         writer.texts[key].extract_strategy = text_field.extract_strategy
     writer.texts[key].body = text_field.body
@@ -185,6 +251,7 @@ def parse_text_field(
         body=text_field.body,
         format=getattr(models.PushTextFormat, text_field.format.value),
         extract_strategy=text_field.extract_strategy,
+        classification_labels=classif_labels,
     )
     writer.field_statuses.append(
         FieldIDStatus(
@@ -201,13 +268,21 @@ async def parse_file_field(
     toprocess: PushPayload,
     kbid: str,
     uuid: str,
+    resource_classifications: ResourceClassifications,
     skip_store: bool = False,
 ):
     if file_field.file.is_external:
-        parse_external_file_field(key, file_field, writer, toprocess)
+        parse_external_file_field(key, file_field, writer, toprocess, resource_classifications)
     else:
         await parse_internal_file_field(
-            key, file_field, writer, toprocess, kbid, uuid, skip_store=skip_store
+            key,
+            file_field,
+            writer,
+            toprocess,
+            kbid,
+            uuid,
+            resource_classifications,
+            skip_store=skip_store,
         )
     writer.field_statuses.append(
@@ -225,8 +300,10 @@ async def parse_internal_file_field(
     toprocess: PushPayload,
     kbid: str,
     uuid: str,
+    resource_classifications: ResourceClassifications,
     skip_store: bool = False,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.FILE)
     writer.files[key].added.FromDatetime(datetime.now())
     if file_field.language:
         writer.files[key].language = file_field.language
@@ -234,10 +311,9 @@ async def parse_internal_file_field(
         writer.files[key].extract_strategy = file_field.extract_strategy
     processing = get_processing()
     if skip_store:
         # Does not store file on nuclia's blob storage. Only sends it to process
-        toprocess.filefield[key] = await processing.convert_filefield_to_str(file_field)
+        toprocess.filefield[key] = await processing.convert_filefield_to_str(file_field, classif_labels)
     else:
         # Store file on nuclia's blob storage
@@ -254,7 +330,7 @@ async def parse_internal_file_field(
         )
         # Send the pointer of the new blob to processing
         toprocess.filefield[key] = await processing.convert_internal_filefield_to_str(
-            writer.files[key], storage
+            writer.files[key], storage, classif_labels
         )
@@ -263,7 +339,9 @@ def parse_external_file_field(
     file_field: models.FileField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.FILE)
     writer.files[key].added.FromDatetime(datetime.now())
     if file_field.language:
         writer.files[key].language = file_field.language
@@ -276,9 +354,8 @@ def parse_external_file_field(
     writer.files[key].file.content_type = file_field.file.content_type
     if file_field.file.content_type and writer.basic.icon == GENERIC_MIME_TYPE:
         writer.basic.icon = file_field.file.content_type
     processing = get_processing()
-    toprocess.filefield[key] = processing.convert_external_filefield_to_str(file_field)
+    toprocess.filefield[key] = processing.convert_external_filefield_to_str(file_field, classif_labels)
 def parse_link_field(
@@ -286,7 +363,9 @@ def parse_link_field(
     link_field: models.LinkField,
     writer: BrokerMessage,
     toprocess: PushPayload,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.LINK)
     writer.links[key].added.FromDatetime(datetime.now())
     writer.links[key].uri = link_field.uri
@@ -322,6 +401,7 @@ def parse_link_field(
         css_selector=link_field.css_selector,
         xpath=link_field.xpath,
         extract_strategy=link_field.extract_strategy,
+        classification_labels=classif_labels,
     )
     writer.field_statuses.append(
         FieldIDStatus(
@@ -338,7 +418,9 @@ async def parse_conversation_field(
     toprocess: PushPayload,
     kbid: str,
     uuid: str,
+    resource_classifications: ResourceClassifications,
 ) -> None:
+    classif_labels = resource_classifications.for_field(key, resources_pb2.FieldType.CONVERSATION)
     storage = await get_storage(service_name=SERVICE_NAME)
     processing = get_processing()
     field_value = resources_pb2.Conversation()
@@ -401,7 +483,7 @@ async def parse_conversation_field(
             processing_message.to.append(to)
         convs.messages.append(processing_message)
         field_value.messages.append(cm)
+    convs.classification_labels = classif_labels
     toprocess.conversationfield[key] = convs
     writer.conversations[key].CopyFrom(field_value)
     writer.field_statuses.append(
@@ -410,3 +492,37 @@ async def parse_conversation_field(
             status=FieldStatus.Status.PENDING,
         )
     )
+async def atomic_get_stored_resource_classifications(
+    kbid: str,
+    rid: str,
+) -> ResourceClassifications:
+    """Load a resource's stored classifications inside a dedicated read-only transaction.
+
+    Convenience wrapper for callers that do not already hold a transaction: it
+    opens one with ``datamanagers.with_ro_transaction()`` and delegates to
+    ``get_stored_resource_classifications``.
+
+    Args:
+        kbid: Forwarded as ``kbid`` (presumably the knowledge box id -- confirm with callers).
+        rid: Forwarded as ``rid`` (presumably the resource id -- confirm with callers).
+
+    Returns:
+        The ``ResourceClassifications`` produced by ``get_stored_resource_classifications``.
+    """
+    async with datamanagers.with_ro_transaction() as txn:
+        return await get_stored_resource_classifications(txn, kbid=kbid, rid=rid)
+async def get_stored_resource_classifications(
+    txn: Transaction,
+    *,
+    kbid: str,
+    rid: str,
+) -> ResourceClassifications:
+    """Collect the classification labels already stored for a resource.
+
+    Reads the resource's basic metadata through ``datamanagers.resources.get_basic``
+    and copies two groups of labels into a fresh ``ResourceClassifications``:
+
+    * ``basic.usermetadata.classifications`` -> ``resource_level``
+    * ``basic.computedmetadata.field_classifications`` -> ``field_level``,
+      keyed by a ``(field_type, field)`` tuple.
+
+    Args:
+        txn: Transaction used for the ``get_basic`` lookup.
+        kbid: Forwarded to ``get_basic`` as ``kbid``.
+        rid: Forwarded to ``get_basic`` as ``rid``.
+
+    Returns:
+        The populated ``ResourceClassifications``; it is returned empty when
+        ``get_basic`` yields ``None``.
+    """
+    rc = ResourceClassifications()
+    basic = await datamanagers.resources.get_basic(txn, kbid=kbid, rid=rid)
+    if basic is None:
+        # Resource not found
+        return rc
+    # User resource-level classifications
+    for u_classif in basic.usermetadata.classifications:
+        classif = ClassificationLabel(labelset=u_classif.labelset, label=u_classif.label)
+        rc.resource_level.add(classif)
+    # Processor-computed field-level classifications. These are not user-defined and are immutable.
+    for field_classif in basic.computedmetadata.field_classifications:
+        # Key each field by its (field_type, field) pair.
+        fid = (field_classif.field.field_type, field_classif.field.field)
+        for f_classif in field_classif.classifications:
+            classif = ClassificationLabel(labelset=f_classif.labelset, label=f_classif.label)
+            # Create the per-field set on first use, then add the label to it.
+            rc.field_level.setdefault(fid, set()).add(classif)
+    return rc

{nucliadb-6.3.4.post3656.dist-info → nucliadb-6.3.4.post3675.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: nucliadb
-Version: 6.3.4.post3656
+Version: 6.3.4.post3675
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License: AGPL
@@ -20,11 +20,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3656
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3656
-Requires-Dist: nucliadb-protos>=6.3.4.post3656
-Requires-Dist: nucliadb-models>=6.3.4.post3656
-Requires-Dist: nidx-protos>=6.3.4.post3656
+Requires-Dist: nucliadb-telemetry[all]>=6.3.4.post3675
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.3.4.post3675
+Requires-Dist: nucliadb-protos>=6.3.4.post3675
+Requires-Dist: nucliadb-models>=6.3.4.post3675
+Requires-Dist: nidx-protos>=6.3.4.post3675
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.24.2
 Requires-Dist: uvicorn

{nucliadb-6.3.4.post3656.dist-info → nucliadb-6.3.4.post3675.dist-info}/RECORD RENAMED Viewed

@@ -114,7 +114,7 @@ nucliadb/export_import/utils.py,sha256=aBBB7p05GfKknpb9LQa8Krtz0LlFoP5NUTiPy7PwP
 nucliadb/ingest/__init__.py,sha256=fsw3C38VP50km3R-nHL775LNGPpJ4JxqXJ2Ib1f5SqE,1011
 nucliadb/ingest/app.py,sha256=TaVgh5B2riFVmcsrbPb7a5YCzmnybjx-NK0BXgTwGAY,7535
 nucliadb/ingest/partitions.py,sha256=2NIhMYbNT0TNBL6bX1UMSi7vxFGICstCKEqsB0TXHOE,2410
-nucliadb/ingest/processing.py,sha256=8OggvuxNzktTTKDTUwsIuazhDParEWhn46CBZaMYAy8,20659
+nucliadb/ingest/processing.py,sha256=7NNoVxbSwsRdbo5goqVSrUc_QXZRVfOT_jZPzrmbxJQ,22207
 nucliadb/ingest/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nucliadb/ingest/serialize.py,sha256=42zNKu-O6g9EmLnQOXwhfagD76FSmWD6xRf69LrJxfA,16415
 nucliadb/ingest/settings.py,sha256=0B-wQNa8FLqtNcQgRzh-fuIuGptM816XHcbH1NQKfmE,3050
@@ -328,20 +328,20 @@ nucliadb/writer/api/constants.py,sha256=qWEDjFUycrEZnSJyLnNK4PQNodU2oVmkO4NycaEZ
 nucliadb/writer/api/utils.py,sha256=wIQHlU8RQiIGVLI72suvyVIKlCU44Unh0Ae0IiN6Qwo,1313
 nucliadb/writer/api/v1/__init__.py,sha256=akI9A_jloNLb0dU4T5zjfdyvmSAiDeIdjAlzNx74FlU,1128
 nucliadb/writer/api/v1/export_import.py,sha256=elf-EQY5DD3mhw8kWb9tQpDcbrF9sY6VFYqxQOjuVP0,8201
-nucliadb/writer/api/v1/field.py,sha256=OsWOYA0WQ6onE5Rkl20QIEdtrSi7Jgnu62fUt90Ziy8,17503
+nucliadb/writer/api/v1/field.py,sha256=FySCMpcruSAKGeepeAlOihjwxyUPcDO73Uilq5VDWRk,18514
 nucliadb/writer/api/v1/knowledgebox.py,sha256=MLeIuym4jPrJgfy1NTcN9CpUGwuBiqDHMcx0hY9DR7g,9530
 nucliadb/writer/api/v1/learning_config.py,sha256=CKBjqcbewkfPwGUPLDWzZSpro6XkmCaVppe5Qtpu5Go,3117
-nucliadb/writer/api/v1/resource.py,sha256=A8fAHlN5XFsg6XFYKhfWJS8czgNH6yXr-PsnUqz2WUE,18757
+nucliadb/writer/api/v1/resource.py,sha256=jV9HM-ID1PPYypfy4Sl4_9aSPF87v7gSJZUSzHjHcQ4,19740
 nucliadb/writer/api/v1/router.py,sha256=RjuoWLpZer6Kl2BW_wznpNo6XL3BOpdTGqXZCn3QrrQ,1034
 nucliadb/writer/api/v1/services.py,sha256=HLQW18AEC5zQp5azpeAtRi7ZTzQSwG6SbmkHlmjTIFA,13165
 nucliadb/writer/api/v1/slug.py,sha256=xlVBDBpRi9bNulpBHZwhyftVvulfE0zFm1XZIWl-AKY,2389
 nucliadb/writer/api/v1/transaction.py,sha256=d2Vbgnkk_-FLGSTt3vfldwiJIUf0XoyD0wP1jQNz_DY,2430
-nucliadb/writer/api/v1/upload.py,sha256=VOeqNTrZx1_z8iaKjM7p8fVlVcIYMtnQNK1dm72ct6k,33161
+nucliadb/writer/api/v1/upload.py,sha256=hLMHXSaqEOE-vjKjhIupgdx8klJc3mVQp_oMwx5N-7o,33800
 nucliadb/writer/api/v1/vectorsets.py,sha256=mESaXkkI9f-jWWMW61ZZgv7E5YWXKemyc6vwT0lFXns,6747
 nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
-nucliadb/writer/resource/basic.py,sha256=cHhh5hQRHFIoKd-6fEteHuGWW6fGN56ornIWPBuSpHg,11214
-nucliadb/writer/resource/field.py,sha256=HsOERELyAsb9e0dx2IkSQ9lk0SThALFRcDKCVBw8ifU,15478
+nucliadb/writer/resource/basic.py,sha256=_zdAr110C7rtEzOKoBRMzPjAnQ0pAtRfGjB8qCzodvI,11767
+nucliadb/writer/resource/field.py,sha256=qnj31lM9F0AFlj3QhPcPj90vHg7SMbbYW098fMtYt9o,20053
 nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
 nucliadb/writer/tus/__init__.py,sha256=huWpKnDnjsrKlBBJk30ta5vamlA-4x0TbPs_2Up8hyM,5443
 nucliadb/writer/tus/azure.py,sha256=XhWAlWTM0vmXcXtuEPYjjeEhuZjiZXZu8q9WsJ7omFE,4107
@@ -352,8 +352,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vF0NkFTXiXhXq3bCVXXVV-ED38ECVoUeeYViP8uMqcU,8357
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.3.4.post3656.dist-info/METADATA,sha256=spsdbVapfZnp-QuN7s9NBMX7yMWkHmsKvcgPAwKrckk,4291
-nucliadb-6.3.4.post3656.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
-nucliadb-6.3.4.post3656.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.3.4.post3656.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.3.4.post3656.dist-info/RECORD,,
+nucliadb-6.3.4.post3675.dist-info/METADATA,sha256=WfRjwWmeEZALPK3kwAm8Yh2VuYhd3KvyxNzEcrs5IWs,4291
+nucliadb-6.3.4.post3675.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
+nucliadb-6.3.4.post3675.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.3.4.post3675.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.3.4.post3675.dist-info/RECORD,,

{nucliadb-6.3.4.post3656.dist-info → nucliadb-6.3.4.post3675.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (76.0.0)
+Generator: setuptools (76.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nucliadb-6.3.4.post3656.dist-info → nucliadb-6.3.4.post3675.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{nucliadb-6.3.4.post3656.dist-info → nucliadb-6.3.4.post3675.dist-info}/top_level.txt RENAMED Viewed

File without changes

nucliadb 6.3.4.post3656__py3-none-any.whl → 6.3.4.post3675__py3-none-any.whl

nucliadb 6.3.4.post3656py3-none-any.whl → 6.3.4.post3675py3-none-any.whl