nucliadb-6.7.0.post4800-py3-none-any.whl → nucliadb-6.7.0.post4805-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
--- a/nucliadb/ingest/fields/base.py
+++ b/nucliadb/ingest/fields/base.py
@@ -332,37 +332,54 @@ class Field(Generic[PbType]):
         self.question_answers = actual_payload
 
     async def set_extracted_text(self, payload: ExtractedTextWrapper) -> None:
+        actual_payload: Optional[ExtractedText] = None
         if self.type in SUBFIELDFIELDS:
+            # Try to get the previously extracted text protobuf if it exists so we can merge it with the new splits
+            # coming from the processing payload.
             try:
-                actual_payload: Optional[ExtractedText] = await self.get_extracted_text(force=True)
+                actual_payload = await self.get_extracted_text(force=True)
             except KeyError:
-                actual_payload = None
-        else:
-            actual_payload = None
-        sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
+                # No previous extracted text found
+                pass
 
+        sf = self.get_storage_field(FieldTypes.FIELD_TEXT)
         if actual_payload is None:
-            # Its first extracted text
+            # No previous extracted text, this is the first time we set it so we can simply upload it to storage
             if payload.HasField("file"):
+                # Normalize the storage key if the payload is a reference to a file in storage.
+                # This is typically the case when the text is too large and we store it in a
+                # cloud file. Normalization is needed to ensure that the hybrid-onprem deployment stores
+                # the file in the correct bucket of its storage.
                 await self.storage.normalize_binary(payload.file, sf)
             else:
+                # Directly upload the ExtractedText protobuf to storage
                 await self.storage.upload_pb(sf, payload.body)
             self.extracted_text = payload.body
         else:
             if payload.HasField("file"):
+                # The extracted text coming from processing has a reference to another storage key.
+                # Download it and copy it to its ExtractedText.body field. This is typically for cases
+                # when the text is too large.
                 raw_payload = await self.storage.downloadbytescf(payload.file)
                 pb = ExtractedText()
                 pb.ParseFromString(raw_payload.read())
                 raw_payload.flush()
                 payload.body.CopyFrom(pb)
-            # We know its payload.body
+
+            # Update or set the extracted text text for each split coming from the processing payload
             for key, value in payload.body.split_text.items():
                 actual_payload.split_text[key] = value
+
+            # Apply any split deletions that may come in the processing payload
            for key in payload.body.deleted_splits:
                 if key in actual_payload.split_text:
                     del actual_payload.split_text[key]
+
+            # Finally, handle the main text body (for the cases where the text is not split)
             if payload.body.text != "":
                 actual_payload.text = payload.body.text
+
+            # Upload the updated ExtractedText to storage
             await self.storage.upload_pb(sf, actual_payload)
             self.extracted_text = actual_payload
 
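The hunk above reworks `set_extracted_text` so that, for split fields, splits coming from processing are merged into any previously extracted text instead of replacing it. A minimal standalone sketch of that merge logic, using plain dicts in place of the ExtractedText protobuf (the field names `text`, `split_text` and `deleted_splits` come from the diff; the function itself is illustrative):

```python
from typing import Optional


def merge_extracted_text(previous: Optional[dict], incoming: dict) -> dict:
    """Both dicts mimic ExtractedText: {"text": str, "split_text": dict, "deleted_splits": list}."""
    if previous is None:
        # First extraction for this field: store the incoming payload as-is.
        return incoming
    # Update or add each split coming from the processing payload.
    previous["split_text"].update(incoming["split_text"])
    # Apply any split deletions requested by processing.
    for key in incoming["deleted_splits"]:
        previous["split_text"].pop(key, None)
    # Replace the main text body only when processing sent one.
    if incoming["text"] != "":
        previous["text"] = incoming["text"]
    return previous
```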
--- a/nucliadb/ingest/fields/conversation.py
+++ b/nucliadb/ingest/fields/conversation.py
@@ -26,8 +26,9 @@ from nucliadb_protos.resources_pb2 import Conversation as PBConversation
 from nucliadb_utils.storages.storage import StorageField
 
 PAGE_SIZE = 200
-KB_RESOURCE_FIELD = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
-KB_RESOURCE_FIELD_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
+
+CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
+CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"
 
 
 class PageNotFound(Exception):
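The renamed constants are plain format-string templates for the keys under which conversation pages and conversation metadata are stored; the page key is the metadata key plus a trailing page component. A quick illustration (all identifier values here are made up):

```python
CONVERSATION_PAGE_VALUE = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}/{page}"
CONVERSATION_METADATA = "/kbs/{kbid}/r/{uuid}/f/{type}/{field}"

page_key = CONVERSATION_PAGE_VALUE.format(
    kbid="kb-1", uuid="res-1", type="c", field="chat", page=3
)
assert page_key == "/kbs/kb-1/r/res-1/f/c/chat/3"

meta_key = CONVERSATION_METADATA.format(kbid="kb-1", uuid="res-1", type="c", field="chat")
assert meta_key == "/kbs/kb-1/r/res-1/f/c/chat"
```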
@@ -54,56 +55,65 @@ class Conversation(Field[PBConversation]):
         self.metadata = None
 
     async def set_value(self, payload: PBConversation):
-        last_page: Optional[PBConversation] = None
         metadata = await self.get_metadata()
         metadata.extract_strategy = payload.extract_strategy
         metadata.split_strategy = payload.split_strategy
 
+        # Get the last page if it exists
+        last_page: Optional[PBConversation] = None
         if self._created is False and metadata.pages > 0:
             try:
                 last_page = await self.db_get_value(page=metadata.pages)
             except PageNotFound:
                 pass
+        if last_page is None:
+            last_page = PBConversation()
+            metadata.pages += 1
 
-        # Make sure message attachment files are on our region
+        # Make sure message attachment files are on our region. This is needed
+        # to support the hybrid-onprem deployment as the attachments must be stored
+        # at the storage services of the client's premises.
         for message in payload.messages:
             new_message_files = []
-            for count, file in enumerate(message.content.attachments):
+            for idx, file in enumerate(message.content.attachments):
                 if self.storage.needs_move(file, self.kbid):
                     if message.ident == "":
                         message_ident = uuid.uuid4().hex
                     else:
                         message_ident = message.ident
-                    sf: StorageField = self.storage.conversation_field(
-                        self.kbid, self.uuid, self.id, message_ident, count
+                    sf: StorageField = self.storage.conversation_field_attachment(
+                        self.kbid, self.uuid, self.id, message_ident, attachment_index=idx
                     )
                     cf: CloudFile = await self.storage.normalize_binary(file, sf)
                     new_message_files.append(cf)
                 else:
                     new_message_files.append(file)
 
-            # Can be cleaned a list of PB
+            # Replace the attachments in the message with the new ones
             message.content.ClearField("attachments")
             for message_file in new_message_files:
                 message.content.attachments.append(message_file)
 
-        if last_page is None:
-            last_page = PBConversation()
-            metadata.pages += 1
-
-        # Merge on last page
+        # Increment the metadata total with the number of messages
         messages = list(payload.messages)
         metadata.total += len(messages)
+
+        # Store the messages in pages of PAGE_SIZE messages
         while len(messages) > 0:
-            count = metadata.size - len(last_page.messages)
-            last_page.messages.extend(messages[:count])
+            # Fit the messages in the last page
+            available_space = metadata.size - len(last_page.messages)
+            last_page.messages.extend(messages[:available_space])
+
+            # Save the last page
             await self.db_set_value(last_page, metadata.pages)
 
-            messages = messages[count:]
+            # If there are still messages, create a new page
+            messages = messages[available_space:]
             if len(messages) > 0:
                 metadata.pages += 1
                 last_page = PBConversation()
 
+        # Finally, set the metadata
         await self.db_set_metadata(metadata)
 
     async def get_value(self, page: Optional[int] = None) -> Optional[PBConversation]:
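The reworked `set_value` always materializes a last page up front, then spills incoming messages into pages of at most `metadata.size` messages (PAGE_SIZE by default). The paging loop is easier to follow in isolation; here is a self-contained sketch with plain lists standing in for PBConversation pages (only `available_space` is a name from the diff, the rest is illustrative):

```python
from typing import List


def paginate(pages: List[List[str]], new_messages: List[str], size: int = 200) -> List[List[str]]:
    # Ensure there is a last page to append to, mirroring the diff's
    # "if last_page is None: last_page = PBConversation()" step.
    if not pages:
        pages.append([])
    last_page = pages[-1]
    messages = list(new_messages)
    while messages:
        # Fit as many messages as the last page can still hold.
        available_space = size - len(last_page)
        last_page.extend(messages[:available_space])
        messages = messages[available_space:]
        # If there are still messages, open a fresh page.
        if messages:
            last_page = []
            pages.append(last_page)
    return pages


pages = paginate([], [f"m{i}" for i in range(450)], size=200)
assert [len(p) for p in pages] == [200, 200, 50]
```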
@@ -139,7 +149,7 @@ class Conversation(Field[PBConversation]):
     async def get_metadata(self) -> FieldConversation:
         if self.metadata is None:
             payload = await self.resource.txn.get(
-                KB_RESOURCE_FIELD_METADATA.format(
+                CONVERSATION_METADATA.format(
                     kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
                 )
             )
@@ -158,7 +168,7 @@ class Conversation(Field[PBConversation]):
             raise ValueError(f"Conversation pages start at index 1")
 
         if self.value.get(page) is None:
-            field_key = KB_RESOURCE_FIELD.format(
+            field_key = CONVERSATION_PAGE_VALUE.format(
                 kbid=self.kbid,
                 uuid=self.uuid,
                 type=self.type,
@@ -174,7 +184,7 @@ class Conversation(Field[PBConversation]):
         return self.value[page]
 
     async def db_set_value(self, payload: PBConversation, page: int = 0):
-        field_key = KB_RESOURCE_FIELD.format(
+        field_key = CONVERSATION_PAGE_VALUE.format(
             kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id, page=page
         )
         await self.resource.txn.set(
@@ -186,9 +196,7 @@ class Conversation(Field[PBConversation]):
 
     async def db_set_metadata(self, payload: FieldConversation):
         await self.resource.txn.set(
-            KB_RESOURCE_FIELD_METADATA.format(
-                kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id
-            ),
+            CONVERSATION_METADATA.format(kbid=self.kbid, uuid=self.uuid, type=self.type, field=self.id),
             payload.SerializeToString(),
         )
         self.metadata = payload
--- a/nucliadb/reader/api/v1/download.py
+++ b/nucliadb/reader/api/v1/download.py
@@ -174,7 +174,9 @@ async def download_field_conversation_rslug_prefix(
     message_id: str,
     file_num: int,
 ) -> Response:
-    return await _download_field_conversation(request, kbid, field_id, message_id, file_num, rslug=rslug)
+    return await _download_field_conversation_attachment(
+        request, kbid, field_id, message_id, file_num, rslug=rslug
+    )
 
 
 @api.get(
@@ -185,7 +187,7 @@ async def download_field_conversation_rslug_prefix(
 )
 @requires_one([NucliaDBRoles.READER])
 @version(1)
-async def download_field_conversation_rid_prefix(
+async def download_field_conversation_attachment_rid_prefix(
     request: Request,
     kbid: str,
     rid: str,
@@ -193,10 +195,12 @@ async def download_field_conversation_rid_prefix(
     message_id: str,
     file_num: int,
 ) -> Response:
-    return await _download_field_conversation(request, kbid, field_id, message_id, file_num, rid=rid)
+    return await _download_field_conversation_attachment(
+        request, kbid, field_id, message_id, file_num, rid=rid
+    )
 
 
-async def _download_field_conversation(
+async def _download_field_conversation_attachment(
     request: Request,
     kbid: str,
     field_id: str,
@@ -209,7 +213,9 @@ async def _download_field_conversation(
 
     storage = await get_storage(service_name=SERVICE_NAME)
 
-    sf = storage.conversation_field(kbid, rid, field_id, message_id, file_num)
+    sf = storage.conversation_field_attachment(
+        kbid, rid, field_id, message_id, attachment_index=file_num
+    )
 
     return await download_api(sf, request.headers)
 
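Both the reader endpoint above and the ingest/writer call sites now go through `conversation_field_attachment` with an explicit `attachment_index` keyword instead of a positional `count`. A hypothetical stand-in showing why the keyword makes call sites unambiguous (the real method and key layout live in nucliadb_utils and may differ):

```python
def conversation_field_attachment(
    kbid: str, rid: str, field: str, ident: str, *, attachment_index: int
) -> str:
    # Returns the storage key for one attachment of one conversation message.
    # The key layout below is illustrative only.
    return f"kbs/{kbid}/r/{rid}/f/{field}/{ident}/{attachment_index}"


# The keyword-only argument forces the intent to be spelled out at the call site:
key = conversation_field_attachment("kb-1", "res-1", "chat", "msg-1", attachment_index=0)
assert key == "kbs/kb-1/r/res-1/f/chat/msg-1/0"
```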
--- a/nucliadb/writer/resource/field.py
+++ b/nucliadb/writer/resource/field.py
@@ -466,9 +466,9 @@ async def parse_conversation_field(
                ]
             )
 
-            for count, file in enumerate(message.content.attachments):
-                sf_conv_field: StorageField = storage.conversation_field(
-                    kbid, uuid, field=key, ident=message.ident, count=count
+            for idx, file in enumerate(message.content.attachments):
+                sf_conv_field: StorageField = storage.conversation_field_attachment(
+                    kbid, uuid, field=key, ident=message.ident, attachment_index=idx
                 )
                 cf_conv_field = await storage.upload_b64file_to_cloudfile(
                     sf_conv_field,
--- a/nucliadb-6.7.0.post4800.dist-info/METADATA
+++ b/nucliadb-6.7.0.post4805.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nucliadb
-Version: 6.7.0.post4800
+Version: 6.7.0.post4805
 Summary: NucliaDB
 Author-email: Nuclia <nucliadb@nuclia.com>
 License-Expression: AGPL-3.0-or-later
@@ -19,11 +19,11 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: <4,>=3.9
 Description-Content-Type: text/markdown
-Requires-Dist: nucliadb-telemetry[all]>=6.7.0.post4800
-Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.7.0.post4800
-Requires-Dist: nucliadb-protos>=6.7.0.post4800
-Requires-Dist: nucliadb-models>=6.7.0.post4800
-Requires-Dist: nidx-protos>=6.7.0.post4800
+Requires-Dist: nucliadb-telemetry[all]>=6.7.0.post4805
+Requires-Dist: nucliadb-utils[cache,fastapi,storages]>=6.7.0.post4805
+Requires-Dist: nucliadb-protos>=6.7.0.post4805
+Requires-Dist: nucliadb-models>=6.7.0.post4805
+Requires-Dist: nidx-protos>=6.7.0.post4805
 Requires-Dist: nucliadb-admin-assets>=1.0.0.post1224
 Requires-Dist: nuclia-models>=0.46.0
 Requires-Dist: uvicorn[standard]
--- a/nucliadb-6.7.0.post4800.dist-info/RECORD
+++ b/nucliadb-6.7.0.post4805.dist-info/RECORD
@@ -148,8 +148,8 @@ nucliadb/ingest/consumer/service.py,sha256=8AD41mMN7EUeUtk4ZNy14zfvxzwmVjIX6Mwe0
 nucliadb/ingest/consumer/shard_creator.py,sha256=w0smEu01FU_2cjZnsfBRNqT_Ntho11X17zTMST-vKbc,4359
 nucliadb/ingest/consumer/utils.py,sha256=jpX8D4lKzuPCpArQLZeX_Zczq3pfen_zAf8sPJfOEZU,2642
 nucliadb/ingest/fields/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
-nucliadb/ingest/fields/base.py,sha256=vYLGB-8SRYnFIHSZBSm20iXZDAzlwuBiJQC8s3BQv6w,22958
-nucliadb/ingest/fields/conversation.py,sha256=PGOr7l6cvg-l1HMr-5UIJq53R6W93F5uew0fOrSufPI,7140
+nucliadb/ingest/fields/base.py,sha256=O9rMKZmSdZmcxtuO6qUXjeoAkmUR5A-xga49Jyn6caI,24192
+nucliadb/ingest/fields/conversation.py,sha256=W68Y7riDXbwavpE4jKslLNAjdqCVaaR2a-BisZYayeo,7675
 nucliadb/ingest/fields/exceptions.py,sha256=sZBk21BSrXFdOdo1qUdCAyD-9YMYakSLdn4_WdIPCIQ,1217
 nucliadb/ingest/fields/file.py,sha256=1v4jLg3balUua2VmSV8hHkAwPFShTUCOzufZvIUQcQw,4740
 nucliadb/ingest/fields/generic.py,sha256=elgtqv15aJUq3zY7X_g0bli_2BpcwPArVvzhe54Y4Ig,1547
@@ -199,7 +199,7 @@ nucliadb/reader/run.py,sha256=AR-iCnON3YVXgI5-KEgg99G4KAPN1BKXDg7nr4dgoDA,1447
 nucliadb/reader/api/__init__.py,sha256=c-UD29C0FVzQDGEvslebDCKtvnyEcAbiDd-3Q_QgGN4,872
 nucliadb/reader/api/models.py,sha256=UHhOPmh8xcHhDjFm8_-8t66yEggXdxRBoY0xV-hI9to,2076
 nucliadb/reader/api/v1/__init__.py,sha256=ieP8lsCCwG66Jupv8II5MSTj6nh3eCtLcF4utH9JOcU,1102
-nucliadb/reader/api/v1/download.py,sha256=rGv1c5CjrJheDgGwAnNWy76A_4V2shqqHYvwmKGFlpk,10758
+nucliadb/reader/api/v1/download.py,sha256=A4ntoyPbLQskORrVSGnnsMAmCZiQ-Mufi5j2u4lXeQQ,10872
 nucliadb/reader/api/v1/export_import.py,sha256=x4VBNDFjnlY1nIt5kdq0eZTB_DeRzGzT8T7uB7wUhNU,6448
 nucliadb/reader/api/v1/knowledgebox.py,sha256=Uu-yPB8KKZt1VaFrFNMMaXOvLsclBJDK9dzZ9lF2ctI,3645
 nucliadb/reader/api/v1/learning_config.py,sha256=t_KqQBBbhpo0m6nQTkYmNdZsLVmW53SLcHMrCWiQMrk,6536
@@ -365,7 +365,7 @@ nucliadb/writer/api/v1/vectorsets.py,sha256=F3iMViL5G95_Tns4aO2SOA0DwAzxK2_P8MXx
 nucliadb/writer/resource/__init__.py,sha256=cp15ZcFnHvpcu_5-aK2A4uUyvuZVV_MJn4bIXMa20ks,835
 nucliadb/writer/resource/audit.py,sha256=FvxMZPzrNHtd31HgpZEvxzwAkbxJTZRhPLqRYYJi3tA,1426
 nucliadb/writer/resource/basic.py,sha256=44GK8M9EEVoAUfGiabdLrrpENqeFwNn7qwxF2AHhQGg,10504
-nucliadb/writer/resource/field.py,sha256=me3Ig-11evXEJl3UNJsmZyHbs_VZYMa-WIKOHZPsO3E,21175
+nucliadb/writer/resource/field.py,sha256=1fHm2N-3LcgQDE3BUDiJ0ToCiwY1Di7UbA6fcJs8BIk,21193
 nucliadb/writer/resource/origin.py,sha256=pvhUDdU0mlWPUcpoQi4LDUJaRtfjzVVrA8XcGVI_N8k,2021
 nucliadb/writer/tus/__init__.py,sha256=Kera0BtxoDX0ngPftXiMjNgjrhtQ3l2XFc5nJqSBOJY,5498
 nucliadb/writer/tus/azure.py,sha256=XhWAlWTM0vmXcXtuEPYjjeEhuZjiZXZu8q9WsJ7omFE,4107
@@ -376,8 +376,8 @@ nucliadb/writer/tus/local.py,sha256=7jYa_w9b-N90jWgN2sQKkNcomqn6JMVBOVeDOVYJHto,
 nucliadb/writer/tus/s3.py,sha256=vu1BGg4VqJ_x2P1u2BxqPKlSfw5orT_a3R-Ln5oPUpU,8483
 nucliadb/writer/tus/storage.py,sha256=ToqwjoYnjI4oIcwzkhha_MPxi-k4Jk3Lt55zRwaC1SM,2903
 nucliadb/writer/tus/utils.py,sha256=MSdVbRsRSZVdkaum69_0wku7X3p5wlZf4nr6E0GMKbw,2556
-nucliadb-6.7.0.post4800.dist-info/METADATA,sha256=p5YogXctzlcnUaCPjJowa0_MXNaVlxDYJdow5vfEqrs,4158
-nucliadb-6.7.0.post4800.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nucliadb-6.7.0.post4800.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
-nucliadb-6.7.0.post4800.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
-nucliadb-6.7.0.post4800.dist-info/RECORD,,
+nucliadb-6.7.0.post4805.dist-info/METADATA,sha256=uVCY9B5etvUvVS9QDwmPSGBJfJdGiMnOe0HRsHqh0JU,4158
+nucliadb-6.7.0.post4805.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nucliadb-6.7.0.post4805.dist-info/entry_points.txt,sha256=XqGfgFDuY3zXQc8ewXM2TRVjTModIq851zOsgrmaXx4,1268
+nucliadb-6.7.0.post4805.dist-info/top_level.txt,sha256=hwYhTVnX7jkQ9gJCkVrbqEG1M4lT2F_iPQND1fCzF80,20
+nucliadb-6.7.0.post4805.dist-info/RECORD,,