sunholo 0.118.10__py3-none-any.whl → 0.119.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sunholo/discovery_engine/chunker_handler.py

@@ -65,8 +65,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
         return None
     for corp in corpuses:
         try:
-            response = corp.import_documents(
-                gcs_uri=message_data
+            response = corp.import_document_with_metadata(
+                gcs_uri=message_data,
+                metadata=metadata
             )
             log.info(f"Imported file to corpus: {response} with metadata: {metadata}")
         except Exception as err:
@@ -81,8 +82,9 @@ def do_discovery_engine(message_data:str, metadata:dict, config:ConfigManager=No
             continue
         if new_corp:
             log.info(f"Found new DiscoveryEngine {config.vector_name=} - {new_corp=}")
-            response = corp.import_documents(
-                gcs_uri=message_data
+            response = corp.import_document_with_metadata(
+                gcs_uri=message_data,
+                metadata=metadata
             )

             continue
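With this release `do_discovery_engine` forwards the Pub/Sub metadata to Discovery Engine instead of importing the bare GCS URI. A minimal sketch of the new call site, assuming `corp` is a configured `DiscoveryEngineClient`; the bucket path and metadata values are hypothetical:

```python
# Sketch only: "corp" is a DiscoveryEngineClient as in do_discovery_engine();
# the bucket path and metadata dict below are hypothetical examples.
operation_name = corp.import_document_with_metadata(
    gcs_uri="gs://my-bucket/docs/report.pdf",
    metadata={"source": "pubsub", "vector_name": "my_vac"},
)
```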
sunholo/discovery_engine/create_new.py

@@ -1,6 +1,7 @@
 from .discovery_engine_client import DiscoveryEngineClient
 from ..utils import ConfigManager
 from ..utils.gcp_project import get_gcp_project
+from ..custom_logging import log

 def create_new_discovery_engine(config:ConfigManager):

@@ -12,9 +13,13 @@ def create_new_discovery_engine(config:ConfigManager):
     chunk_size = chunker_config["chunk_size"]

     gcp_config = config.vacConfig("gcp_config")
-    project_id = gcp_config.get("project_id") or get_gcp_project()
+    if not gcp_config:
+        log.info("Found no gcp_config in configuration so using get_gcp_project()")
+        project_id = get_gcp_project()
+    else:
+        project_id = gcp_config.get("project_id") or get_gcp_project()
     if not project_id:
-        raise ValueError("Could not find project_id in gcp_config")
+        raise ValueError("Could not find project_id in gcp_config or global")

     #location = gcp_config.get('location')

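The guard matters because `config.vacConfig("gcp_config")` can return `None`, and the old line then raised `AttributeError` on `.get()` before the `ValueError` could fire. The resolution order is now config value first, then environment lookup. A standalone sketch of the same pattern (the helper name is hypothetical):

```python
# Hypothetical helper illustrating the fallback order used above.
from typing import Callable, Optional

def resolve_project_id(gcp_config: Optional[dict],
                       env_lookup: Callable[[], Optional[str]]) -> str:
    # A missing gcp_config previously meant None.get(...) -> AttributeError;
    # now it falls back to the environment lookup (get_gcp_project above).
    project_id = (gcp_config or {}).get("project_id") or env_lookup()
    if not project_id:
        raise ValueError("Could not find project_id in gcp_config or global")
    return project_id
```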
sunholo/discovery_engine/discovery_engine_client.py

@@ -1,6 +1,6 @@
 try:
     from google.api_core.client_options import ClientOptions
-    from google.cloud import discoveryengine_v1alpha as discoveryengine
+    from google.cloud import discoveryengine
     from google.api_core.retry import Retry, if_exception_type
     from google.api_core.exceptions import ResourceExhausted, AlreadyExists
     from google.cloud.discoveryengine_v1alpha import SearchResponse, Chunk
@@ -13,6 +13,9 @@ except ImportError:
 from ..custom_logging import log
 from typing import Optional, List
 import asyncio
+import json
+import uuid
+from ..utils.mime import guess_mime_type

 class DiscoveryEngineClient:
     """
@@ -431,7 +434,37 @@ class DiscoveryEngineClient:

         return operation.operation.name

+    def _import_document_request(self,
+                                 request
+                                 ) -> str:
+        """
+        Handles the common logic for making an ImportDocumentsRequest, including retrying.

+        Args:
+            request (discoveryengine.ImportDocumentsRequest): The prepared request object.
+
+        Returns:
+            str: The operation name.
+        """
+        @self.my_retry()
+        def import_documents_with_retry(doc_client, request):
+            return doc_client.import_documents(request=request)
+
+        try:
+            operation = import_documents_with_retry(self.doc_client, request)
+        except ResourceExhausted as e:
+            log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
+            raise e
+        except AlreadyExists as e:
+            # Extract relevant info from the request to log
+            gcs_uri = request.gcs_source.input_uris if request.gcs_source else None
+            bigquery_table = request.bigquery_source.table_id if request.bigquery_source else None
+            log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")
+        except Exception as e:
+            log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
+            raise e
+
+        return operation.operation.name

     def import_documents(self,
                          gcs_uri: Optional[str] = None,
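`_import_document_request` centralises the retry-and-log logic that `import_documents` previously inlined. The `my_retry()` factory it decorates with is defined elsewhere in the class; given the `Retry` and `if_exception_type` imports at the top of the file, a plausible shape is sketched below (the timing values are hypothetical, not the package's actual settings):

```python
from google.api_core.retry import Retry, if_exception_type
from google.api_core.exceptions import ResourceExhausted

# Sketch of a retry factory matching the @self.my_retry() usage above.
# All numbers are hypothetical; only ResourceExhausted is retried, so
# AlreadyExists and other errors surface immediately to the except blocks.
def my_retry() -> Retry:
    return Retry(
        predicate=if_exception_type(ResourceExhausted),  # retry quota errors only
        initial=1.0,      # first backoff, seconds
        maximum=60.0,     # backoff ceiling
        multiplier=2.0,   # exponential growth
        timeout=300.0,    # give up after 5 minutes in total
    )
```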
@@ -479,23 +512,115 @@ class DiscoveryEngineClient:
         )

         # Make the request
-        @self.my_retry()
-        def import_documents_with_retry(doc_client, request):
-            return doc_client.import_documents(request=request)
-
-        try:
-            operation = import_documents_with_retry(self.doc_client, request)
-        except ResourceExhausted as e:
-            log.error(f"DiscoveryEngine Operation failed after retries due to quota exceeded: {e}")
+        return self._import_document_request(request)

-            raise e
-        except AlreadyExists as e:
-            log.warning(f"DiscoveryEngine - Already exists: {e} - {gcs_uri=} {bigquery_table=}")

-        except Exception as e:
-            log.error(f"An unexpected DiscoveryEngine error occurred: {e}")
+    def import_documents_with_metadata(self, gcs_uri: str, data_schema="content", branch="default_branch"):
+        """
+        Supply a JSONLD GCS location to import all the GS URIs within and their metadata
+        """
+        parent = self.doc_client.branch_path(
+            self.project_id,
+            self.location,
+            self.data_store_id,
+            branch
+        )

-            raise e
+        # 1. Prepare your documents with metadata:
+        documents_with_metadata = []
+        with open(gcs_uri, 'r') as f: # Assuming one JSON object per line in your GCS file
+            for line in f:
+                try:
+                    document_data = json.loads(line) # Load the JSON from the line
+                    # Check if it has the required fields, if not create them
+                    if "id" not in document_data:
+                        document_data["id"] = str(uuid.uuid4())
+                    if "structData" not in document_data:
+                        document_data["structData"] = {}
+                    if "content" not in document_data:
+                        document_data["content"] = {}
+                    # Create the Document object with your metadata
+                    document = discoveryengine.Document(
+                        name = f"{parent}/documents/{document_data['id']}", # important!
+                        id=document_data["id"],
+                        struct_data=document_data.get("structData", {}), # Your metadata here
+                        content = discoveryengine.Content(
+                            mime_type = document_data.get("content", {}).get("mimeType", "text/plain"),
+                            uri = document_data.get("content", {}).get("uri", ""),
+                        )
+                    )
+
+                    if "jsonData" in document_data:
+                        document.json_data = document_data["jsonData"]
+
+                    documents_with_metadata.append(document)
+
+                except json.JSONDecodeError as e:
+                    log.error(f"Error decoding JSON in line: {line.strip()}. Error: {e}")
+                    continue # Skip to the next line if there's an error
+
+        # 2. Use InlineSource to import:
+        request = discoveryengine.ImportDocumentsRequest(
+            parent=parent,
+            inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
+                documents=documents_with_metadata, # Pass the list of Document objects
+            ),
+            reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
+        )

-        return operation.operation.name
+        return self._import_document_request(request)
+
+
+    def import_document_with_metadata(self, gcs_uri: str, metadata: dict, branch="default_branch"):
+        """
+        Imports a single document with metadata.
+
+        Args:
+            gcs_uri: The GCS URI of the document to import.
+            metadata: A dictionary containing the metadata for the document.
+            branch: The branch to import the document into.

+        Returns:
+            str: The operation name.
+        """
+        try:
+            # 1. Generate a unique document ID
+            document_id = str(uuid.uuid4())
+
+            # 2. Create a Document object
+            parent = self.doc_client.branch_path(
+                self.project_id, self.location, self.data_store_id, branch
+            )
+            document = discoveryengine.Document(
+                name=f"{parent}/documents/{document_id}",
+                id=document_id,
+                struct_data=metadata,
+                content=discoveryengine.Document.Content(
+                    uri=gcs_uri,
+                    mime_type=self.get_mime_type(gcs_uri)
+                )
+            )
+
+            # 3. Use InlineSource for import
+            request = discoveryengine.ImportDocumentsRequest(
+                parent=parent,
+                inline_source=discoveryengine.ImportDocumentsRequest.InlineSource(
+                    documents=[document],
+                ),
+                reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL
+            )
+
+            # 4. Make the import request (using the common method)
+            return self._import_document_request(request)
+
+        except Exception as e:
+            log.error(f"Error importing document with metadata: {e}")
+
+    def get_mime_type(self, uri:str):
+        return guess_mime_type(uri)
+
+    def search_with_filters(self, query, folder=None, date=None,
+                            num_previous_chunks=3, num_next_chunks=3,
+                            page_size=10, parse_chunks_to_string=True,
+                            serving_config="default_serving_config"):
+        pass
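Taken together, `import_document_with_metadata` covers single-file ingestion with the metadata dict stored as `struct_data`, `import_documents_with_metadata` covers bulk JSON-lines input (note it reads its `gcs_uri` argument with `open()`, so as released it expects a locally readable path), and `search_with_filters` is only a stub in this release. A hedged usage sketch; the constructor arguments and all values here are hypothetical:

```python
# Usage sketch only; DiscoveryEngineClient's constructor signature is assumed.
client = DiscoveryEngineClient(
    data_store_id="my-datastore",
    project_id="my-project",
)

# Single document: the metadata dict becomes struct_data on the Document,
# available for filtering at search time.
op_name = client.import_document_with_metadata(
    gcs_uri="gs://my-bucket/docs/report.pdf",
    metadata={"folder": "reports", "date": "2025-01-01"},
)

# Bulk: one JSON object per line, e.g.
# {"id": "doc-1", "structData": {"folder": "reports"},
#  "content": {"uri": "gs://my-bucket/docs/report.pdf", "mimeType": "application/pdf"}}
op_name = client.import_documents_with_metadata(gcs_uri="documents.jsonl")
```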
sunholo-0.118.10.dist-info/METADATA → sunholo-0.119.3.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sunholo
-Version: 0.118.10
+Version: 0.119.3
 Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
 Author-email: Holosun ApS <multivac@sunholo.com>
 License: Apache License, Version 2.0
@@ -47,7 +47,7 @@ Requires-Dist: google-cloud-service-control; extra == "all"
 Requires-Dist: google-cloud-logging; extra == "all"
 Requires-Dist: google-cloud-storage; extra == "all"
 Requires-Dist: google-cloud-pubsub; extra == "all"
-Requires-Dist: google-cloud-discoveryengine; extra == "all"
+Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "all"
 Requires-Dist: google-cloud-texttospeech; extra == "all"
 Requires-Dist: google-generativeai>=0.7.1; extra == "all"
 Requires-Dist: google-genai; extra == "all"
@@ -133,7 +133,7 @@ Requires-Dist: google-cloud-service-control; extra == "gcp"
 Requires-Dist: google-cloud-storage; extra == "gcp"
 Requires-Dist: google-cloud-logging; extra == "gcp"
 Requires-Dist: google-cloud-pubsub; extra == "gcp"
-Requires-Dist: google-cloud-discoveryengine; extra == "gcp"
+Requires-Dist: google-cloud-discoveryengine>=0.13.4; extra == "gcp"
 Requires-Dist: google-cloud-texttospeech; extra == "gcp"
 Requires-Dist: google-genai; extra == "gcp"
 Requires-Dist: google-generativeai>=0.8.3; extra == "gcp"
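The new floor on `google-cloud-discoveryengine` matches the GA `discoveryengine` namespace the client now imports; installing with the `gcp` or `all` extra (e.g. `pip install "sunholo[gcp]"`) should therefore pull in 0.13.4 or later.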
sunholo-0.118.10.dist-info/RECORD → sunholo-0.119.3.dist-info/RECORD

@@ -72,9 +72,9 @@ sunholo/database/sql/sb/delete_source_row.sql,sha256=r6fEuUKdbiLHCDGKSbKINDCpJjs
 sunholo/database/sql/sb/return_sources.sql,sha256=89KAnxfK8n_qGK9jy1OQT8f9n4uYUtYL5cCxbC2mj_c,255
 sunholo/database/sql/sb/setup.sql,sha256=CvoFvZQev2uWjmFa3aj3m3iuPFzAAJZ0S7Qi3L3-zZI,89
 sunholo/discovery_engine/__init__.py,sha256=hLgqRDJ22Aov9o2QjAEfsVgnL3kMdM-g5p8RJ9OyKdQ,130
-sunholo/discovery_engine/chunker_handler.py,sha256=5tw5_jalNQosf7uFyCmsYA__VdNpWC1PPVVa420CzWU,5479
-sunholo/discovery_engine/create_new.py,sha256=jWg5LW-QpFE8zq50ShaQJB3Wu8loiWB0P4lRWaCHpss,1023
-sunholo/discovery_engine/discovery_engine_client.py,sha256=jfIayVUOPM4svGF1S5Kk60rIG-xSo_e3zOHtBRg0nZA,22002
+sunholo/discovery_engine/chunker_handler.py,sha256=E3z-rVUuhjDebJY6nderr9QBYe8CrjwKskwIkOa_e68,5591
+sunholo/discovery_engine/create_new.py,sha256=WUi4_xh_dFaGX3xA9jkNKZhaR6LCELjMPeRb0hyj4FU,1226
+sunholo/discovery_engine/discovery_engine_client.py,sha256=C9fz341ZFMPtVSvqw2DbAgosJ5r5-YjfigRK-uFsldY,27407
 sunholo/discovery_engine/get_ai_search_chunks.py,sha256=hsFGOQugSeTMPEaQ16XTs_D45F8NABBm2IsAEdTk7kQ,4316
 sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
 sunholo/embedder/embed_chunk.py,sha256=Vvvj3-H4pSb1a2sLik3-X3X459j2jrUq1dBNAsOcQLo,7156
@@ -164,9 +164,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
 sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
 sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
 sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
-sunholo-0.118.10.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
-sunholo-0.118.10.dist-info/METADATA,sha256=prYJcbvo8lNaYqKHCeNC-I2_9ObUb8hr_AZHt6YMRRY,9639
-sunholo-0.118.10.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-sunholo-0.118.10.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
-sunholo-0.118.10.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
-sunholo-0.118.10.dist-info/RECORD,,
+sunholo-0.119.3.dist-info/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
+sunholo-0.119.3.dist-info/METADATA,sha256=OmzF8MtMCAYTxGe_VmAh8Zv9eMFwfwQ5ZC85d0FtZHg,9654
+sunholo-0.119.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+sunholo-0.119.3.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
+sunholo-0.119.3.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
+sunholo-0.119.3.dist-info/RECORD,,