sunholo 0.123.5__py3-none-any.whl → 0.125.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/database/alloydb_client.py +98 -4
- sunholo/embedder/embed_metadata.py +126 -0
- sunholo/gcs/download_url.py +1 -1
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/METADATA +1 -1
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/RECORD +9 -9
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/WHEEL +0 -0
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/entry_points.txt +0 -0
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/licenses/LICENSE.txt +0 -0
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/top_level.txt +0 -0
sunholo/database/alloydb_client.py CHANGED

@@ -651,9 +651,102 @@ class AlloyDBClient:
 
         return result
 
+    def flatten_dict(self, nested_dict, parent_key='', separator='.'):
+        """
+        Flatten a nested dictionary into a single-level dictionary with dot notation for keys.
+
+        Args:
+            nested_dict (dict): The nested dictionary to flatten
+            parent_key (str): The parent key for the current recursion level
+            separator (str): The separator to use between key levels (default: '.')
+
+        Returns:
+            dict: A flattened dictionary with special handling for lists
+        """
+        flattened = {}
+
+        for key, value in nested_dict.items():
+            # Create the new key with parent_key if it exists
+            new_key = f"{parent_key}{separator}{key}" if parent_key else key
+
+            # If value is a dictionary, recursively flatten it
+            if isinstance(value, dict):
+                flattened.update(self.flatten_dict(value, new_key, separator))
+            # Handle lists containing dictionaries or other values
+            elif isinstance(value, list):
+                # Mark lists for special processing during database insertion
+                # We'll use a special format to indicate this is a list that needs expansion
+                flattened[new_key] = {
+                    "__is_expandable_list__": True,
+                    "items": value
+                }
+            else:
+                # For simple values, just add them with the new key
+                flattened[new_key] = value
+
+        return flattened
+
     async def write_data_to_table(self, table_name: str, data: dict, metadata: dict = None):
         """
-        Writes data to the specified table.
+        Writes data to the specified table, with special handling for expandable lists.
+
+        Args:
+            table_name (str): Name of the table
+            data (dict): Data to write to the table
+            metadata (dict, optional): Additional metadata to include
+
+        Returns:
+            List of results from SQL executions
+        """
+        # Find any expandable lists in the data
+        expandable_lists = {}
+        regular_data = {}
+
+        for key, value in data.items():
+            if isinstance(value, dict) and value.get("__is_expandable_list__", False):
+                expandable_lists[key] = value["items"]
+            else:
+                regular_data[key] = value
+
+        # If no expandable lists are found, do a simple insert
+        if not expandable_lists:
+            return await self._insert_single_row(table_name, regular_data, metadata)
+
+        # For expandable lists, we need to create multiple rows
+        results = []
+
+        # Create combinations of rows based on expandable lists
+        if expandable_lists:
+            # Get the first expandable list to start with
+            primary_list_key = next(iter(expandable_lists))
+            primary_list = expandable_lists[primary_list_key]
+
+            # For each item in the primary list, create a new row
+            for item_idx, item in enumerate(primary_list):
+                # Create a copy of the regular data
+                row_data = dict(regular_data)
+
+                # Add the current item from the primary list
+                if isinstance(item, dict):
+                    # If it's a dictionary, flatten it with the primary key as prefix
+                    flattened_item = self.flatten_dict(item, primary_list_key, "_")
+                    row_data.update(flattened_item)
+                else:
+                    # If it's a simple value, just add it with the list key
+                    row_data[primary_list_key] = item
+
+                # Add item index for reference
+                row_data[f"{primary_list_key}_index"] = item_idx
+
+                # Insert this row
+                result = await self._insert_single_row(table_name, row_data, metadata)
+                results.append(result)
+
+        return results
+
+    async def _insert_single_row(self, table_name: str, data: dict, metadata: dict = None):
+        """
+        Inserts a single row of data into the specified table.
 
         Args:
             table_name (str): Name of the table
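To see what the new expandable-list handling does end to end, here is a minimal standalone sketch of the same flattening logic, lifted out of the class so it runs without an AlloyDB connection; the invoice payload is hypothetical.

```python
# Standalone sketch of the flatten_dict logic above (hypothetical data, no DB needed).
def flatten_dict(nested_dict, parent_key='', separator='.'):
    flattened = {}
    for key, value in nested_dict.items():
        new_key = f"{parent_key}{separator}{key}" if parent_key else key
        if isinstance(value, dict):
            flattened.update(flatten_dict(value, new_key, separator))
        elif isinstance(value, list):
            # Lists are tagged so write_data_to_table can expand them into rows
            flattened[new_key] = {"__is_expandable_list__": True, "items": value}
        else:
            flattened[new_key] = value
    return flattened

doc = {"invoice": {"id": "A-1"}, "line_items": [{"sku": "x"}, {"sku": "y"}]}
print(flatten_dict(doc))
# {'invoice.id': 'A-1',
#  'line_items': {'__is_expandable_list__': True,
#                 'items': [{'sku': 'x'}, {'sku': 'y'}]}}
```

write_data_to_table would turn this payload into two rows, each carrying the shared invoice.id plus a line_items_sku and line_items_index column (note that list items are flattened with "_" rather than "."). As written, only the first expandable list found is expanded; any further tagged lists are separated out of regular_data and so never reach the inserted rows.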
@@ -663,14 +756,15 @@ class AlloyDBClient:
         Returns:
             Result of SQL execution
         """
+
         # Create copies to avoid modifying the original data
         insert_data = dict(data)
 
         # Add metadata if provided
        if metadata:
-            insert_data["source"] = metadata.get("objectId", metadata.get("source", "
-            insert_data["extraction_backend"] = metadata.get("extraction_backend", "
-            insert_data["extraction_model"] = metadata.get("extraction_model", "
+            insert_data["source"] = metadata.get("objectId", metadata.get("source", "not-in-metadata"))
+            insert_data["extraction_backend"] = metadata.get("extraction_backend", "not-in-metadata")
+            insert_data["extraction_model"] = metadata.get("extraction_model", "not-in-metadata")
 
         # Prepare column names and values for SQL
         columns = [f'"{key}"' for key in insert_data.keys()]
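The removed lines are truncated in this diff view, so the old fallback values are not recoverable here; the new fallback chain, sketched with a hypothetical metadata dict:

```python
# Hypothetical metadata dict; only objectId is present.
metadata = {"objectId": "gs://bucket/invoice.pdf"}

insert_data = {}
insert_data["source"] = metadata.get("objectId", metadata.get("source", "not-in-metadata"))
insert_data["extraction_backend"] = metadata.get("extraction_backend", "not-in-metadata")
insert_data["extraction_model"] = metadata.get("extraction_model", "not-in-metadata")

print(insert_data)
# {'source': 'gs://bucket/invoice.pdf',
#  'extraction_backend': 'not-in-metadata',
#  'extraction_model': 'not-in-metadata'}
```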
sunholo/embedder/embed_metadata.py CHANGED

@@ -1,5 +1,9 @@
 
 import datetime
+import re
+
+from ..utils.mime import guess_mime_type
+
 from ..custom_logging import log
 
 def audit_metadata(metadata, chunk_length=None):
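The new guess_mime_type import feeds metadata['mime_type'] further down in this file. Its exact behaviour isn't shown in this diff; as a rough stand-in, the stdlib performs the same extension-based guess:

```python
import mimetypes

# Rough stdlib equivalent of an extension-based MIME guess (assumption:
# sunholo's guess_mime_type behaves similarly for common file types).
print(mimetypes.guess_type("report.pdf")[0])   # application/pdf
print(mimetypes.guess_type("photo.jpeg")[0])   # image/jpeg
```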
@@ -8,6 +12,24 @@ def audit_metadata(metadata, chunk_length=None):
     metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
     metadata['eventtime'] = metadata['eventTime']
 
+    # Extract time-based dimensions from eventTime
+    try:
+        # Handle timestamps in ISO format with Z suffix
+        event_time_str = metadata['eventTime']
+        if event_time_str.endswith('Z'):
+            event_time_str = event_time_str[:-1]  # Remove the Z suffix
+
+        event_time = datetime.datetime.fromisoformat(event_time_str)
+
+        # Add year dimension (e.g., 2025)
+        metadata['year'] = str(event_time.year)
+        # Add yearMonth dimension (e.g., 2025-03)
+        metadata['yearMonth'] = f"{event_time.year}-{event_time.month:02d}"
+        # Add month dimension (e.g., 03)
+        metadata['month'] = f"{event_time.month:02d}"
+    except (ValueError, TypeError) as e:
+        log.warning(f"Could not parse eventTime for time dimensions: {metadata['eventTime']}, error: {e}")
+
     if 'source' not in metadata:
         if 'objectId' in metadata:
             metadata['source'] = metadata['objectId']
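A quick check of what the time-dimension block produces for a hypothetical eventTime. Stripping the trailing Z matters because datetime.fromisoformat() only accepts the Z suffix from Python 3.11 onward:

```python
import datetime

# Hypothetical eventTime in the same ISO-with-Z format audit_metadata writes
event_time_str = "2025-03-14T09:26:53.589793Z"
if event_time_str.endswith('Z'):
    event_time_str = event_time_str[:-1]

event_time = datetime.datetime.fromisoformat(event_time_str)
print(str(event_time.year))                         # '2025'    -> metadata['year']
print(f"{event_time.year}-{event_time.month:02d}")  # '2025-03' -> metadata['yearMonth']
print(f"{event_time.month:02d}")                    # '03'      -> metadata['month']
```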
@@ -23,5 +45,109 @@ def audit_metadata(metadata, chunk_length=None):
 
     if 'chunk_length' not in metadata:
         metadata['chunk_length'] = chunk_length
+
+    # Extract folder paths from source field
+    if 'source' in metadata and metadata['source']:
+        source_path = metadata['source']
+
+        metadata['mime_type'] = guess_mime_type(source_path)
+
+        # Extract file extension
+        if '.' in source_path.split('/')[-1]:
+            file_extension = source_path.split('/')[-1].split('.')[-1].lower()
+            metadata['file_extension'] = file_extension
+
+            # Add file type category
+            if file_extension in ['pdf', 'doc', 'docx', 'txt', 'rtf', 'odt']:
+                metadata['file_type'] = 'document'
+            elif file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']:
+                metadata['file_type'] = 'image'
+            elif file_extension in ['mp3', 'wav', 'ogg', 'flac', 'm4a']:
+                metadata['file_type'] = 'audio'
+            elif file_extension in ['mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm']:
+                metadata['file_type'] = 'video'
+            elif file_extension in ['xls', 'xlsx', 'csv']:
+                metadata['file_type'] = 'spreadsheet'
+            elif file_extension in ['ppt', 'pptx']:
+                metadata['file_type'] = 'presentation'
+            elif file_extension in ['zip', 'rar', 'tar', 'gz', '7z']:
+                metadata['file_type'] = 'archive'
+            elif file_extension in ['html', 'htm', 'xml', 'json', 'yaml', 'yml']:
+                metadata['file_type'] = 'markup'
+            elif file_extension in ['py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php']:
+                metadata['file_type'] = 'code'
+            else:
+                metadata['file_type'] = 'other'
+
+        # Check if the source looks like a GCS path
+        if source_path.startswith('gs://'):
+            # Remove the gs:// prefix
+            path_without_prefix = source_path[5:]
+
+            # Split the path into components
+            path_components = path_without_prefix.split('/')
+
+            # The first component is the bucket name
+            if len(path_components) > 0:
+                metadata['bucket_name'] = path_components[0]
+
+            # Extract up to 5 folder levels
+            for i in range(1, min(6, len(path_components))):
+                if i < len(path_components) - 1:  # Skip the last component (filename)
+                    folder_key = f'folder_{i}'
+                    metadata[folder_key] = path_components[i]
+
+            # Extract the object name (last component)
+            if len(path_components) > 1:
+                metadata['object_name'] = path_components[-1]
+
+        # For other URL types, try to extract paths
+        elif re.match(r'^(http|https|s3|file)://', source_path):
+            # Extract path part after domain
+            match = re.search(r'://[^/]+/(.+)', source_path)
+            if match:
+                path_part = match.group(1)
+                path_components = path_part.split('/')
+
+                # Extract up to 5 folder levels
+                for i in range(0, min(5, len(path_components) - 1)):
+                    folder_key = f'folder_{i+1}'
+                    metadata[folder_key] = path_components[i]
+
+                # Extract the object name (last component)
+                if path_components:
+                    metadata['object_name'] = path_components[-1]
+
+    # Add file size category if size exists
+    if 'size' in metadata and isinstance(metadata['size'], (int, float)):
+        size_bytes = metadata['size']
+        if size_bytes < 10 * 1024:  # < 10KB
+            metadata['size_category'] = 'tiny'
+        elif size_bytes < 1024 * 1024:  # < 1MB
+            metadata['size_category'] = 'small'
+        elif size_bytes < 10 * 1024 * 1024:  # < 10MB
+            metadata['size_category'] = 'medium'
+        elif size_bytes < 100 * 1024 * 1024:  # < 100MB
+            metadata['size_category'] = 'large'
+        else:  # >= 100MB
+            metadata['size_category'] = 'very_large'
+
+    # Add day of week
+    try:
+        if 'eventTime' in metadata:
+            event_time_str = metadata['eventTime']
+            if event_time_str.endswith('Z'):
+                event_time_str = event_time_str[:-1]
+
+            event_time = datetime.datetime.fromisoformat(event_time_str)
+            weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+            metadata['day_of_week'] = weekday_names[event_time.weekday()]
+
+            # Add quarter information
+            quarter = (event_time.month - 1) // 3 + 1
+            metadata['quarter'] = f"Q{quarter}"
+            metadata['yearQuarter'] = f"{event_time.year}-Q{quarter}"
+    except (ValueError, TypeError) as e:
+        log.warning(f"Could not extract additional time metadata: {e}")
 
     return metadata
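For orientation, a minimal sketch of what the gs:// branch above derives from a hypothetical source path (bucket, up to five folder levels, object name, extension):

```python
# Hypothetical GCS source; mirrors the gs:// branch above.
source_path = "gs://my-bucket/clients/acme/2025/report.pdf"

path_components = source_path[5:].split('/')
print(path_components[0])   # 'my-bucket'  -> metadata['bucket_name']
for i in range(1, min(6, len(path_components))):
    if i < len(path_components) - 1:         # skip the filename
        print(f"folder_{i} = {path_components[i]}")
# folder_1 = clients, folder_2 = acme, folder_3 = 2025
print(path_components[-1])  # 'report.pdf' -> metadata['object_name']

file_extension = source_path.split('/')[-1].split('.')[-1].lower()
print(file_extension)       # 'pdf' -> file_extension; file_type becomes 'document'
```

The quarter formula in the same hunk maps months 1-3 to Q1, 4-6 to Q2, and so on: for March, (3 - 1) // 3 + 1 == 1.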
sunholo/gcs/download_url.py CHANGED
@@ -36,7 +36,7 @@ def get_image_from_gcs(gs_uri: str) -> Image.Image: # type: ignore
     except IOError as e:
         raise ValueError("Unable to open image from bytes:", e)
 
-def get_bytes_from_gcs(gs_uri) -> Optional[bytes]:
+def get_bytes_from_gcs(gs_uri: str) -> Optional[bytes]:
     """
     Downloads a file from Google Cloud Storage and returns its bytes.
 
{sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/RECORD CHANGED

@@ -60,7 +60,7 @@ sunholo/components/retriever.py,sha256=Wmchv3huAM4w7DIS-a5Lp9Hi7M8pE6vZdxgseiT9S
 sunholo/components/vectorstore.py,sha256=k7GS1Y5c6ZGXSDAJvyCes6dTjhDAi0fjGbVLqpyfzBc,5918
 sunholo/database/__init__.py,sha256=bpB5Nk21kwqYj-qdVnvNgXjLsbflnH4g-San7OHMqR4,283
 sunholo/database/alloydb.py,sha256=x1zUMB-EVWbE2Zvp4nAs2Z-tB_kOZmS45H2lwVHdYnk,11678
-sunholo/database/alloydb_client.py,sha256=
+sunholo/database/alloydb_client.py,sha256=s9P57k4RC_b0Dpy0rzTUHs-h9yj3ClFYL52JzXUYeU8,31487
 sunholo/database/database.py,sha256=VqhZdkXUNdvWn8sUcUV3YNby1JDVf7IykPVXWBtxo9U,7361
 sunholo/database/lancedb.py,sha256=DyfZntiFKBlVPaFooNN1Z6Pl-LAs4nxWKKuq8GBqN58,715
 sunholo/database/static_dbs.py,sha256=8cvcMwUK6c32AS2e_WguKXWMkFf5iN3g9WHzsh0C07Q,442
@@ -79,13 +79,13 @@ sunholo/discovery_engine/discovery_engine_client.py,sha256=NjIcP10I2-8yj6QZKrxGz
 sunholo/discovery_engine/get_ai_search_chunks.py,sha256=I6Dt1CznqEvE7XIZ2PkLqopmjpO96iVEWJJqL5cJjOU,5554
 sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
 sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
-sunholo/embedder/embed_metadata.py,sha256=
+sunholo/embedder/embed_metadata.py,sha256=h9_L3Mkd7Mtnr8OwV4nNRrdSKoxhqh9LnSsht6j-vIY,6600
 sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
 sunholo/excel/plugin.py,sha256=TJJdcKWyqEIce1agCJImvqvNp2CvLhzi4wUmLYHcLc8,4032
 sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
 sunholo/gcs/add_file.py,sha256=Pd5Zc1a3gqbuBgSI-UDC2mQnYGLJbAh_-IUzkDN5s9k,8273
 sunholo/gcs/download_folder.py,sha256=ijJTnS595JqZhBH8iHFErQilMbkuKgL-bnTCMLGuvlA,1614
-sunholo/gcs/download_url.py,sha256=
+sunholo/gcs/download_url.py,sha256=9QMEtZhrN-y1VAqvi-7Tw2GI9iRG_uuZzCg6Qhq8_yw,6421
 sunholo/gcs/extract_and_sign.py,sha256=paRrTCvCN5vkQwCB7OSkxWi-pfOgOtZ0bwdXE08c3Ps,1546
 sunholo/gcs/metadata.py,sha256=oQLcXi4brsZ74aegWyC1JZmhlaEV270HS5_UWtAYYWE,898
 sunholo/genai/__init__.py,sha256=TV3PYHWoR4cChdmCOaYB0PtAEQ86qol9XYYEtb1JmSA,239
@@ -168,9 +168,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
 sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
 sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
 sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
-sunholo-0.
-sunholo-0.
-sunholo-0.
-sunholo-0.
-sunholo-0.
-sunholo-0.
+sunholo-0.125.0.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
+sunholo-0.125.0.dist-info/METADATA,sha256=NyIZ1U8SH9vnTS0ECdCISbh2o7fp0HVBMsOKRvwipkE,10001
+sunholo-0.125.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+sunholo-0.125.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
+sunholo-0.125.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
+sunholo-0.125.0.dist-info/RECORD,,