sunholo 0.123.5__py3-none-any.whl → 0.125.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -651,9 +651,102 @@ class AlloyDBClient:
651
651
 
652
652
  return result
653
653
 
654
def flatten_dict(self, nested_dict, parent_key='', separator='.'):
    """
    Collapse a nested dictionary into a single level, joining key paths with `separator`.

    Nested dictionaries are walked recursively and their keys are prefixed
    with the path that leads to them. A nested dictionary with no entries
    therefore contributes no keys at all. Lists are not walked: each list is
    stored under its key wrapped in a marker dictionary
    ``{"__is_expandable_list__": True, "items": <the list>}``.

    Args:
        nested_dict (dict): The dictionary to flatten.
        parent_key (str): Key path accumulated so far; when falsy, keys are
            used as-is with no prefix.
        separator (str): Text placed between key levels (default: '.').

    Returns:
        dict: A new single-level dictionary; `nested_dict` itself is not modified.
    """
    # Build the prefix once per call; a falsy parent_key means "no prefix".
    prefix = f"{parent_key}{separator}" if parent_key else None

    result = {}
    for name, entry in nested_dict.items():
        path = name if prefix is None else f"{prefix}{name}"

        # Dictionaries are merged in recursively under the extended path.
        if isinstance(entry, dict):
            result.update(self.flatten_dict(entry, path, separator))
            continue

        # Lists are wrapped in a marker dictionary instead of being walked.
        if isinstance(entry, list):
            entry = {
                "__is_expandable_list__": True,
                "items": entry
            }

        result[path] = entry

    return result
688
+
654
689
  async def write_data_to_table(self, table_name: str, data: dict, metadata: dict = None):
655
690
  """
656
- Writes data to the specified table.
691
+ Writes data to the specified table, with special handling for expandable lists.
692
+
693
+ Args:
694
+ table_name (str): Name of the table
695
+ data (dict): Data to write to the table
696
+ metadata (dict, optional): Additional metadata to include
697
+
698
+ Returns:
699
+ List of results from SQL executions
700
+ """
701
+ # Find any expandable lists in the data
702
+ expandable_lists = {}
703
+ regular_data = {}
704
+
705
+ for key, value in data.items():
706
+ if isinstance(value, dict) and value.get("__is_expandable_list__", False):
707
+ expandable_lists[key] = value["items"]
708
+ else:
709
+ regular_data[key] = value
710
+
711
+ # If no expandable lists are found, do a simple insert
712
+ if not expandable_lists:
713
+ return await self._insert_single_row(table_name, regular_data, metadata)
714
+
715
+ # For expandable lists, we need to create multiple rows
716
+ results = []
717
+
718
+ # Create combinations of rows based on expandable lists
719
+ if expandable_lists:
720
+ # Get the first expandable list to start with
721
+ primary_list_key = next(iter(expandable_lists))
722
+ primary_list = expandable_lists[primary_list_key]
723
+
724
+ # For each item in the primary list, create a new row
725
+ for item_idx, item in enumerate(primary_list):
726
+ # Create a copy of the regular data
727
+ row_data = dict(regular_data)
728
+
729
+ # Add the current item from the primary list
730
+ if isinstance(item, dict):
731
+ # If it's a dictionary, flatten it with the primary key as prefix
732
+ flattened_item = self.flatten_dict(item, primary_list_key, "_")
733
+ row_data.update(flattened_item)
734
+ else:
735
+ # If it's a simple value, just add it with the list key
736
+ row_data[primary_list_key] = item
737
+
738
+ # Add item index for reference
739
+ row_data[f"{primary_list_key}_index"] = item_idx
740
+
741
+ # Insert this row
742
+ result = await self._insert_single_row(table_name, row_data, metadata)
743
+ results.append(result)
744
+
745
+ return results
746
+
747
+ async def _insert_single_row(self, table_name: str, data: dict, metadata: dict = None):
748
+ """
749
+ Inserts a single row of data into the specified table.
657
750
 
658
751
  Args:
659
752
  table_name (str): Name of the table
@@ -663,14 +756,15 @@ class AlloyDBClient:
663
756
  Returns:
664
757
  Result of SQL execution
665
758
  """
759
+
666
760
  # Create copies to avoid modifying the original data
667
761
  insert_data = dict(data)
668
762
 
669
763
  # Add metadata if provided
670
764
  if metadata:
671
- insert_data["source"] = metadata.get("objectId", metadata.get("source", "unknown"))
672
- insert_data["extraction_backend"] = metadata.get("extraction_backend", "unknown")
673
- insert_data["extraction_model"] = metadata.get("extraction_model", "unknown")
765
+ insert_data["source"] = metadata.get("objectId", metadata.get("source", "not-in-metadata"))
766
+ insert_data["extraction_backend"] = metadata.get("extraction_backend", "not-in-metadata")
767
+ insert_data["extraction_model"] = metadata.get("extraction_model", "not-in-metadata")
674
768
 
675
769
  # Prepare column names and values for SQL
676
770
  columns = [f'"{key}"' for key in insert_data.keys()]
@@ -1,5 +1,9 @@
1
1
 
2
2
  import datetime
3
+ import re
4
+
5
+ from ..utils.mime import guess_mime_type
6
+
3
7
  from ..custom_logging import log
4
8
 
5
9
  def audit_metadata(metadata, chunk_length=None):
@@ -8,6 +12,24 @@ def audit_metadata(metadata, chunk_length=None):
8
12
  metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
9
13
  metadata['eventtime'] = metadata['eventTime']
10
14
 
15
+ # Extract time-based dimensions from eventTime
16
+ try:
17
+ # Handle timestamps in ISO format with Z suffix
18
+ event_time_str = metadata['eventTime']
19
+ if event_time_str.endswith('Z'):
20
+ event_time_str = event_time_str[:-1] # Remove the Z suffix
21
+
22
+ event_time = datetime.datetime.fromisoformat(event_time_str)
23
+
24
+ # Add year dimension (e.g., 2025)
25
+ metadata['year'] = str(event_time.year)
26
+ # Add yearMonth dimension (e.g., 2025-03)
27
+ metadata['yearMonth'] = f"{event_time.year}-{event_time.month:02d}"
28
+ # Add month dimension (e.g., 03)
29
+ metadata['month'] = f"{event_time.month:02d}"
30
+ except (ValueError, TypeError) as e:
31
+ log.warning(f"Could not parse eventTime for time dimensions: {metadata['eventTime']}, error: {e}")
32
+
11
33
  if 'source' not in metadata:
12
34
  if 'objectId' in metadata:
13
35
  metadata['source'] = metadata['objectId']
@@ -23,5 +45,109 @@ def audit_metadata(metadata, chunk_length=None):
23
45
 
24
46
  if 'chunk_length' not in metadata:
25
47
  metadata['chunk_length'] = chunk_length
48
+
49
+ # Extract folder paths from source field
50
+ if 'source' in metadata and metadata['source']:
51
+ source_path = metadata['source']
52
+
53
+ metadata['mime_type'] = guess_mime_type(source_path)
54
+
55
+ # Extract file extension
56
+ if '.' in source_path.split('/')[-1]:
57
+ file_extension = source_path.split('/')[-1].split('.')[-1].lower()
58
+ metadata['file_extension'] = file_extension
59
+
60
+ # Add file type category
61
+ if file_extension in ['pdf', 'doc', 'docx', 'txt', 'rtf', 'odt']:
62
+ metadata['file_type'] = 'document'
63
+ elif file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']:
64
+ metadata['file_type'] = 'image'
65
+ elif file_extension in ['mp3', 'wav', 'ogg', 'flac', 'm4a']:
66
+ metadata['file_type'] = 'audio'
67
+ elif file_extension in ['mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm']:
68
+ metadata['file_type'] = 'video'
69
+ elif file_extension in ['xls', 'xlsx', 'csv']:
70
+ metadata['file_type'] = 'spreadsheet'
71
+ elif file_extension in ['ppt', 'pptx']:
72
+ metadata['file_type'] = 'presentation'
73
+ elif file_extension in ['zip', 'rar', 'tar', 'gz', '7z']:
74
+ metadata['file_type'] = 'archive'
75
+ elif file_extension in ['html', 'htm', 'xml', 'json', 'yaml', 'yml']:
76
+ metadata['file_type'] = 'markup'
77
+ elif file_extension in ['py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php']:
78
+ metadata['file_type'] = 'code'
79
+ else:
80
+ metadata['file_type'] = 'other'
81
+
82
+ # Check if the source looks like a GCS path
83
+ if source_path.startswith('gs://'):
84
+ # Remove the gs:// prefix
85
+ path_without_prefix = source_path[5:]
86
+
87
+ # Split the path into components
88
+ path_components = path_without_prefix.split('/')
89
+
90
+ # The first component is the bucket name
91
+ if len(path_components) > 0:
92
+ metadata['bucket_name'] = path_components[0]
93
+
94
+ # Extract up to 5 folder levels
95
+ for i in range(1, min(6, len(path_components))):
96
+ if i < len(path_components) - 1: # Skip the last component (filename)
97
+ folder_key = f'folder_{i}'
98
+ metadata[folder_key] = path_components[i]
99
+
100
+ # Extract the object name (last component)
101
+ if len(path_components) > 1:
102
+ metadata['object_name'] = path_components[-1]
103
+
104
+ # For other URL types, try to extract paths
105
+ elif re.match(r'^(http|https|s3|file)://', source_path):
106
+ # Extract path part after domain
107
+ match = re.search(r'://[^/]+/(.+)', source_path)
108
+ if match:
109
+ path_part = match.group(1)
110
+ path_components = path_part.split('/')
111
+
112
+ # Extract up to 5 folder levels
113
+ for i in range(0, min(5, len(path_components) - 1)):
114
+ folder_key = f'folder_{i+1}'
115
+ metadata[folder_key] = path_components[i]
116
+
117
+ # Extract the object name (last component)
118
+ if path_components:
119
+ metadata['object_name'] = path_components[-1]
120
+
121
+ # Add file size category if size exists
122
+ if 'size' in metadata and isinstance(metadata['size'], (int, float)):
123
+ size_bytes = metadata['size']
124
+ if size_bytes < 10 * 1024: # < 10KB
125
+ metadata['size_category'] = 'tiny'
126
+ elif size_bytes < 1024 * 1024: # < 1MB
127
+ metadata['size_category'] = 'small'
128
+ elif size_bytes < 10 * 1024 * 1024: # < 10MB
129
+ metadata['size_category'] = 'medium'
130
+ elif size_bytes < 100 * 1024 * 1024: # < 100MB
131
+ metadata['size_category'] = 'large'
132
+ else: # >= 100MB
133
+ metadata['size_category'] = 'very_large'
134
+
135
+ # Add day of week
136
+ try:
137
+ if 'eventTime' in metadata:
138
+ event_time_str = metadata['eventTime']
139
+ if event_time_str.endswith('Z'):
140
+ event_time_str = event_time_str[:-1]
141
+
142
+ event_time = datetime.datetime.fromisoformat(event_time_str)
143
+ weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
144
+ metadata['day_of_week'] = weekday_names[event_time.weekday()]
145
+
146
+ # Add quarter information
147
+ quarter = (event_time.month - 1) // 3 + 1
148
+ metadata['quarter'] = f"Q{quarter}"
149
+ metadata['yearQuarter'] = f"{event_time.year}-Q{quarter}"
150
+ except (ValueError, TypeError) as e:
151
+ log.warning(f"Could not extract additional time metadata: {e}")
26
152
 
27
153
  return metadata
@@ -36,7 +36,7 @@ def get_image_from_gcs(gs_uri: str) -> Image.Image: # type: ignore
36
36
  except IOError as e:
37
37
  raise ValueError("Unable to open image from bytes:", e)
38
38
 
39
- def get_bytes_from_gcs(gs_uri) -> Optional[bytes]:
39
+ def get_bytes_from_gcs(gs_uri: str) -> Optional[bytes]:
40
40
  """
41
41
  Downloads a file from Google Cloud Storage and returns its bytes.
42
42
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sunholo
3
- Version: 0.123.5
3
+ Version: 0.125.0
4
4
  Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
5
5
  Author-email: Holosun ApS <multivac@sunholo.com>
6
6
  License: Apache License, Version 2.0
@@ -60,7 +60,7 @@ sunholo/components/retriever.py,sha256=Wmchv3huAM4w7DIS-a5Lp9Hi7M8pE6vZdxgseiT9S
60
60
  sunholo/components/vectorstore.py,sha256=k7GS1Y5c6ZGXSDAJvyCes6dTjhDAi0fjGbVLqpyfzBc,5918
61
61
  sunholo/database/__init__.py,sha256=bpB5Nk21kwqYj-qdVnvNgXjLsbflnH4g-San7OHMqR4,283
62
62
  sunholo/database/alloydb.py,sha256=x1zUMB-EVWbE2Zvp4nAs2Z-tB_kOZmS45H2lwVHdYnk,11678
63
- sunholo/database/alloydb_client.py,sha256=OCAi7Gopry7tiOOdjka-cldghFpxl6IXWWGEANmFVII,27414
63
+ sunholo/database/alloydb_client.py,sha256=s9P57k4RC_b0Dpy0rzTUHs-h9yj3ClFYL52JzXUYeU8,31487
64
64
  sunholo/database/database.py,sha256=VqhZdkXUNdvWn8sUcUV3YNby1JDVf7IykPVXWBtxo9U,7361
65
65
  sunholo/database/lancedb.py,sha256=DyfZntiFKBlVPaFooNN1Z6Pl-LAs4nxWKKuq8GBqN58,715
66
66
  sunholo/database/static_dbs.py,sha256=8cvcMwUK6c32AS2e_WguKXWMkFf5iN3g9WHzsh0C07Q,442
@@ -79,13 +79,13 @@ sunholo/discovery_engine/discovery_engine_client.py,sha256=NjIcP10I2-8yj6QZKrxGz
79
79
  sunholo/discovery_engine/get_ai_search_chunks.py,sha256=I6Dt1CznqEvE7XIZ2PkLqopmjpO96iVEWJJqL5cJjOU,5554
80
80
  sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
81
81
  sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
82
- sunholo/embedder/embed_metadata.py,sha256=2ziUIdVwnbCUU8gOwQWEvkrRcyp-7IeyZfSsWNkMquA,866
82
+ sunholo/embedder/embed_metadata.py,sha256=h9_L3Mkd7Mtnr8OwV4nNRrdSKoxhqh9LnSsht6j-vIY,6600
83
83
  sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
84
84
  sunholo/excel/plugin.py,sha256=TJJdcKWyqEIce1agCJImvqvNp2CvLhzi4wUmLYHcLc8,4032
85
85
  sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
86
86
  sunholo/gcs/add_file.py,sha256=Pd5Zc1a3gqbuBgSI-UDC2mQnYGLJbAh_-IUzkDN5s9k,8273
87
87
  sunholo/gcs/download_folder.py,sha256=ijJTnS595JqZhBH8iHFErQilMbkuKgL-bnTCMLGuvlA,1614
88
- sunholo/gcs/download_url.py,sha256=Ul81n1rklr8WogPsuxWWD1Nr8RHU451LzHPMJNhAKzw,6416
88
+ sunholo/gcs/download_url.py,sha256=9QMEtZhrN-y1VAqvi-7Tw2GI9iRG_uuZzCg6Qhq8_yw,6421
89
89
  sunholo/gcs/extract_and_sign.py,sha256=paRrTCvCN5vkQwCB7OSkxWi-pfOgOtZ0bwdXE08c3Ps,1546
90
90
  sunholo/gcs/metadata.py,sha256=oQLcXi4brsZ74aegWyC1JZmhlaEV270HS5_UWtAYYWE,898
91
91
  sunholo/genai/__init__.py,sha256=TV3PYHWoR4cChdmCOaYB0PtAEQ86qol9XYYEtb1JmSA,239
@@ -168,9 +168,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
168
168
  sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
169
169
  sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
170
170
  sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
171
- sunholo-0.123.5.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
172
- sunholo-0.123.5.dist-info/METADATA,sha256=ahlMOD2O68Y-qNXEM0UmWYJt_6dZyPvjXxdDcB71T8Y,10001
173
- sunholo-0.123.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
174
- sunholo-0.123.5.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
175
- sunholo-0.123.5.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
176
- sunholo-0.123.5.dist-info/RECORD,,
171
+ sunholo-0.125.0.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
172
+ sunholo-0.125.0.dist-info/METADATA,sha256=NyIZ1U8SH9vnTS0ECdCISbh2o7fp0HVBMsOKRvwipkE,10001
173
+ sunholo-0.125.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
174
+ sunholo-0.125.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
175
+ sunholo-0.125.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
176
+ sunholo-0.125.0.dist-info/RECORD,,