sunholo 0.123.5__py3-none-any.whl → 0.125.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/database/alloydb_client.py +98 -4
- sunholo/embedder/embed_metadata.py +126 -0
- sunholo/gcs/download_url.py +1 -1
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/METADATA +1 -1
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/RECORD +9 -9
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/WHEEL +0 -0
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/entry_points.txt +0 -0
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/licenses/LICENSE.txt +0 -0
- {sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/top_level.txt +0 -0
sunholo/database/alloydb_client.py CHANGED

@@ -651,9 +651,102 @@ class AlloyDBClient:
 
         return result
 
+    def flatten_dict(self, nested_dict, parent_key='', separator='.'):
+        """
+        Flatten a nested dictionary into a single-level dictionary with dot notation for keys.
+
+        Args:
+            nested_dict (dict): The nested dictionary to flatten
+            parent_key (str): The parent key for the current recursion level
+            separator (str): The separator to use between key levels (default: '.')
+
+        Returns:
+            dict: A flattened dictionary with special handling for lists
+        """
+        flattened = {}
+
+        for key, value in nested_dict.items():
+            # Create the new key with parent_key if it exists
+            new_key = f"{parent_key}{separator}{key}" if parent_key else key
+
+            # If value is a dictionary, recursively flatten it
+            if isinstance(value, dict):
+                flattened.update(self.flatten_dict(value, new_key, separator))
+            # Handle lists containing dictionaries or other values
+            elif isinstance(value, list):
+                # Mark lists for special processing during database insertion
+                # We'll use a special format to indicate this is a list that needs expansion
+                flattened[new_key] = {
+                    "__is_expandable_list__": True,
+                    "items": value
+                }
+            else:
+                # For simple values, just add them with the new key
+                flattened[new_key] = value
+
+        return flattened
+
     async def write_data_to_table(self, table_name: str, data: dict, metadata: dict = None):
         """
-        Writes data to the specified table.
+        Writes data to the specified table, with special handling for expandable lists.
+
+        Args:
+            table_name (str): Name of the table
+            data (dict): Data to write to the table
+            metadata (dict, optional): Additional metadata to include
+
+        Returns:
+            List of results from SQL executions
+        """
+        # Find any expandable lists in the data
+        expandable_lists = {}
+        regular_data = {}
+
+        for key, value in data.items():
+            if isinstance(value, dict) and value.get("__is_expandable_list__", False):
+                expandable_lists[key] = value["items"]
+            else:
+                regular_data[key] = value
+
+        # If no expandable lists are found, do a simple insert
+        if not expandable_lists:
+            return await self._insert_single_row(table_name, regular_data, metadata)
+
+        # For expandable lists, we need to create multiple rows
+        results = []
+
+        # Create combinations of rows based on expandable lists
+        if expandable_lists:
+            # Get the first expandable list to start with
+            primary_list_key = next(iter(expandable_lists))
+            primary_list = expandable_lists[primary_list_key]
+
+            # For each item in the primary list, create a new row
+            for item_idx, item in enumerate(primary_list):
+                # Create a copy of the regular data
+                row_data = dict(regular_data)
+
+                # Add the current item from the primary list
+                if isinstance(item, dict):
+                    # If it's a dictionary, flatten it with the primary key as prefix
+                    flattened_item = self.flatten_dict(item, primary_list_key, "_")
+                    row_data.update(flattened_item)
+                else:
+                    # If it's a simple value, just add it with the list key
+                    row_data[primary_list_key] = item
+
+                # Add item index for reference
+                row_data[f"{primary_list_key}_index"] = item_idx
+
+                # Insert this row
+                result = await self._insert_single_row(table_name, row_data, metadata)
+                results.append(result)
+
+        return results
+
+    async def _insert_single_row(self, table_name: str, data: dict, metadata: dict = None):
+        """
+        Inserts a single row of data into the specified table.
 
         Args:
             table_name (str): Name of the table
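To see what the new expandable-list handling does end to end, here is a minimal standalone sketch of the same flattening logic, lifted out of the class so it runs without an AlloyDB connection; the invoice payload is hypothetical.

```python
# Standalone sketch of the flatten_dict logic above (hypothetical data, no DB needed).
def flatten_dict(nested_dict, parent_key='', separator='.'):
    flattened = {}
    for key, value in nested_dict.items():
        new_key = f"{parent_key}{separator}{key}" if parent_key else key
        if isinstance(value, dict):
            flattened.update(flatten_dict(value, new_key, separator))
        elif isinstance(value, list):
            # Lists are tagged so write_data_to_table can expand them into rows
            flattened[new_key] = {"__is_expandable_list__": True, "items": value}
        else:
            flattened[new_key] = value
    return flattened

doc = {"invoice": {"id": "A-1"}, "line_items": [{"sku": "x"}, {"sku": "y"}]}
print(flatten_dict(doc))
# {'invoice.id': 'A-1',
#  'line_items': {'__is_expandable_list__': True,
#                 'items': [{'sku': 'x'}, {'sku': 'y'}]}}
```

write_data_to_table would turn this payload into two rows, each carrying the shared invoice.id plus a line_items_sku and line_items_index column (note that list items are flattened with "_" rather than "."). As written, only the first expandable list found is expanded; any further tagged lists are separated out of regular_data and so never reach the inserted rows.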
@@ -663,14 +756,15 @@ class AlloyDBClient:
         Returns:
             Result of SQL execution
         """
+
         # Create copies to avoid modifying the original data
         insert_data = dict(data)
 
         # Add metadata if provided
        if metadata:
-            insert_data["source"] = metadata.get("objectId", metadata.get("source", "
-            insert_data["extraction_backend"] = metadata.get("extraction_backend", "
-            insert_data["extraction_model"] = metadata.get("extraction_model", "
+            insert_data["source"] = metadata.get("objectId", metadata.get("source", "not-in-metadata"))
+            insert_data["extraction_backend"] = metadata.get("extraction_backend", "not-in-metadata")
+            insert_data["extraction_model"] = metadata.get("extraction_model", "not-in-metadata")
 
         # Prepare column names and values for SQL
         columns = [f'"{key}"' for key in insert_data.keys()]
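The removed lines are truncated in this diff view, so the old fallback values are not recoverable here; the new fallback chain, sketched with a hypothetical metadata dict:

```python
# Hypothetical metadata dict; only objectId is present.
metadata = {"objectId": "gs://bucket/invoice.pdf"}

insert_data = {}
insert_data["source"] = metadata.get("objectId", metadata.get("source", "not-in-metadata"))
insert_data["extraction_backend"] = metadata.get("extraction_backend", "not-in-metadata")
insert_data["extraction_model"] = metadata.get("extraction_model", "not-in-metadata")

print(insert_data)
# {'source': 'gs://bucket/invoice.pdf',
#  'extraction_backend': 'not-in-metadata',
#  'extraction_model': 'not-in-metadata'}
```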
sunholo/embedder/embed_metadata.py CHANGED

@@ -1,5 +1,9 @@
 
 import datetime
+import re
+
+from ..utils.mime import guess_mime_type
+
 from ..custom_logging import log
 
 def audit_metadata(metadata, chunk_length=None):
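The new guess_mime_type import feeds metadata['mime_type'] further down in this file. Its exact behaviour isn't shown in this diff; as a rough stand-in, the stdlib performs the same extension-based guess:

```python
import mimetypes

# Rough stdlib equivalent of an extension-based MIME guess (assumption:
# sunholo's guess_mime_type behaves similarly for common file types).
print(mimetypes.guess_type("report.pdf")[0])   # application/pdf
print(mimetypes.guess_type("photo.jpeg")[0])   # image/jpeg
```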
@@ -8,6 +12,24 @@ def audit_metadata(metadata, chunk_length=None):
     metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
     metadata['eventtime'] = metadata['eventTime']
 
+    # Extract time-based dimensions from eventTime
+    try:
+        # Handle timestamps in ISO format with Z suffix
+        event_time_str = metadata['eventTime']
+        if event_time_str.endswith('Z'):
+            event_time_str = event_time_str[:-1]  # Remove the Z suffix
+
+        event_time = datetime.datetime.fromisoformat(event_time_str)
+
+        # Add year dimension (e.g., 2025)
+        metadata['year'] = str(event_time.year)
+        # Add yearMonth dimension (e.g., 2025-03)
+        metadata['yearMonth'] = f"{event_time.year}-{event_time.month:02d}"
+        # Add month dimension (e.g., 03)
+        metadata['month'] = f"{event_time.month:02d}"
+    except (ValueError, TypeError) as e:
+        log.warning(f"Could not parse eventTime for time dimensions: {metadata['eventTime']}, error: {e}")
+
     if 'source' not in metadata:
         if 'objectId' in metadata:
             metadata['source'] = metadata['objectId']
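A quick check of what the time-dimension block produces for a hypothetical eventTime. Stripping the trailing Z matters because datetime.fromisoformat() only accepts the Z suffix from Python 3.11 onward:

```python
import datetime

# Hypothetical eventTime in the same ISO-with-Z format audit_metadata writes
event_time_str = "2025-03-14T09:26:53.589793Z"
if event_time_str.endswith('Z'):
    event_time_str = event_time_str[:-1]

event_time = datetime.datetime.fromisoformat(event_time_str)
print(str(event_time.year))                         # '2025'    -> metadata['year']
print(f"{event_time.year}-{event_time.month:02d}")  # '2025-03' -> metadata['yearMonth']
print(f"{event_time.month:02d}")                    # '03'      -> metadata['month']
```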
@@ -23,5 +45,109 @@ def audit_metadata(metadata, chunk_length=None):
 
     if 'chunk_length' not in metadata:
         metadata['chunk_length'] = chunk_length
+
+    # Extract folder paths from source field
+    if 'source' in metadata and metadata['source']:
+        source_path = metadata['source']
+
+        metadata['mime_type'] = guess_mime_type(source_path)
+
+        # Extract file extension
+        if '.' in source_path.split('/')[-1]:
+            file_extension = source_path.split('/')[-1].split('.')[-1].lower()
+            metadata['file_extension'] = file_extension
+
+            # Add file type category
+            if file_extension in ['pdf', 'doc', 'docx', 'txt', 'rtf', 'odt']:
+                metadata['file_type'] = 'document'
+            elif file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']:
+                metadata['file_type'] = 'image'
+            elif file_extension in ['mp3', 'wav', 'ogg', 'flac', 'm4a']:
+                metadata['file_type'] = 'audio'
+            elif file_extension in ['mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm']:
+                metadata['file_type'] = 'video'
+            elif file_extension in ['xls', 'xlsx', 'csv']:
+                metadata['file_type'] = 'spreadsheet'
+            elif file_extension in ['ppt', 'pptx']:
+                metadata['file_type'] = 'presentation'
+            elif file_extension in ['zip', 'rar', 'tar', 'gz', '7z']:
+                metadata['file_type'] = 'archive'
+            elif file_extension in ['html', 'htm', 'xml', 'json', 'yaml', 'yml']:
+                metadata['file_type'] = 'markup'
+            elif file_extension in ['py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php']:
+                metadata['file_type'] = 'code'
+            else:
+                metadata['file_type'] = 'other'
+
+        # Check if the source looks like a GCS path
+        if source_path.startswith('gs://'):
+            # Remove the gs:// prefix
+            path_without_prefix = source_path[5:]
+
+            # Split the path into components
+            path_components = path_without_prefix.split('/')
+
+            # The first component is the bucket name
+            if len(path_components) > 0:
+                metadata['bucket_name'] = path_components[0]
+
+            # Extract up to 5 folder levels
+            for i in range(1, min(6, len(path_components))):
+                if i < len(path_components) - 1:  # Skip the last component (filename)
+                    folder_key = f'folder_{i}'
+                    metadata[folder_key] = path_components[i]
+
+            # Extract the object name (last component)
+            if len(path_components) > 1:
+                metadata['object_name'] = path_components[-1]
+
+        # For other URL types, try to extract paths
+        elif re.match(r'^(http|https|s3|file)://', source_path):
+            # Extract path part after domain
+            match = re.search(r'://[^/]+/(.+)', source_path)
+            if match:
+                path_part = match.group(1)
+                path_components = path_part.split('/')
+
+                # Extract up to 5 folder levels
+                for i in range(0, min(5, len(path_components) - 1)):
+                    folder_key = f'folder_{i+1}'
+                    metadata[folder_key] = path_components[i]
+
+                # Extract the object name (last component)
+                if path_components:
+                    metadata['object_name'] = path_components[-1]
+
+    # Add file size category if size exists
+    if 'size' in metadata and isinstance(metadata['size'], (int, float)):
+        size_bytes = metadata['size']
+        if size_bytes < 10 * 1024:  # < 10KB
+            metadata['size_category'] = 'tiny'
+        elif size_bytes < 1024 * 1024:  # < 1MB
+            metadata['size_category'] = 'small'
+        elif size_bytes < 10 * 1024 * 1024:  # < 10MB
+            metadata['size_category'] = 'medium'
+        elif size_bytes < 100 * 1024 * 1024:  # < 100MB
+            metadata['size_category'] = 'large'
+        else:  # >= 100MB
+            metadata['size_category'] = 'very_large'
+
+    # Add day of week
+    try:
+        if 'eventTime' in metadata:
+            event_time_str = metadata['eventTime']
+            if event_time_str.endswith('Z'):
+                event_time_str = event_time_str[:-1]
+
+            event_time = datetime.datetime.fromisoformat(event_time_str)
+            weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
+            metadata['day_of_week'] = weekday_names[event_time.weekday()]
+
+            # Add quarter information
+            quarter = (event_time.month - 1) // 3 + 1
+            metadata['quarter'] = f"Q{quarter}"
+            metadata['yearQuarter'] = f"{event_time.year}-Q{quarter}"
+    except (ValueError, TypeError) as e:
+        log.warning(f"Could not extract additional time metadata: {e}")
 
     return metadata
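For orientation, a minimal sketch of what the gs:// branch above derives from a hypothetical source path (bucket, up to five folder levels, object name, extension):

```python
# Hypothetical GCS source; mirrors the gs:// branch above.
source_path = "gs://my-bucket/clients/acme/2025/report.pdf"

path_components = source_path[5:].split('/')
print(path_components[0])   # 'my-bucket'  -> metadata['bucket_name']
for i in range(1, min(6, len(path_components))):
    if i < len(path_components) - 1:         # skip the filename
        print(f"folder_{i} = {path_components[i]}")
# folder_1 = clients, folder_2 = acme, folder_3 = 2025
print(path_components[-1])  # 'report.pdf' -> metadata['object_name']

file_extension = source_path.split('/')[-1].split('.')[-1].lower()
print(file_extension)       # 'pdf' -> file_extension; file_type becomes 'document'
```

The quarter formula in the same hunk maps months 1-3 to Q1, 4-6 to Q2, and so on: for March, (3 - 1) // 3 + 1 == 1.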
sunholo/gcs/download_url.py CHANGED
@@ -36,7 +36,7 @@ def get_image_from_gcs(gs_uri: str) -> Image.Image: # type: ignore
     except IOError as e:
         raise ValueError("Unable to open image from bytes:", e)
 
-def get_bytes_from_gcs(gs_uri) -> Optional[bytes]:
+def get_bytes_from_gcs(gs_uri: str) -> Optional[bytes]:
     """
     Downloads a file from Google Cloud Storage and returns its bytes.
 
{sunholo-0.123.5.dist-info → sunholo-0.125.0.dist-info}/RECORD CHANGED

@@ -60,7 +60,7 @@ sunholo/components/retriever.py,sha256=Wmchv3huAM4w7DIS-a5Lp9Hi7M8pE6vZdxgseiT9S
 sunholo/components/vectorstore.py,sha256=k7GS1Y5c6ZGXSDAJvyCes6dTjhDAi0fjGbVLqpyfzBc,5918
 sunholo/database/__init__.py,sha256=bpB5Nk21kwqYj-qdVnvNgXjLsbflnH4g-San7OHMqR4,283
 sunholo/database/alloydb.py,sha256=x1zUMB-EVWbE2Zvp4nAs2Z-tB_kOZmS45H2lwVHdYnk,11678
-sunholo/database/alloydb_client.py,sha256=
+sunholo/database/alloydb_client.py,sha256=s9P57k4RC_b0Dpy0rzTUHs-h9yj3ClFYL52JzXUYeU8,31487
 sunholo/database/database.py,sha256=VqhZdkXUNdvWn8sUcUV3YNby1JDVf7IykPVXWBtxo9U,7361
 sunholo/database/lancedb.py,sha256=DyfZntiFKBlVPaFooNN1Z6Pl-LAs4nxWKKuq8GBqN58,715
 sunholo/database/static_dbs.py,sha256=8cvcMwUK6c32AS2e_WguKXWMkFf5iN3g9WHzsh0C07Q,442
@@ -79,13 +79,13 @@ sunholo/discovery_engine/discovery_engine_client.py,sha256=NjIcP10I2-8yj6QZKrxGz
 sunholo/discovery_engine/get_ai_search_chunks.py,sha256=I6Dt1CznqEvE7XIZ2PkLqopmjpO96iVEWJJqL5cJjOU,5554
 sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
 sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
-sunholo/embedder/embed_metadata.py,sha256=
+sunholo/embedder/embed_metadata.py,sha256=h9_L3Mkd7Mtnr8OwV4nNRrdSKoxhqh9LnSsht6j-vIY,6600
 sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
 sunholo/excel/plugin.py,sha256=TJJdcKWyqEIce1agCJImvqvNp2CvLhzi4wUmLYHcLc8,4032
 sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
 sunholo/gcs/add_file.py,sha256=Pd5Zc1a3gqbuBgSI-UDC2mQnYGLJbAh_-IUzkDN5s9k,8273
 sunholo/gcs/download_folder.py,sha256=ijJTnS595JqZhBH8iHFErQilMbkuKgL-bnTCMLGuvlA,1614
-sunholo/gcs/download_url.py,sha256=
+sunholo/gcs/download_url.py,sha256=9QMEtZhrN-y1VAqvi-7Tw2GI9iRG_uuZzCg6Qhq8_yw,6421
 sunholo/gcs/extract_and_sign.py,sha256=paRrTCvCN5vkQwCB7OSkxWi-pfOgOtZ0bwdXE08c3Ps,1546
 sunholo/gcs/metadata.py,sha256=oQLcXi4brsZ74aegWyC1JZmhlaEV270HS5_UWtAYYWE,898
 sunholo/genai/__init__.py,sha256=TV3PYHWoR4cChdmCOaYB0PtAEQ86qol9XYYEtb1JmSA,239
@@ -168,9 +168,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
 sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
 sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
 sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
-sunholo-0.
-sunholo-0.
-sunholo-0.
-sunholo-0.
-sunholo-0.
-sunholo-0.
+sunholo-0.125.0.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
+sunholo-0.125.0.dist-info/METADATA,sha256=NyIZ1U8SH9vnTS0ECdCISbh2o7fp0HVBMsOKRvwipkE,10001
+sunholo-0.125.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+sunholo-0.125.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
+sunholo-0.125.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
+sunholo-0.125.0.dist-info/RECORD,,