sunholo-0.123.4-py3-none-any.whl → sunholo-0.124.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sunholo/database/alloydb_client.py +10 -4
- sunholo/embedder/embed_metadata.py +126 -0
- {sunholo-0.123.4.dist-info → sunholo-0.124.0.dist-info}/METADATA +1 -1
- {sunholo-0.123.4.dist-info → sunholo-0.124.0.dist-info}/RECORD +8 -8
- {sunholo-0.123.4.dist-info → sunholo-0.124.0.dist-info}/WHEEL +0 -0
- {sunholo-0.123.4.dist-info → sunholo-0.124.0.dist-info}/entry_points.txt +0 -0
- {sunholo-0.123.4.dist-info → sunholo-0.124.0.dist-info}/licenses/LICENSE.txt +0 -0
- {sunholo-0.123.4.dist-info → sunholo-0.124.0.dist-info}/top_level.txt +0 -0
@@ -14,7 +14,6 @@ from .uuid import generate_uuid_from_object_id
|
|
14
14
|
from ..custom_logging import log
|
15
15
|
from ..utils import ConfigManager
|
16
16
|
from ..components import get_embeddings
|
17
|
-
|
18
17
|
class AlloyDBClient:
|
19
18
|
"""
|
20
19
|
A class to manage interactions with an AlloyDB instance.
|
@@ -530,9 +529,16 @@ class AlloyDBClient:
|
|
530
529
|
bool: True if connection is valid, False otherwise
|
531
530
|
"""
|
532
531
|
try:
|
533
|
-
#
|
534
|
-
|
535
|
-
|
532
|
+
# For pg8000 engine, use synchronous connection
|
533
|
+
if self.engine_type == "pg8000":
|
534
|
+
# Use direct synchronous query
|
535
|
+
with self.engine.connect() as conn:
|
536
|
+
conn.execute(sqlalchemy.text("SELECT 1"))
|
537
|
+
return True
|
538
|
+
else:
|
539
|
+
# For langchain, use async connection
|
540
|
+
await self._execute_sql_async_langchain("SELECT 1")
|
541
|
+
return True
|
536
542
|
except Exception as e:
|
537
543
|
log.warning(f"Database connection check failed: {e}")
|
538
544
|
return False
|
@@ -1,5 +1,9 @@
|
|
1
1
|
|
2
2
|
import datetime
|
3
|
+
import re
|
4
|
+
|
5
|
+
from ..utils.mime import guess_mime_type
|
6
|
+
|
3
7
|
from ..custom_logging import log
|
4
8
|
|
5
9
|
def audit_metadata(metadata, chunk_length=None):
|
@@ -8,6 +12,24 @@ def audit_metadata(metadata, chunk_length=None):
|
|
8
12
|
metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
|
9
13
|
metadata['eventtime'] = metadata['eventTime']
|
10
14
|
|
15
|
+
# Extract time-based dimensions from eventTime
|
16
|
+
try:
|
17
|
+
# Handle timestamps in ISO format with Z suffix
|
18
|
+
event_time_str = metadata['eventTime']
|
19
|
+
if event_time_str.endswith('Z'):
|
20
|
+
event_time_str = event_time_str[:-1] # Remove the Z suffix
|
21
|
+
|
22
|
+
event_time = datetime.datetime.fromisoformat(event_time_str)
|
23
|
+
|
24
|
+
# Add year dimension (e.g., 2025)
|
25
|
+
metadata['year'] = str(event_time.year)
|
26
|
+
# Add yearMonth dimension (e.g., 2025-03)
|
27
|
+
metadata['yearMonth'] = f"{event_time.year}-{event_time.month:02d}"
|
28
|
+
# Add month dimension (e.g., 03)
|
29
|
+
metadata['month'] = f"{event_time.month:02d}"
|
30
|
+
except (ValueError, TypeError) as e:
|
31
|
+
log.warning(f"Could not parse eventTime for time dimensions: {metadata['eventTime']}, error: {e}")
|
32
|
+
|
11
33
|
if 'source' not in metadata:
|
12
34
|
if 'objectId' in metadata:
|
13
35
|
metadata['source'] = metadata['objectId']
|
@@ -23,5 +45,109 @@ def audit_metadata(metadata, chunk_length=None):
|
|
23
45
|
|
24
46
|
if 'chunk_length' not in metadata:
|
25
47
|
metadata['chunk_length'] = chunk_length
|
48
|
+
|
49
|
+
# Extract folder paths from source field
|
50
|
+
if 'source' in metadata and metadata['source']:
|
51
|
+
source_path = metadata['source']
|
52
|
+
|
53
|
+
metadata['mime_type'] = guess_mime_type(source_path)
|
54
|
+
|
55
|
+
# Extract file extension
|
56
|
+
if '.' in source_path.split('/')[-1]:
|
57
|
+
file_extension = source_path.split('/')[-1].split('.')[-1].lower()
|
58
|
+
metadata['file_extension'] = file_extension
|
59
|
+
|
60
|
+
# Add file type category
|
61
|
+
if file_extension in ['pdf', 'doc', 'docx', 'txt', 'rtf', 'odt']:
|
62
|
+
metadata['file_type'] = 'document'
|
63
|
+
elif file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']:
|
64
|
+
metadata['file_type'] = 'image'
|
65
|
+
elif file_extension in ['mp3', 'wav', 'ogg', 'flac', 'm4a']:
|
66
|
+
metadata['file_type'] = 'audio'
|
67
|
+
elif file_extension in ['mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm']:
|
68
|
+
metadata['file_type'] = 'video'
|
69
|
+
elif file_extension in ['xls', 'xlsx', 'csv']:
|
70
|
+
metadata['file_type'] = 'spreadsheet'
|
71
|
+
elif file_extension in ['ppt', 'pptx']:
|
72
|
+
metadata['file_type'] = 'presentation'
|
73
|
+
elif file_extension in ['zip', 'rar', 'tar', 'gz', '7z']:
|
74
|
+
metadata['file_type'] = 'archive'
|
75
|
+
elif file_extension in ['html', 'htm', 'xml', 'json', 'yaml', 'yml']:
|
76
|
+
metadata['file_type'] = 'markup'
|
77
|
+
elif file_extension in ['py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php']:
|
78
|
+
metadata['file_type'] = 'code'
|
79
|
+
else:
|
80
|
+
metadata['file_type'] = 'other'
|
81
|
+
|
82
|
+
# Check if the source looks like a GCS path
|
83
|
+
if source_path.startswith('gs://'):
|
84
|
+
# Remove the gs:// prefix
|
85
|
+
path_without_prefix = source_path[5:]
|
86
|
+
|
87
|
+
# Split the path into components
|
88
|
+
path_components = path_without_prefix.split('/')
|
89
|
+
|
90
|
+
# The first component is the bucket name
|
91
|
+
if len(path_components) > 0:
|
92
|
+
metadata['bucket_name'] = path_components[0]
|
93
|
+
|
94
|
+
# Extract up to 5 folder levels
|
95
|
+
for i in range(1, min(6, len(path_components))):
|
96
|
+
if i < len(path_components) - 1: # Skip the last component (filename)
|
97
|
+
folder_key = f'folder_{i}'
|
98
|
+
metadata[folder_key] = path_components[i]
|
99
|
+
|
100
|
+
# Extract the object name (last component)
|
101
|
+
if len(path_components) > 1:
|
102
|
+
metadata['object_name'] = path_components[-1]
|
103
|
+
|
104
|
+
# For other URL types, try to extract paths
|
105
|
+
elif re.match(r'^(http|https|s3|file)://', source_path):
|
106
|
+
# Extract path part after domain
|
107
|
+
match = re.search(r'://[^/]+/(.+)', source_path)
|
108
|
+
if match:
|
109
|
+
path_part = match.group(1)
|
110
|
+
path_components = path_part.split('/')
|
111
|
+
|
112
|
+
# Extract up to 5 folder levels
|
113
|
+
for i in range(0, min(5, len(path_components) - 1)):
|
114
|
+
folder_key = f'folder_{i+1}'
|
115
|
+
metadata[folder_key] = path_components[i]
|
116
|
+
|
117
|
+
# Extract the object name (last component)
|
118
|
+
if path_components:
|
119
|
+
metadata['object_name'] = path_components[-1]
|
120
|
+
|
121
|
+
# Add file size category if size exists
|
122
|
+
if 'size' in metadata and isinstance(metadata['size'], (int, float)):
|
123
|
+
size_bytes = metadata['size']
|
124
|
+
if size_bytes < 10 * 1024: # < 10KB
|
125
|
+
metadata['size_category'] = 'tiny'
|
126
|
+
elif size_bytes < 1024 * 1024: # < 1MB
|
127
|
+
metadata['size_category'] = 'small'
|
128
|
+
elif size_bytes < 10 * 1024 * 1024: # < 10MB
|
129
|
+
metadata['size_category'] = 'medium'
|
130
|
+
elif size_bytes < 100 * 1024 * 1024: # < 100MB
|
131
|
+
metadata['size_category'] = 'large'
|
132
|
+
else: # >= 100MB
|
133
|
+
metadata['size_category'] = 'very_large'
|
134
|
+
|
135
|
+
# Add day of week
|
136
|
+
try:
|
137
|
+
if 'eventTime' in metadata:
|
138
|
+
event_time_str = metadata['eventTime']
|
139
|
+
if event_time_str.endswith('Z'):
|
140
|
+
event_time_str = event_time_str[:-1]
|
141
|
+
|
142
|
+
event_time = datetime.datetime.fromisoformat(event_time_str)
|
143
|
+
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
|
144
|
+
metadata['day_of_week'] = weekday_names[event_time.weekday()]
|
145
|
+
|
146
|
+
# Add quarter information
|
147
|
+
quarter = (event_time.month - 1) // 3 + 1
|
148
|
+
metadata['quarter'] = f"Q{quarter}"
|
149
|
+
metadata['yearQuarter'] = f"{event_time.year}-Q{quarter}"
|
150
|
+
except (ValueError, TypeError) as e:
|
151
|
+
log.warning(f"Could not extract additional time metadata: {e}")
|
26
152
|
|
27
153
|
return metadata
|
@@ -60,7 +60,7 @@ sunholo/components/retriever.py,sha256=Wmchv3huAM4w7DIS-a5Lp9Hi7M8pE6vZdxgseiT9S
|
|
60
60
|
sunholo/components/vectorstore.py,sha256=k7GS1Y5c6ZGXSDAJvyCes6dTjhDAi0fjGbVLqpyfzBc,5918
|
61
61
|
sunholo/database/__init__.py,sha256=bpB5Nk21kwqYj-qdVnvNgXjLsbflnH4g-San7OHMqR4,283
|
62
62
|
sunholo/database/alloydb.py,sha256=x1zUMB-EVWbE2Zvp4nAs2Z-tB_kOZmS45H2lwVHdYnk,11678
|
63
|
-
sunholo/database/alloydb_client.py,sha256=
|
63
|
+
sunholo/database/alloydb_client.py,sha256=OCAi7Gopry7tiOOdjka-cldghFpxl6IXWWGEANmFVII,27414
|
64
64
|
sunholo/database/database.py,sha256=VqhZdkXUNdvWn8sUcUV3YNby1JDVf7IykPVXWBtxo9U,7361
|
65
65
|
sunholo/database/lancedb.py,sha256=DyfZntiFKBlVPaFooNN1Z6Pl-LAs4nxWKKuq8GBqN58,715
|
66
66
|
sunholo/database/static_dbs.py,sha256=8cvcMwUK6c32AS2e_WguKXWMkFf5iN3g9WHzsh0C07Q,442
|
@@ -79,7 +79,7 @@ sunholo/discovery_engine/discovery_engine_client.py,sha256=NjIcP10I2-8yj6QZKrxGz
|
|
79
79
|
sunholo/discovery_engine/get_ai_search_chunks.py,sha256=I6Dt1CznqEvE7XIZ2PkLqopmjpO96iVEWJJqL5cJjOU,5554
|
80
80
|
sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
|
81
81
|
sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
|
82
|
-
sunholo/embedder/embed_metadata.py,sha256=
|
82
|
+
sunholo/embedder/embed_metadata.py,sha256=h9_L3Mkd7Mtnr8OwV4nNRrdSKoxhqh9LnSsht6j-vIY,6600
|
83
83
|
sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
|
84
84
|
sunholo/excel/plugin.py,sha256=TJJdcKWyqEIce1agCJImvqvNp2CvLhzi4wUmLYHcLc8,4032
|
85
85
|
sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
|
@@ -168,9 +168,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
|
|
168
168
|
sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
|
169
169
|
sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
|
170
170
|
sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
|
171
|
-
sunholo-0.
|
172
|
-
sunholo-0.
|
173
|
-
sunholo-0.
|
174
|
-
sunholo-0.
|
175
|
-
sunholo-0.
|
176
|
-
sunholo-0.
|
171
|
+
sunholo-0.124.0.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
|
172
|
+
sunholo-0.124.0.dist-info/METADATA,sha256=FDOT2K4fXDiUu5jZbW8q7ozxsEAaNX-YMJiKfnLI2rM,10001
|
173
|
+
sunholo-0.124.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
174
|
+
sunholo-0.124.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
|
175
|
+
sunholo-0.124.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
|
176
|
+
sunholo-0.124.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|