sunholo 0.123.5__py3-none-any.whl → 0.124.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,9 @@
1
1
 
2
2
  import datetime
3
+ import re
4
+
5
+ from ..utils.mime import guess_mime_type
6
+
3
7
  from ..custom_logging import log
4
8
 
5
9
  def audit_metadata(metadata, chunk_length=None):
@@ -8,6 +12,24 @@ def audit_metadata(metadata, chunk_length=None):
8
12
  metadata['eventTime'] = datetime.datetime.now().isoformat(timespec='microseconds') + "Z"
9
13
  metadata['eventtime'] = metadata['eventTime']
10
14
 
15
+ # Extract time-based dimensions from eventTime
16
+ try:
17
+ # Handle timestamps in ISO format with Z suffix
18
+ event_time_str = metadata['eventTime']
19
+ if event_time_str.endswith('Z'):
20
+ event_time_str = event_time_str[:-1] # Remove the Z suffix
21
+
22
+ event_time = datetime.datetime.fromisoformat(event_time_str)
23
+
24
+ # Add year dimension (e.g., 2025)
25
+ metadata['year'] = str(event_time.year)
26
+ # Add yearMonth dimension (e.g., 2025-03)
27
+ metadata['yearMonth'] = f"{event_time.year}-{event_time.month:02d}"
28
+ # Add month dimension (e.g., 03)
29
+ metadata['month'] = f"{event_time.month:02d}"
30
+ except (ValueError, TypeError) as e:
31
+ log.warning(f"Could not parse eventTime for time dimensions: {metadata['eventTime']}, error: {e}")
32
+
11
33
  if 'source' not in metadata:
12
34
  if 'objectId' in metadata:
13
35
  metadata['source'] = metadata['objectId']
@@ -23,5 +45,109 @@ def audit_metadata(metadata, chunk_length=None):
23
45
 
24
46
  if 'chunk_length' not in metadata:
25
47
  metadata['chunk_length'] = chunk_length
48
+
49
+ # Extract folder paths from source field
50
+ if 'source' in metadata and metadata['source']:
51
+ source_path = metadata['source']
52
+
53
+ metadata['mime_type'] = guess_mime_type(source_path)
54
+
55
+ # Extract file extension
56
+ if '.' in source_path.split('/')[-1]:
57
+ file_extension = source_path.split('/')[-1].split('.')[-1].lower()
58
+ metadata['file_extension'] = file_extension
59
+
60
+ # Add file type category
61
+ if file_extension in ['pdf', 'doc', 'docx', 'txt', 'rtf', 'odt']:
62
+ metadata['file_type'] = 'document'
63
+ elif file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']:
64
+ metadata['file_type'] = 'image'
65
+ elif file_extension in ['mp3', 'wav', 'ogg', 'flac', 'm4a']:
66
+ metadata['file_type'] = 'audio'
67
+ elif file_extension in ['mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm']:
68
+ metadata['file_type'] = 'video'
69
+ elif file_extension in ['xls', 'xlsx', 'csv']:
70
+ metadata['file_type'] = 'spreadsheet'
71
+ elif file_extension in ['ppt', 'pptx']:
72
+ metadata['file_type'] = 'presentation'
73
+ elif file_extension in ['zip', 'rar', 'tar', 'gz', '7z']:
74
+ metadata['file_type'] = 'archive'
75
+ elif file_extension in ['html', 'htm', 'xml', 'json', 'yaml', 'yml']:
76
+ metadata['file_type'] = 'markup'
77
+ elif file_extension in ['py', 'js', 'java', 'c', 'cpp', 'cs', 'go', 'rb', 'php']:
78
+ metadata['file_type'] = 'code'
79
+ else:
80
+ metadata['file_type'] = 'other'
81
+
82
+ # Check if the source looks like a GCS path
83
+ if source_path.startswith('gs://'):
84
+ # Remove the gs:// prefix
85
+ path_without_prefix = source_path[5:]
86
+
87
+ # Split the path into components
88
+ path_components = path_without_prefix.split('/')
89
+
90
+ # The first component is the bucket name
91
+ if len(path_components) > 0:
92
+ metadata['bucket_name'] = path_components[0]
93
+
94
+ # Extract up to 5 folder levels
95
+ for i in range(1, min(6, len(path_components))):
96
+ if i < len(path_components) - 1: # Skip the last component (filename)
97
+ folder_key = f'folder_{i}'
98
+ metadata[folder_key] = path_components[i]
99
+
100
+ # Extract the object name (last component)
101
+ if len(path_components) > 1:
102
+ metadata['object_name'] = path_components[-1]
103
+
104
+ # For other URL types, try to extract paths
105
+ elif re.match(r'^(http|https|s3|file)://', source_path):
106
+ # Extract path part after domain
107
+ match = re.search(r'://[^/]+/(.+)', source_path)
108
+ if match:
109
+ path_part = match.group(1)
110
+ path_components = path_part.split('/')
111
+
112
+ # Extract up to 5 folder levels
113
+ for i in range(0, min(5, len(path_components) - 1)):
114
+ folder_key = f'folder_{i+1}'
115
+ metadata[folder_key] = path_components[i]
116
+
117
+ # Extract the object name (last component)
118
+ if path_components:
119
+ metadata['object_name'] = path_components[-1]
120
+
121
+ # Add file size category if size exists
122
+ if 'size' in metadata and isinstance(metadata['size'], (int, float)):
123
+ size_bytes = metadata['size']
124
+ if size_bytes < 10 * 1024: # < 10KB
125
+ metadata['size_category'] = 'tiny'
126
+ elif size_bytes < 1024 * 1024: # < 1MB
127
+ metadata['size_category'] = 'small'
128
+ elif size_bytes < 10 * 1024 * 1024: # < 10MB
129
+ metadata['size_category'] = 'medium'
130
+ elif size_bytes < 100 * 1024 * 1024: # < 100MB
131
+ metadata['size_category'] = 'large'
132
+ else: # >= 100MB
133
+ metadata['size_category'] = 'very_large'
134
+
135
+ # Add day of week
136
+ try:
137
+ if 'eventTime' in metadata:
138
+ event_time_str = metadata['eventTime']
139
+ if event_time_str.endswith('Z'):
140
+ event_time_str = event_time_str[:-1]
141
+
142
+ event_time = datetime.datetime.fromisoformat(event_time_str)
143
+ weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
144
+ metadata['day_of_week'] = weekday_names[event_time.weekday()]
145
+
146
+ # Add quarter information
147
+ quarter = (event_time.month - 1) // 3 + 1
148
+ metadata['quarter'] = f"Q{quarter}"
149
+ metadata['yearQuarter'] = f"{event_time.year}-Q{quarter}"
150
+ except (ValueError, TypeError) as e:
151
+ log.warning(f"Could not extract additional time metadata: {e}")
26
152
 
27
153
  return metadata
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sunholo
3
- Version: 0.123.5
3
+ Version: 0.124.0
4
4
  Summary: Large Language Model DevOps - a package to help deploy LLMs to the Cloud.
5
5
  Author-email: Holosun ApS <multivac@sunholo.com>
6
6
  License: Apache License, Version 2.0
@@ -79,7 +79,7 @@ sunholo/discovery_engine/discovery_engine_client.py,sha256=NjIcP10I2-8yj6QZKrxGz
79
79
  sunholo/discovery_engine/get_ai_search_chunks.py,sha256=I6Dt1CznqEvE7XIZ2PkLqopmjpO96iVEWJJqL5cJjOU,5554
80
80
  sunholo/embedder/__init__.py,sha256=sI4N_CqgEVcrMDxXgxKp1FsfsB4FpjoXgPGkl4N_u4I,44
81
81
  sunholo/embedder/embed_chunk.py,sha256=did2pKkWM2o0KkRcb0H9l2x_WjCq6OyuHDxGbITFKPM,6530
82
- sunholo/embedder/embed_metadata.py,sha256=2ziUIdVwnbCUU8gOwQWEvkrRcyp-7IeyZfSsWNkMquA,866
82
+ sunholo/embedder/embed_metadata.py,sha256=h9_L3Mkd7Mtnr8OwV4nNRrdSKoxhqh9LnSsht6j-vIY,6600
83
83
  sunholo/excel/__init__.py,sha256=AqTMN9K4qJYi4maEgoORc5oxDVGO_eqmwzDaVP37JgY,56
84
84
  sunholo/excel/plugin.py,sha256=TJJdcKWyqEIce1agCJImvqvNp2CvLhzi4wUmLYHcLc8,4032
85
85
  sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
@@ -168,9 +168,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
168
168
  sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
169
169
  sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
170
170
  sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
171
- sunholo-0.123.5.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
172
- sunholo-0.123.5.dist-info/METADATA,sha256=ahlMOD2O68Y-qNXEM0UmWYJt_6dZyPvjXxdDcB71T8Y,10001
173
- sunholo-0.123.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
174
- sunholo-0.123.5.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
175
- sunholo-0.123.5.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
176
- sunholo-0.123.5.dist-info/RECORD,,
171
+ sunholo-0.124.0.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
172
+ sunholo-0.124.0.dist-info/METADATA,sha256=FDOT2K4fXDiUu5jZbW8q7ozxsEAaNX-YMJiKfnLI2rM,10001
173
+ sunholo-0.124.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
174
+ sunholo-0.124.0.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
175
+ sunholo-0.124.0.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
176
+ sunholo-0.124.0.dist-info/RECORD,,