sunholo 0.138.1__py3-none-any.whl → 0.139.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ import os
2
+ import json
3
+
4
+ from ..custom_logging import log
5
+ from ..utils.mime import get_mime_type_gemini
6
+ from .metadata import check_gcs_file_size
7
+ from .download_url import get_bytes_from_gcs
8
+
9
def download_gcs_source_to_string(source:str, max_size_bytes: int = 1024*1024) -> str:
    """
    Download a text file from Google Cloud Storage and return its contents as a string.

    The MIME type is looked up from the file extension with `get_mime_type_gemini`.
    Unknown extensions, images and PDFs are rejected. The file size is checked with
    `check_gcs_file_size` before any bytes are fetched. Content is decoded as UTF-8
    with undecodable bytes replaced. Files with a `.json` extension are parsed and
    rendered via `json_data_to_markdown`; if parsing fails the raw text is returned.

    Args:
        source: str The Google Cloud Storage URI of the file to download (e.g., 'gs://bucket_name/file_name').
        max_size_bytes: int Maximum file size to download, defaults to 1MB (1024*1024 bytes)

    Returns:
        str: The contents of the file as a string, or an empty string if the file
            type is unsupported, the size is unknown or above `max_size_bytes`,
            or the download/decode step raised.

    Raises:
        ValueError: If the download succeeded but produced an empty string.
    """
    mime_type = get_mime_type_gemini(source)
    # Falsy check rather than == "" so a None result is rejected as well.
    if not mime_type:
        log.warning(f"Can not download to string file source {source}")
        return ""

    # Images and PDFs are recognised by the MIME lookup but are not text.
    if mime_type.startswith("image/") or mime_type == "application/pdf":
        log.warning(f"Can not download to string file source {source} of type {mime_type}")
        return ""

    try:
        log.info(f"Extracting text for {source}")
        # Check file size before downloading
        file_size = check_gcs_file_size(source)
        if file_size == -1:
            log.warning(f"Could not determine file size for {source}")
            return ""
        elif file_size > max_size_bytes:
            log.warning(f"File size {file_size} bytes exceeds maximum size limit of {max_size_bytes} bytes for {source}")
            return ""

        raw_bytes = get_bytes_from_gcs(source)
        string = raw_bytes.decode('utf-8', errors='replace')
        log.info(f"Extracted {len(string)} characters from {source}: {string[:100]}")

    except Exception as err:
        # Best effort: any failure while sizing, fetching or decoding yields "".
        log.error(f"Could not extract string text for {source}: {str(err)}")
        return ""

    if not string:
        raise ValueError(f"No string text for {source}")

    file_ext = os.path.splitext(source)[1].lower().lstrip('.')
    if file_ext == "json":
        try:
            extracted_data = json.loads(string)
            log.debug("Turning json text into markdown format so as not to confuse structured output", log_struct=extracted_data)
            string = json_data_to_markdown(extracted_data)
        except json.JSONDecodeError:
            # Keep the raw text when the .json file does not contain valid JSON.
            log.warning(f"Could not get valid json from .json file: {source}")

    return string
95
+
96
def json_data_to_markdown(data, indent_level: int = 0) -> str:
    """
    Recursively converts a Python object (from parsed JSON) into a Markdown string.

    Dicts render as `**key**: value` lines, lists as `- item` bullets, `None` as
    `*null*`, booleans as lowercase `true`/`false`, and anything else via `str()`.
    Each nesting level is indented by two spaces.

    Args:
        data: The parsed JSON value (dict, list, str, None, bool or number).
        indent_level: Nesting depth of `data`; the output is prefixed with
            two spaces per level.

    Returns:
        str: The Markdown rendering of `data`.
    """
    # Two spaces per level, the same width as the "- " bullet marker, so
    # continuation lines line up under the bullet text.
    indent_unit = "  "
    indent = indent_unit * indent_level

    if isinstance(data, dict):
        if not data:
            return f"{indent}(empty object)"
        markdown_parts = []
        for key, value in data.items():
            value_md = json_data_to_markdown(value, indent_level + 1)
            # A value is "complex" when it renders on several lines or is a
            # non-empty container; it then goes on the lines below the key.
            is_complex_render = "\n" in value_md.strip() or (
                isinstance(value, (dict, list)) and bool(value)
            )
            if is_complex_render:
                markdown_parts.append(f"{indent}**{key}**:")
                markdown_parts.append(value_md)
            else:
                # Simple value: drop its own indent and put it next to the key.
                markdown_parts.append(f"{indent}**{key}**: {value_md.strip()}")
        return "\n".join(markdown_parts)

    if isinstance(data, list):
        if not data:
            return f"{indent}(empty list)"
        markdown_parts = []
        for item in data:
            item_md = json_data_to_markdown(item, indent_level + 1)
            # Strip the leading indent of the first line so the bullet replaces it.
            first_line, *rest_lines = item_md.lstrip(' ').split('\n')
            markdown_parts.append(f"{indent}- {first_line}")
            markdown_parts.extend(f"{indent}{indent_unit}{line}" for line in rest_lines)
        return "\n".join(markdown_parts)

    if isinstance(data, str):
        # Multi-line strings: the first line gets the indent, later lines one unit more.
        first_line, *rest_lines = data.split('\n')
        rendered = [f"{indent}{first_line}"]
        rendered.extend(f"{indent}{indent_unit}{line}" for line in rest_lines)
        return "\n".join(rendered)

    if data is None:
        return f"{indent}*null*"  # Represent None distinctly
    if isinstance(data, bool):
        return f"{indent}{str(data).lower()}"  # true / false
    # Numbers (int, float) and any other type
    return f"{indent}{str(data)}"
sunholo/gcs/metadata.py CHANGED
@@ -30,4 +30,37 @@ def get_object_metadata(bucket_name, object_name):
30
30
  custom_metadata = blob.metadata
31
31
 
32
32
  log.info(f"Custom Metadata for {object_name}: {custom_metadata}")
33
- return custom_metadata
33
+ return custom_metadata
34
+
35
def check_gcs_file_size(source: str) -> int:
    """
    Check the size of a file in Google Cloud Storage without downloading the entire file.

    Args:
        source: str The Google Cloud Storage URI of the file to check (e.g., 'gs://bucket_name/file_name').

    Returns:
        int: The size of the file in bytes, or -1 if the size cannot be determined
            (URI not of the form gs://bucket/object, no size reported for the
            blob, or any exception raised while querying).
    """
    from google.cloud import storage

    try:
        # Parse the GCS URI
        if not source.startswith('gs://'):
            log.warning(f"Invalid GCS URI format: {source}")
            return -1

        # partition() never raises, so a URI without an object path is
        # reported as invalid instead of surfacing as an unpacking error.
        bucket_name, _, blob_path = source[5:].partition('/')
        if not bucket_name or not blob_path:
            log.warning(f"Invalid GCS URI format: {source}")
            return -1

        # Create a client and get the bucket
        storage_client = storage.Client()
        bucket = storage_client.bucket(bucket_name)

        # Get the blob (file) and retrieve its metadata
        blob = bucket.blob(blob_path)
        blob.reload()  # Fetch the latest metadata

        size = blob.size
        if size is None:
            # Keep the documented int contract when no size is reported.
            log.warning(f"No size metadata returned for {source}")
            return -1
        return size
    except Exception as err:
        log.error(f"Error checking file size for {source}: {str(err)}")
        return -1
sunholo/utils/mime.py CHANGED
@@ -66,3 +66,53 @@ def guess_mime_type(file_path: str) -> str:
66
66
 
67
67
  return mime
68
68
 
69
+
70
# Extension -> MIME type table used by get_mime_type_gemini. Built once at
# import time instead of on every call.
_GEMINI_MIME_TYPES = {
    # Images
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'gif': 'image/gif',
    'webp': 'image/webp',

    # Document formats
    'pdf': 'application/pdf',

    # Programming languages
    'js': 'text/javascript',
    'py': 'text/x-python',

    # Web formats
    'html': 'text/html',
    'htm': 'text/html',
    'css': 'text/css',

    # Text formats
    'txt': 'text/plain',
    'md': 'text/md',
    'csv': 'text/csv',
    'xml': 'text/xml',
    'rtf': 'text/rtf',

    # Special case: JSON files are treated as plain text
    'json': 'text/plain',
}


def get_mime_type_gemini(file_path:str) -> str:
    """
    Determine the MIME type based on file extension.
    Only returns the formats listed in the lookup table, or an empty string
    if the extension is not supported.

    Args:
        file_path (str): Path to the file

    Returns:
        str: The MIME type for the file, or "" when the extension is missing
            or not in the table. The extension match is case-insensitive.
    """
    # Extract the file extension (lowercase, without the leading dot)
    ext = os.path.splitext(file_path)[1].lower().lstrip('.')

    # Unknown extensions map to "" (not None) so callers can compare against "".
    return _GEMINI_MIME_TYPES.get(ext, "")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sunholo
3
- Version: 0.138.1
3
+ Version: 0.139.1
4
4
  Summary: AI DevOps - a package to help deploy GenAI to the Cloud.
5
5
  Author-email: Holosun ApS <multivac@sunholo.com>
6
6
  License: Apache License, Version 2.0
@@ -85,9 +85,10 @@ sunholo/excel/plugin.py,sha256=TJJdcKWyqEIce1agCJImvqvNp2CvLhzi4wUmLYHcLc8,4032
85
85
  sunholo/gcs/__init__.py,sha256=SZvbsMFDko40sIRHTHppA37IijvJTae54vrhooEF5-4,90
86
86
  sunholo/gcs/add_file.py,sha256=Pd5Zc1a3gqbuBgSI-UDC2mQnYGLJbAh_-IUzkDN5s9k,8273
87
87
  sunholo/gcs/download_folder.py,sha256=ijJTnS595JqZhBH8iHFErQilMbkuKgL-bnTCMLGuvlA,1614
88
+ sunholo/gcs/download_gcs_text.py,sha256=apopEBxC6OT0KBwnlFOeNOXyiY4A8BKEVixzb1wgQrk,5583
88
89
  sunholo/gcs/download_url.py,sha256=9QMEtZhrN-y1VAqvi-7Tw2GI9iRG_uuZzCg6Qhq8_yw,6421
89
90
  sunholo/gcs/extract_and_sign.py,sha256=paRrTCvCN5vkQwCB7OSkxWi-pfOgOtZ0bwdXE08c3Ps,1546
90
- sunholo/gcs/metadata.py,sha256=oQLcXi4brsZ74aegWyC1JZmhlaEV270HS5_UWtAYYWE,898
91
+ sunholo/gcs/metadata.py,sha256=GEDxb_B_teNPGd6chPzQrK9df78R_kytKfthIlKPwKQ,2010
91
92
  sunholo/genai/__init__.py,sha256=TV3PYHWoR4cChdmCOaYB0PtAEQ86qol9XYYEtb1JmSA,239
92
93
  sunholo/genai/file_handling.py,sha256=JUFTlSnrxqKR3hczduyMiZ234UaSqiBdMOYpY2v4TYA,13720
93
94
  sunholo/genai/genaiv2.py,sha256=uqWcfvlsPVPyQo-W_cP9h2TTzyYfzj4lyJlyqPyKTkI,20269
@@ -155,7 +156,7 @@ sunholo/utils/config_class.py,sha256=uSRiJLj8t5UgWNxaq8W4KPnzxb4SkUJ1avXecDHuP-E
155
156
  sunholo/utils/config_schema.py,sha256=Wv-ncitzljOhgbDaq9qnFqH5LCuxNv59dTGDWgd1qdk,4189
156
157
  sunholo/utils/gcp.py,sha256=lus1HH8YhFInw6QRKwfvKZq-Lz-2KQg4ips9v1I_3zE,4783
157
158
  sunholo/utils/gcp_project.py,sha256=Fa0IhCX12bZ1ctF_PKN8PNYd7hihEUfb90kilBfUDjg,1411
158
- sunholo/utils/mime.py,sha256=7_J1PnWOlvAPRoHWKESAncdRVVldVwRdKvuDvi9sRfE,2020
159
+ sunholo/utils/mime.py,sha256=mELAiZcGa69PshBxV7y770E0K09YfX4Z4ZRBPL-7gXs,3352
159
160
  sunholo/utils/parsers.py,sha256=wES0fRn3GONoymRXOXt-z62HCoOiUvvFXa-MfKfjCls,6421
160
161
  sunholo/utils/timedelta.py,sha256=BbLabEx7_rbErj_YbNM0MBcaFN76DC4PTe4zD2ucezg,493
161
162
  sunholo/utils/user_ids.py,sha256=SQd5_H7FE7vcTZp9AQuQDWBXd4FEEd7TeVMQe1H4Ny8,292
@@ -168,9 +169,9 @@ sunholo/vertex/init.py,sha256=1OQwcPBKZYBTDPdyU7IM4X4OmiXLdsNV30C-fee2scQ,2875
168
169
  sunholo/vertex/memory_tools.py,sha256=tBZxqVZ4InTmdBvLlOYwoSEWu4-kGquc-gxDwZCC4FA,7667
169
170
  sunholo/vertex/safety.py,sha256=S9PgQT1O_BQAkcqauWncRJaydiP8Q_Jzmu9gxYfy1VA,2482
170
171
  sunholo/vertex/type_dict_to_json.py,sha256=uTzL4o9tJRao4u-gJOFcACgWGkBOtqACmb6ihvCErL8,4694
171
- sunholo-0.138.1.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
172
- sunholo-0.138.1.dist-info/METADATA,sha256=GC4bwGLlBT68d6uqc99tTFX7Y_WtWnombuk2fPrTzls,10067
173
- sunholo-0.138.1.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
174
- sunholo-0.138.1.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
175
- sunholo-0.138.1.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
176
- sunholo-0.138.1.dist-info/RECORD,,
172
+ sunholo-0.139.1.dist-info/licenses/LICENSE.txt,sha256=SdE3QjnD3GEmqqg9EX3TM9f7WmtOzqS1KJve8rhbYmU,11345
173
+ sunholo-0.139.1.dist-info/METADATA,sha256=vOG7X6ZpBgF3og9_BNDil-Loy2tAW38orcqYo3ObTTk,10067
174
+ sunholo-0.139.1.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
175
+ sunholo-0.139.1.dist-info/entry_points.txt,sha256=bZuN5AIHingMPt4Ro1b_T-FnQvZ3teBes-3OyO0asl4,49
176
+ sunholo-0.139.1.dist-info/top_level.txt,sha256=wt5tadn5--5JrZsjJz2LceoUvcrIvxjHJe-RxuudxAk,8
177
+ sunholo-0.139.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.3.1)
2
+ Generator: setuptools (80.4.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5