botrun-flow-lang 5.12.263__py3-none-any.whl → 6.2.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. botrun_flow_lang/api/auth_api.py +39 -39
  2. botrun_flow_lang/api/auth_utils.py +183 -183
  3. botrun_flow_lang/api/botrun_back_api.py +65 -65
  4. botrun_flow_lang/api/flow_api.py +3 -3
  5. botrun_flow_lang/api/hatch_api.py +508 -508
  6. botrun_flow_lang/api/langgraph_api.py +816 -811
  7. botrun_flow_lang/api/langgraph_constants.py +11 -0
  8. botrun_flow_lang/api/line_bot_api.py +1484 -1484
  9. botrun_flow_lang/api/model_api.py +300 -300
  10. botrun_flow_lang/api/rate_limit_api.py +32 -32
  11. botrun_flow_lang/api/routes.py +79 -79
  12. botrun_flow_lang/api/search_api.py +53 -53
  13. botrun_flow_lang/api/storage_api.py +395 -395
  14. botrun_flow_lang/api/subsidy_api.py +290 -290
  15. botrun_flow_lang/api/subsidy_api_system_prompt.txt +109 -109
  16. botrun_flow_lang/api/user_setting_api.py +70 -70
  17. botrun_flow_lang/api/version_api.py +31 -31
  18. botrun_flow_lang/api/youtube_api.py +26 -26
  19. botrun_flow_lang/constants.py +13 -13
  20. botrun_flow_lang/langgraph_agents/agents/agent_runner.py +178 -178
  21. botrun_flow_lang/langgraph_agents/agents/agent_tools/step_planner.py +77 -77
  22. botrun_flow_lang/langgraph_agents/agents/checkpointer/firestore_checkpointer.py +666 -666
  23. botrun_flow_lang/langgraph_agents/agents/gov_researcher/GOV_RESEARCHER_PRD.md +192 -192
  24. botrun_flow_lang/langgraph_agents/agents/gov_researcher/gemini_subsidy_graph.py +460 -460
  25. botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_2_graph.py +1002 -1002
  26. botrun_flow_lang/langgraph_agents/agents/gov_researcher/gov_researcher_graph.py +822 -822
  27. botrun_flow_lang/langgraph_agents/agents/langgraph_react_agent.py +730 -723
  28. botrun_flow_lang/langgraph_agents/agents/search_agent_graph.py +864 -864
  29. botrun_flow_lang/langgraph_agents/agents/tools/__init__.py +4 -4
  30. botrun_flow_lang/langgraph_agents/agents/tools/gemini_code_execution.py +376 -376
  31. botrun_flow_lang/langgraph_agents/agents/util/gemini_grounding.py +66 -66
  32. botrun_flow_lang/langgraph_agents/agents/util/html_util.py +316 -316
  33. botrun_flow_lang/langgraph_agents/agents/util/img_util.py +336 -294
  34. botrun_flow_lang/langgraph_agents/agents/util/local_files.py +419 -419
  35. botrun_flow_lang/langgraph_agents/agents/util/mermaid_util.py +86 -86
  36. botrun_flow_lang/langgraph_agents/agents/util/model_utils.py +143 -143
  37. botrun_flow_lang/langgraph_agents/agents/util/pdf_analyzer.py +562 -486
  38. botrun_flow_lang/langgraph_agents/agents/util/pdf_cache.py +250 -250
  39. botrun_flow_lang/langgraph_agents/agents/util/pdf_processor.py +204 -204
  40. botrun_flow_lang/langgraph_agents/agents/util/perplexity_search.py +464 -464
  41. botrun_flow_lang/langgraph_agents/agents/util/plotly_util.py +59 -59
  42. botrun_flow_lang/langgraph_agents/agents/util/tavily_search.py +199 -199
  43. botrun_flow_lang/langgraph_agents/agents/util/usage_metadata.py +34 -0
  44. botrun_flow_lang/langgraph_agents/agents/util/youtube_util.py +90 -90
  45. botrun_flow_lang/langgraph_agents/cache/langgraph_botrun_cache.py +197 -197
  46. botrun_flow_lang/llm_agent/llm_agent.py +19 -19
  47. botrun_flow_lang/llm_agent/llm_agent_util.py +83 -83
  48. botrun_flow_lang/log/.gitignore +2 -2
  49. botrun_flow_lang/main.py +61 -61
  50. botrun_flow_lang/main_fast.py +51 -51
  51. botrun_flow_lang/mcp_server/__init__.py +10 -10
  52. botrun_flow_lang/mcp_server/default_mcp.py +854 -744
  53. botrun_flow_lang/models/nodes/utils.py +205 -205
  54. botrun_flow_lang/models/token_usage.py +34 -34
  55. botrun_flow_lang/requirements.txt +21 -21
  56. botrun_flow_lang/services/base/firestore_base.py +30 -30
  57. botrun_flow_lang/services/hatch/hatch_factory.py +11 -11
  58. botrun_flow_lang/services/hatch/hatch_fs_store.py +419 -419
  59. botrun_flow_lang/services/storage/storage_cs_store.py +206 -206
  60. botrun_flow_lang/services/storage/storage_factory.py +12 -12
  61. botrun_flow_lang/services/storage/storage_store.py +65 -65
  62. botrun_flow_lang/services/user_setting/user_setting_factory.py +9 -9
  63. botrun_flow_lang/services/user_setting/user_setting_fs_store.py +66 -66
  64. botrun_flow_lang/static/docs/tools/index.html +926 -926
  65. botrun_flow_lang/tests/api_functional_tests.py +1525 -1525
  66. botrun_flow_lang/tests/api_stress_test.py +357 -357
  67. botrun_flow_lang/tests/shared_hatch_tests.py +333 -333
  68. botrun_flow_lang/tests/test_botrun_app.py +46 -46
  69. botrun_flow_lang/tests/test_html_util.py +31 -31
  70. botrun_flow_lang/tests/test_img_analyzer.py +190 -190
  71. botrun_flow_lang/tests/test_img_util.py +39 -39
  72. botrun_flow_lang/tests/test_local_files.py +114 -114
  73. botrun_flow_lang/tests/test_mermaid_util.py +103 -103
  74. botrun_flow_lang/tests/test_pdf_analyzer.py +104 -104
  75. botrun_flow_lang/tests/test_plotly_util.py +151 -151
  76. botrun_flow_lang/tests/test_run_workflow_engine.py +65 -65
  77. botrun_flow_lang/tools/generate_docs.py +133 -133
  78. botrun_flow_lang/tools/templates/tools.html +153 -153
  79. botrun_flow_lang/utils/__init__.py +7 -7
  80. botrun_flow_lang/utils/botrun_logger.py +344 -344
  81. botrun_flow_lang/utils/clients/rate_limit_client.py +209 -209
  82. botrun_flow_lang/utils/clients/token_verify_client.py +153 -153
  83. botrun_flow_lang/utils/google_drive_utils.py +654 -654
  84. botrun_flow_lang/utils/langchain_utils.py +324 -324
  85. botrun_flow_lang/utils/yaml_utils.py +9 -9
  86. {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-6.2.21.dist-info}/METADATA +6 -6
  87. botrun_flow_lang-6.2.21.dist-info/RECORD +104 -0
  88. botrun_flow_lang-5.12.263.dist-info/RECORD +0 -102
  89. {botrun_flow_lang-5.12.263.dist-info → botrun_flow_lang-6.2.21.dist-info}/WHEEL +0 -0
@@ -1,316 +1,316 @@
1
- import os
2
- from tempfile import NamedTemporaryFile
3
- from typing import Dict, Any, Optional, Tuple
4
- import re
5
- from urllib.parse import urlparse, urlunparse, unquote, parse_qs, urlencode
6
- import time
7
- from io import BytesIO
8
- import requests
9
-
10
- from botrun_flow_lang.constants import MODIFY_GCS_HTML_MODEL
11
- from .local_files import upload_html_and_get_public_url
12
- from botrun_flow_lang.services.storage.storage_factory import storage_store_factory
13
-
14
-
15
- async def generate_html_file(
16
- html_content: str,
17
- botrun_flow_lang_url: str,
18
- user_id: str,
19
- title: Optional[str] = None,
20
- ) -> str:
21
- """
22
- Generate HTML file from complete HTML content (including JS and CSS) and upload it to GCS.
23
-
24
- This function accepts complete HTML documents with JavaScript, CSS, and other elements.
25
- You can pass either:
26
- 1. A complete HTML document (<!DOCTYPE html><html>...<head>...</head><body>...</body></html>)
27
- 2. HTML fragment that will be wrapped in a basic HTML structure if needed
28
-
29
- The function preserves all JavaScript, CSS, and other elements in the HTML content.
30
-
31
- Args:
32
- html_content: Complete HTML content string, including head/body tags, JavaScript, CSS, etc.
33
- botrun_flow_lang_url: URL for the botrun flow lang API
34
- user_id: User ID for file upload
35
- title: Optional title for the HTML page (used only if the HTML doesn't already have a title)
36
-
37
- Returns:
38
- str: URL for the HTML file or error message starting with "Error: "
39
- """
40
- try:
41
- # Check if the content is already a complete HTML document
42
- is_complete_html = html_content.strip().lower().startswith(
43
- "<!doctype html"
44
- ) or html_content.strip().lower().startswith("<html")
45
-
46
- # Only process HTML content if it's not already a complete document
47
- if not is_complete_html:
48
- # If not a complete HTML document, check if it has a head tag
49
- if "<head>" in html_content.lower():
50
- # Has head tag but not complete doc, add title if needed and provided
51
- if title and "<title>" not in html_content.lower():
52
- html_content = html_content.replace(
53
- "<head>", f"<head>\n <title>{title}</title>", 1
54
- )
55
- else:
56
- # No head tag, wrap the content in a basic HTML structure
57
- html_content = f"""
58
- <!DOCTYPE html>
59
- <html>
60
- <head>
61
- <meta charset="utf-8">
62
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
63
- <title>{title if title else 'HTML Page'}</title>
64
- <style>
65
- body {{
66
- font-family: "Microsoft JhengHei", "微軟正黑體", "Heiti TC", "黑體-繁", sans-serif;
67
- }}
68
- </style>
69
- </head>
70
- <body>
71
- {html_content}
72
- </body>
73
- </html>
74
- """
75
- # If we have complete HTML but title is provided and no title exists
76
- elif title and "<title>" not in html_content.lower():
77
- # Try to insert title into the head tag
78
- if "<head>" in html_content.lower():
79
- html_content = html_content.replace(
80
- "<head>", f"<head>\n <title>{title}</title>", 1
81
- )
82
-
83
- # Create temporary file
84
- with NamedTemporaryFile(
85
- suffix=".html", mode="w", encoding="utf-8", delete=False
86
- ) as html_temp:
87
- try:
88
- # Save HTML content
89
- html_temp.write(html_content)
90
- html_temp.flush()
91
-
92
- # Upload file to GCS
93
- html_url = await upload_html_and_get_public_url(
94
- html_temp.name, botrun_flow_lang_url, user_id
95
- )
96
-
97
- # Clean up temporary file
98
- os.unlink(html_temp.name)
99
-
100
- return html_url
101
- except Exception as e:
102
- # Clean up temporary file in case of error
103
- os.unlink(html_temp.name)
104
- return f"Error: {str(e)}"
105
-
106
- except Exception as e:
107
- return f"Error: {str(e)}"
108
-
109
-
110
- # todo 還沒改完,我的測試案例測到 3 之後,就不會再增加了
111
- async def modify_gcs_html(
112
- html_url: str,
113
- modification_instruction: str,
114
- ) -> Tuple[bool, str, Optional[str]]:
115
- """
116
- Modify HTML file stored in Google Cloud Storage using Gemini 2.0 Flash LLM.
117
-
118
- The function parses the GCS URL, fetches the HTML content, sends it to Gemini with
119
- the modification instruction, executes the generated Python code to modify the HTML,
120
- and updates the original file in GCS.
121
-
122
- Args:
123
- html_url: GCS URL pointing to an HTML file
124
- (format: https://storage.googleapis.com/[bucket-name]/[doc-path])
125
- modification_instruction: Natural language instruction for how to modify the HTML
126
-
127
- Returns:
128
- Tuple[bool, str, Optional[str]]: (success, original_url, error_message)
129
- """
130
- try:
131
- # 1. Parse the GCS URL to extract bucket name and document path
132
- url_parts = urlparse(html_url)
133
-
134
- # Strip query parameters from the URL for processing
135
- clean_url_parts = url_parts._replace(query="")
136
- clean_url = urlunparse(clean_url_parts)
137
-
138
- if not url_parts.netloc.startswith("storage.googleapis.com"):
139
- return False, html_url, "Error: URL must be a Google Cloud Storage URL"
140
-
141
- # Extract bucket name and document path correctly
142
- path_segments = url_parts.path.strip("/").split("/", 1)
143
- if len(path_segments) < 2:
144
- return False, html_url, "Error: Invalid GCS URL format"
145
-
146
- bucket_name = path_segments[0]
147
- document_path = path_segments[1]
148
-
149
- # URL decode the document path to handle encoded characters like %40
150
- decoded_document_path = unquote(document_path)
151
-
152
- # 2. Fetch the HTML content from GCS
153
- try:
154
- # First try to get the HTML directly via the URL
155
- response = requests.get(clean_url)
156
- if response.status_code != 200:
157
- # If direct access fails, use the storage client
158
- storage = storage_store_factory()
159
- # Use the original (non-decoded) path for retrieval since that's how it's stored in GCS
160
- file_object = await storage.retrieve_file(document_path)
161
- if not file_object:
162
- return (
163
- False,
164
- html_url,
165
- "Error: Could not retrieve HTML file from GCS",
166
- )
167
- # Explicitly decode with UTF-8 to properly handle non-ASCII characters
168
- html_content = file_object.getvalue().decode("utf-8")
169
- else:
170
- # Set encoding for response text (use UTF-8 or detect from content)
171
- if "charset=" in response.headers.get("content-type", ""):
172
- # Extract charset from content-type header
173
- charset = (
174
- response.headers.get("content-type")
175
- .split("charset=")[1]
176
- .split(";")[0]
177
- )
178
- response.encoding = charset
179
- else:
180
- # Default to UTF-8 if not specified
181
- response.encoding = "utf-8"
182
- html_content = response.text
183
- except Exception as e:
184
- return False, html_url, f"Error retrieving HTML content: {str(e)}"
185
-
186
- # 3. Call Gemini API to generate Python code for HTML modification
187
- try:
188
- # Import here to avoid loading time and potential circular imports
189
- import google.generativeai as genai
190
- from google.generativeai.types import HarmCategory, HarmBlockThreshold
191
-
192
- # Initialize Gemini client
193
- api_key = os.getenv("GEMINI_API_KEY", "")
194
- if not api_key:
195
- return (
196
- False,
197
- html_url,
198
- "Error: GEMINI_API_KEY environment variable not set",
199
- )
200
-
201
- genai.configure(api_key=api_key)
202
- model = genai.GenerativeModel(MODIFY_GCS_HTML_MODEL)
203
-
204
- # Create prompt for Gemini
205
- prompt = f"""You are an expert HTML and Python developer.
206
- Your task is to modify an HTML document according to the following instruction:
207
- "{modification_instruction}"
208
-
209
- Here is the HTML code to modify:
210
- ```html
211
- {html_content}
212
- ```
213
-
214
- Please provide minimal Python code that makes these modifications to the HTML.
215
- Your code must:
216
- 1. Use BeautifulSoup4 to parse and modify the HTML
217
- 2. Return the modified HTML as a string
218
- 3. Use a function called 'modify_html' that takes the original HTML as input and returns the modified HTML
219
- 4. Only include essential code to make the exact change requested - no explanations or verbose comments
220
- 5. Ensure you preserve the character encoding for non-ASCII characters
221
- 6. Use BeautifulSoup with features='html.parser'
222
-
223
- Only provide the Python code, nothing else. Keep the code minimal and direct."""
224
-
225
- # Generate the Python code
226
- response = model.generate_content(prompt)
227
- generated_code = response.text
228
-
229
- # Extract Python code if it's wrapped in ```python ... ```
230
- if "```python" in generated_code:
231
- python_code_match = re.search(
232
- r"```python(.*?)```", generated_code, re.DOTALL
233
- )
234
- if python_code_match:
235
- generated_code = python_code_match.group(1).strip()
236
- elif "```" in generated_code:
237
- python_code_match = re.search(r"```(.*?)```", generated_code, re.DOTALL)
238
- if python_code_match:
239
- generated_code = python_code_match.group(1).strip()
240
-
241
- # 4. Execute the generated Python code
242
- # Create a safe execution environment
243
- try:
244
- local_vars = {"original_html": html_content}
245
- # Make sure we have BeautifulSoup available
246
- exec("from bs4 import BeautifulSoup", local_vars)
247
-
248
- # Execute the generated code
249
- exec(generated_code, local_vars)
250
-
251
- # Call the modify_html function
252
- if "modify_html" in local_vars:
253
- modified_html = local_vars["modify_html"](html_content)
254
- else:
255
- return (
256
- False,
257
- html_url,
258
- "Error: Generated code does not contain a modify_html function",
259
- )
260
-
261
- if not modified_html or not isinstance(modified_html, str):
262
- return (
263
- False,
264
- html_url,
265
- "Error: Generated code did not produce valid HTML",
266
- )
267
-
268
- # Check if the model actually made changes to the HTML
269
- if modified_html.strip() == html_content.strip():
270
- return (
271
- False,
272
- html_url,
273
- "Error: The model didn't make any changes to the HTML. It might not understand how to perform the requested modification.",
274
- )
275
-
276
- except Exception as e:
277
- return False, html_url, f"Error executing generated code: {str(e)}"
278
-
279
- # 5. Update the original HTML file in GCS
280
- try:
281
- storage = storage_store_factory()
282
- # Explicitly encode with UTF-8 to preserve non-ASCII characters
283
- file_object = BytesIO(modified_html.encode("utf-8"))
284
-
285
- # Store the modified file back to the same location using the decoded path
286
- # This ensures proper handling of special characters like @ in the path
287
- success, _ = await storage.store_file(
288
- decoded_document_path,
289
- file_object,
290
- public=True,
291
- content_type="text/html; charset=utf-8", # Explicitly set UTF-8 charset
292
- )
293
-
294
- if not success:
295
- return (
296
- False,
297
- html_url,
298
- "Error: Failed to update the HTML file in GCS",
299
- )
300
-
301
- # Add timestamp as query parameter to the URL to bypass cache
302
- timestamp = int(time.time())
303
- url_with_timestamp = urlparse(clean_url)
304
- new_query = urlencode({"t": timestamp})
305
- final_url = urlunparse(url_with_timestamp._replace(query=new_query))
306
-
307
- return True, final_url, None
308
-
309
- except Exception as e:
310
- return False, html_url, f"Error updating HTML file: {str(e)}"
311
-
312
- except Exception as e:
313
- return False, html_url, f"Error generating modification code: {str(e)}"
314
-
315
- except Exception as e:
316
- return False, html_url, f"Error: {str(e)}"
1
+ import os
2
+ from tempfile import NamedTemporaryFile
3
+ from typing import Dict, Any, Optional, Tuple
4
+ import re
5
+ from urllib.parse import urlparse, urlunparse, unquote, parse_qs, urlencode
6
+ import time
7
+ from io import BytesIO
8
+ import requests
9
+
10
+ from botrun_flow_lang.constants import MODIFY_GCS_HTML_MODEL
11
+ from .local_files import upload_html_and_get_public_url
12
+ from botrun_flow_lang.services.storage.storage_factory import storage_store_factory
13
+
14
+
15
+ async def generate_html_file(
16
+ html_content: str,
17
+ botrun_flow_lang_url: str,
18
+ user_id: str,
19
+ title: Optional[str] = None,
20
+ ) -> str:
21
+ """
22
+ Generate HTML file from complete HTML content (including JS and CSS) and upload it to GCS.
23
+
24
+ This function accepts complete HTML documents with JavaScript, CSS, and other elements.
25
+ You can pass either:
26
+ 1. A complete HTML document (<!DOCTYPE html><html>...<head>...</head><body>...</body></html>)
27
+ 2. HTML fragment that will be wrapped in a basic HTML structure if needed
28
+
29
+ The function preserves all JavaScript, CSS, and other elements in the HTML content.
30
+
31
+ Args:
32
+ html_content: Complete HTML content string, including head/body tags, JavaScript, CSS, etc.
33
+ botrun_flow_lang_url: URL for the botrun flow lang API
34
+ user_id: User ID for file upload
35
+ title: Optional title for the HTML page (used only if the HTML doesn't already have a title)
36
+
37
+ Returns:
38
+ str: URL for the HTML file or error message starting with "Error: "
39
+ """
40
+ try:
41
+ # Check if the content is already a complete HTML document
42
+ is_complete_html = html_content.strip().lower().startswith(
43
+ "<!doctype html"
44
+ ) or html_content.strip().lower().startswith("<html")
45
+
46
+ # Only process HTML content if it's not already a complete document
47
+ if not is_complete_html:
48
+ # If not a complete HTML document, check if it has a head tag
49
+ if "<head>" in html_content.lower():
50
+ # Has head tag but not complete doc, add title if needed and provided
51
+ if title and "<title>" not in html_content.lower():
52
+ html_content = html_content.replace(
53
+ "<head>", f"<head>\n <title>{title}</title>", 1
54
+ )
55
+ else:
56
+ # No head tag, wrap the content in a basic HTML structure
57
+ html_content = f"""
58
+ <!DOCTYPE html>
59
+ <html>
60
+ <head>
61
+ <meta charset="utf-8">
62
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
63
+ <title>{title if title else 'HTML Page'}</title>
64
+ <style>
65
+ body {{
66
+ font-family: "Microsoft JhengHei", "微軟正黑體", "Heiti TC", "黑體-繁", sans-serif;
67
+ }}
68
+ </style>
69
+ </head>
70
+ <body>
71
+ {html_content}
72
+ </body>
73
+ </html>
74
+ """
75
+ # If we have complete HTML but title is provided and no title exists
76
+ elif title and "<title>" not in html_content.lower():
77
+ # Try to insert title into the head tag
78
+ if "<head>" in html_content.lower():
79
+ html_content = html_content.replace(
80
+ "<head>", f"<head>\n <title>{title}</title>", 1
81
+ )
82
+
83
+ # Create temporary file
84
+ with NamedTemporaryFile(
85
+ suffix=".html", mode="w", encoding="utf-8", delete=False
86
+ ) as html_temp:
87
+ try:
88
+ # Save HTML content
89
+ html_temp.write(html_content)
90
+ html_temp.flush()
91
+
92
+ # Upload file to GCS
93
+ html_url = await upload_html_and_get_public_url(
94
+ html_temp.name, botrun_flow_lang_url, user_id
95
+ )
96
+
97
+ # Clean up temporary file
98
+ os.unlink(html_temp.name)
99
+
100
+ return html_url
101
+ except Exception as e:
102
+ # Clean up temporary file in case of error
103
+ os.unlink(html_temp.name)
104
+ return f"Error: {str(e)}"
105
+
106
+ except Exception as e:
107
+ return f"Error: {str(e)}"
108
+
109
+
110
+ # todo 還沒改完,我的測試案例測到 3 之後,就不會再增加了
111
+ async def modify_gcs_html(
112
+ html_url: str,
113
+ modification_instruction: str,
114
+ ) -> Tuple[bool, str, Optional[str]]:
115
+ """
116
+ Modify HTML file stored in Google Cloud Storage using Gemini 2.0 Flash LLM.
117
+
118
+ The function parses the GCS URL, fetches the HTML content, sends it to Gemini with
119
+ the modification instruction, executes the generated Python code to modify the HTML,
120
+ and updates the original file in GCS.
121
+
122
+ Args:
123
+ html_url: GCS URL pointing to an HTML file
124
+ (format: https://storage.googleapis.com/[bucket-name]/[doc-path])
125
+ modification_instruction: Natural language instruction for how to modify the HTML
126
+
127
+ Returns:
128
+ Tuple[bool, str, Optional[str]]: (success, original_url, error_message)
129
+ """
130
+ try:
131
+ # 1. Parse the GCS URL to extract bucket name and document path
132
+ url_parts = urlparse(html_url)
133
+
134
+ # Strip query parameters from the URL for processing
135
+ clean_url_parts = url_parts._replace(query="")
136
+ clean_url = urlunparse(clean_url_parts)
137
+
138
+ if not url_parts.netloc.startswith("storage.googleapis.com"):
139
+ return False, html_url, "Error: URL must be a Google Cloud Storage URL"
140
+
141
+ # Extract bucket name and document path correctly
142
+ path_segments = url_parts.path.strip("/").split("/", 1)
143
+ if len(path_segments) < 2:
144
+ return False, html_url, "Error: Invalid GCS URL format"
145
+
146
+ bucket_name = path_segments[0]
147
+ document_path = path_segments[1]
148
+
149
+ # URL decode the document path to handle encoded characters like %40
150
+ decoded_document_path = unquote(document_path)
151
+
152
+ # 2. Fetch the HTML content from GCS
153
+ try:
154
+ # First try to get the HTML directly via the URL
155
+ response = requests.get(clean_url)
156
+ if response.status_code != 200:
157
+ # If direct access fails, use the storage client
158
+ storage = storage_store_factory()
159
+ # Use the original (non-decoded) path for retrieval since that's how it's stored in GCS
160
+ file_object = await storage.retrieve_file(document_path)
161
+ if not file_object:
162
+ return (
163
+ False,
164
+ html_url,
165
+ "Error: Could not retrieve HTML file from GCS",
166
+ )
167
+ # Explicitly decode with UTF-8 to properly handle non-ASCII characters
168
+ html_content = file_object.getvalue().decode("utf-8")
169
+ else:
170
+ # Set encoding for response text (use UTF-8 or detect from content)
171
+ if "charset=" in response.headers.get("content-type", ""):
172
+ # Extract charset from content-type header
173
+ charset = (
174
+ response.headers.get("content-type")
175
+ .split("charset=")[1]
176
+ .split(";")[0]
177
+ )
178
+ response.encoding = charset
179
+ else:
180
+ # Default to UTF-8 if not specified
181
+ response.encoding = "utf-8"
182
+ html_content = response.text
183
+ except Exception as e:
184
+ return False, html_url, f"Error retrieving HTML content: {str(e)}"
185
+
186
+ # 3. Call Gemini API to generate Python code for HTML modification
187
+ try:
188
+ # Import here to avoid loading time and potential circular imports
189
+ import google.generativeai as genai
190
+ from google.generativeai.types import HarmCategory, HarmBlockThreshold
191
+
192
+ # Initialize Gemini client
193
+ api_key = os.getenv("GEMINI_API_KEY", "")
194
+ if not api_key:
195
+ return (
196
+ False,
197
+ html_url,
198
+ "Error: GEMINI_API_KEY environment variable not set",
199
+ )
200
+
201
+ genai.configure(api_key=api_key)
202
+ model = genai.GenerativeModel(MODIFY_GCS_HTML_MODEL)
203
+
204
+ # Create prompt for Gemini
205
+ prompt = f"""You are an expert HTML and Python developer.
206
+ Your task is to modify an HTML document according to the following instruction:
207
+ "{modification_instruction}"
208
+
209
+ Here is the HTML code to modify:
210
+ ```html
211
+ {html_content}
212
+ ```
213
+
214
+ Please provide minimal Python code that makes these modifications to the HTML.
215
+ Your code must:
216
+ 1. Use BeautifulSoup4 to parse and modify the HTML
217
+ 2. Return the modified HTML as a string
218
+ 3. Use a function called 'modify_html' that takes the original HTML as input and returns the modified HTML
219
+ 4. Only include essential code to make the exact change requested - no explanations or verbose comments
220
+ 5. Ensure you preserve the character encoding for non-ASCII characters
221
+ 6. Use BeautifulSoup with features='html.parser'
222
+
223
+ Only provide the Python code, nothing else. Keep the code minimal and direct."""
224
+
225
+ # Generate the Python code
226
+ response = model.generate_content(prompt)
227
+ generated_code = response.text
228
+
229
+ # Extract Python code if it's wrapped in ```python ... ```
230
+ if "```python" in generated_code:
231
+ python_code_match = re.search(
232
+ r"```python(.*?)```", generated_code, re.DOTALL
233
+ )
234
+ if python_code_match:
235
+ generated_code = python_code_match.group(1).strip()
236
+ elif "```" in generated_code:
237
+ python_code_match = re.search(r"```(.*?)```", generated_code, re.DOTALL)
238
+ if python_code_match:
239
+ generated_code = python_code_match.group(1).strip()
240
+
241
+ # 4. Execute the generated Python code
242
+ # Create a safe execution environment
243
+ try:
244
+ local_vars = {"original_html": html_content}
245
+ # Make sure we have BeautifulSoup available
246
+ exec("from bs4 import BeautifulSoup", local_vars)
247
+
248
+ # Execute the generated code
249
+ exec(generated_code, local_vars)
250
+
251
+ # Call the modify_html function
252
+ if "modify_html" in local_vars:
253
+ modified_html = local_vars["modify_html"](html_content)
254
+ else:
255
+ return (
256
+ False,
257
+ html_url,
258
+ "Error: Generated code does not contain a modify_html function",
259
+ )
260
+
261
+ if not modified_html or not isinstance(modified_html, str):
262
+ return (
263
+ False,
264
+ html_url,
265
+ "Error: Generated code did not produce valid HTML",
266
+ )
267
+
268
+ # Check if the model actually made changes to the HTML
269
+ if modified_html.strip() == html_content.strip():
270
+ return (
271
+ False,
272
+ html_url,
273
+ "Error: The model didn't make any changes to the HTML. It might not understand how to perform the requested modification.",
274
+ )
275
+
276
+ except Exception as e:
277
+ return False, html_url, f"Error executing generated code: {str(e)}"
278
+
279
+ # 5. Update the original HTML file in GCS
280
+ try:
281
+ storage = storage_store_factory()
282
+ # Explicitly encode with UTF-8 to preserve non-ASCII characters
283
+ file_object = BytesIO(modified_html.encode("utf-8"))
284
+
285
+ # Store the modified file back to the same location using the decoded path
286
+ # This ensures proper handling of special characters like @ in the path
287
+ success, _ = await storage.store_file(
288
+ decoded_document_path,
289
+ file_object,
290
+ public=True,
291
+ content_type="text/html; charset=utf-8", # Explicitly set UTF-8 charset
292
+ )
293
+
294
+ if not success:
295
+ return (
296
+ False,
297
+ html_url,
298
+ "Error: Failed to update the HTML file in GCS",
299
+ )
300
+
301
+ # Add timestamp as query parameter to the URL to bypass cache
302
+ timestamp = int(time.time())
303
+ url_with_timestamp = urlparse(clean_url)
304
+ new_query = urlencode({"t": timestamp})
305
+ final_url = urlunparse(url_with_timestamp._replace(query=new_query))
306
+
307
+ return True, final_url, None
308
+
309
+ except Exception as e:
310
+ return False, html_url, f"Error updating HTML file: {str(e)}"
311
+
312
+ except Exception as e:
313
+ return False, html_url, f"Error generating modification code: {str(e)}"
314
+
315
+ except Exception as e:
316
+ return False, html_url, f"Error: {str(e)}"