optexity-browser-use 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. browser_use/__init__.py +157 -0
  2. browser_use/actor/__init__.py +11 -0
  3. browser_use/actor/element.py +1175 -0
  4. browser_use/actor/mouse.py +134 -0
  5. browser_use/actor/page.py +561 -0
  6. browser_use/actor/playground/flights.py +41 -0
  7. browser_use/actor/playground/mixed_automation.py +54 -0
  8. browser_use/actor/playground/playground.py +236 -0
  9. browser_use/actor/utils.py +176 -0
  10. browser_use/agent/cloud_events.py +282 -0
  11. browser_use/agent/gif.py +424 -0
  12. browser_use/agent/judge.py +170 -0
  13. browser_use/agent/message_manager/service.py +473 -0
  14. browser_use/agent/message_manager/utils.py +52 -0
  15. browser_use/agent/message_manager/views.py +98 -0
  16. browser_use/agent/prompts.py +413 -0
  17. browser_use/agent/service.py +2316 -0
  18. browser_use/agent/system_prompt.md +185 -0
  19. browser_use/agent/system_prompt_flash.md +10 -0
  20. browser_use/agent/system_prompt_no_thinking.md +183 -0
  21. browser_use/agent/views.py +743 -0
  22. browser_use/browser/__init__.py +41 -0
  23. browser_use/browser/cloud/cloud.py +203 -0
  24. browser_use/browser/cloud/views.py +89 -0
  25. browser_use/browser/events.py +578 -0
  26. browser_use/browser/profile.py +1158 -0
  27. browser_use/browser/python_highlights.py +548 -0
  28. browser_use/browser/session.py +3225 -0
  29. browser_use/browser/session_manager.py +399 -0
  30. browser_use/browser/video_recorder.py +162 -0
  31. browser_use/browser/views.py +200 -0
  32. browser_use/browser/watchdog_base.py +260 -0
  33. browser_use/browser/watchdogs/__init__.py +0 -0
  34. browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
  35. browser_use/browser/watchdogs/crash_watchdog.py +335 -0
  36. browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
  37. browser_use/browser/watchdogs/dom_watchdog.py +817 -0
  38. browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
  39. browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
  40. browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
  41. browser_use/browser/watchdogs/popups_watchdog.py +143 -0
  42. browser_use/browser/watchdogs/recording_watchdog.py +126 -0
  43. browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
  44. browser_use/browser/watchdogs/security_watchdog.py +280 -0
  45. browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
  46. browser_use/cli.py +2359 -0
  47. browser_use/code_use/__init__.py +16 -0
  48. browser_use/code_use/formatting.py +192 -0
  49. browser_use/code_use/namespace.py +665 -0
  50. browser_use/code_use/notebook_export.py +276 -0
  51. browser_use/code_use/service.py +1340 -0
  52. browser_use/code_use/system_prompt.md +574 -0
  53. browser_use/code_use/utils.py +150 -0
  54. browser_use/code_use/views.py +171 -0
  55. browser_use/config.py +505 -0
  56. browser_use/controller/__init__.py +3 -0
  57. browser_use/dom/enhanced_snapshot.py +161 -0
  58. browser_use/dom/markdown_extractor.py +169 -0
  59. browser_use/dom/playground/extraction.py +312 -0
  60. browser_use/dom/playground/multi_act.py +32 -0
  61. browser_use/dom/serializer/clickable_elements.py +200 -0
  62. browser_use/dom/serializer/code_use_serializer.py +287 -0
  63. browser_use/dom/serializer/eval_serializer.py +478 -0
  64. browser_use/dom/serializer/html_serializer.py +212 -0
  65. browser_use/dom/serializer/paint_order.py +197 -0
  66. browser_use/dom/serializer/serializer.py +1170 -0
  67. browser_use/dom/service.py +825 -0
  68. browser_use/dom/utils.py +129 -0
  69. browser_use/dom/views.py +906 -0
  70. browser_use/exceptions.py +5 -0
  71. browser_use/filesystem/__init__.py +0 -0
  72. browser_use/filesystem/file_system.py +619 -0
  73. browser_use/init_cmd.py +376 -0
  74. browser_use/integrations/gmail/__init__.py +24 -0
  75. browser_use/integrations/gmail/actions.py +115 -0
  76. browser_use/integrations/gmail/service.py +225 -0
  77. browser_use/llm/__init__.py +155 -0
  78. browser_use/llm/anthropic/chat.py +242 -0
  79. browser_use/llm/anthropic/serializer.py +312 -0
  80. browser_use/llm/aws/__init__.py +36 -0
  81. browser_use/llm/aws/chat_anthropic.py +242 -0
  82. browser_use/llm/aws/chat_bedrock.py +289 -0
  83. browser_use/llm/aws/serializer.py +257 -0
  84. browser_use/llm/azure/chat.py +91 -0
  85. browser_use/llm/base.py +57 -0
  86. browser_use/llm/browser_use/__init__.py +3 -0
  87. browser_use/llm/browser_use/chat.py +201 -0
  88. browser_use/llm/cerebras/chat.py +193 -0
  89. browser_use/llm/cerebras/serializer.py +109 -0
  90. browser_use/llm/deepseek/chat.py +212 -0
  91. browser_use/llm/deepseek/serializer.py +109 -0
  92. browser_use/llm/exceptions.py +29 -0
  93. browser_use/llm/google/__init__.py +3 -0
  94. browser_use/llm/google/chat.py +542 -0
  95. browser_use/llm/google/serializer.py +120 -0
  96. browser_use/llm/groq/chat.py +229 -0
  97. browser_use/llm/groq/parser.py +158 -0
  98. browser_use/llm/groq/serializer.py +159 -0
  99. browser_use/llm/messages.py +238 -0
  100. browser_use/llm/models.py +271 -0
  101. browser_use/llm/oci_raw/__init__.py +10 -0
  102. browser_use/llm/oci_raw/chat.py +443 -0
  103. browser_use/llm/oci_raw/serializer.py +229 -0
  104. browser_use/llm/ollama/chat.py +97 -0
  105. browser_use/llm/ollama/serializer.py +143 -0
  106. browser_use/llm/openai/chat.py +264 -0
  107. browser_use/llm/openai/like.py +15 -0
  108. browser_use/llm/openai/serializer.py +165 -0
  109. browser_use/llm/openrouter/chat.py +211 -0
  110. browser_use/llm/openrouter/serializer.py +26 -0
  111. browser_use/llm/schema.py +176 -0
  112. browser_use/llm/views.py +48 -0
  113. browser_use/logging_config.py +330 -0
  114. browser_use/mcp/__init__.py +18 -0
  115. browser_use/mcp/__main__.py +12 -0
  116. browser_use/mcp/client.py +544 -0
  117. browser_use/mcp/controller.py +264 -0
  118. browser_use/mcp/server.py +1114 -0
  119. browser_use/observability.py +204 -0
  120. browser_use/py.typed +0 -0
  121. browser_use/sandbox/__init__.py +41 -0
  122. browser_use/sandbox/sandbox.py +637 -0
  123. browser_use/sandbox/views.py +132 -0
  124. browser_use/screenshots/__init__.py +1 -0
  125. browser_use/screenshots/service.py +52 -0
  126. browser_use/sync/__init__.py +6 -0
  127. browser_use/sync/auth.py +357 -0
  128. browser_use/sync/service.py +161 -0
  129. browser_use/telemetry/__init__.py +51 -0
  130. browser_use/telemetry/service.py +112 -0
  131. browser_use/telemetry/views.py +101 -0
  132. browser_use/tokens/__init__.py +0 -0
  133. browser_use/tokens/custom_pricing.py +24 -0
  134. browser_use/tokens/mappings.py +4 -0
  135. browser_use/tokens/service.py +580 -0
  136. browser_use/tokens/views.py +108 -0
  137. browser_use/tools/registry/service.py +572 -0
  138. browser_use/tools/registry/views.py +174 -0
  139. browser_use/tools/service.py +1675 -0
  140. browser_use/tools/utils.py +82 -0
  141. browser_use/tools/views.py +100 -0
  142. browser_use/utils.py +670 -0
  143. optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
  144. optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
  145. optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
  146. optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
  147. optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1340 @@
1
+ """Code-use agent service - Jupyter notebook-like code execution for browser automation."""
2
+
3
+ import asyncio
4
+ import datetime
5
+ import logging
6
+ import re
7
+ import traceback
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from uuid_extensions import uuid7str
12
+
13
+ from browser_use.browser import BrowserSession
14
+ from browser_use.browser.profile import BrowserProfile
15
+ from browser_use.dom.service import DomService
16
+ from browser_use.filesystem.file_system import FileSystem
17
+ from browser_use.llm.base import BaseChatModel
18
+ from browser_use.llm.messages import (
19
+ AssistantMessage,
20
+ BaseMessage,
21
+ ContentPartImageParam,
22
+ ContentPartTextParam,
23
+ ImageURL,
24
+ UserMessage,
25
+ )
26
+ from browser_use.screenshots.service import ScreenshotService
27
+ from browser_use.telemetry.service import ProductTelemetry
28
+ from browser_use.telemetry.views import AgentTelemetryEvent
29
+ from browser_use.tokens.service import TokenCost
30
+ from browser_use.tokens.views import UsageSummary
31
+ from browser_use.tools.service import CodeAgentTools, Tools
32
+ from browser_use.utils import get_browser_use_version
33
+
34
+ from .formatting import format_browser_state_for_llm
35
+ from .namespace import EvaluateError, create_namespace
36
+ from .utils import detect_token_limit_issue, extract_code_blocks, extract_url_from_task, truncate_message_content
37
+ from .views import (
38
+ CodeAgentHistory,
39
+ CodeAgentModelOutput,
40
+ CodeAgentResult,
41
+ CodeAgentState,
42
+ CodeAgentStepMetadata,
43
+ ExecutionStatus,
44
+ NotebookSession,
45
+ )
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+
50
+ class CodeAgent:
51
+ """
52
+ Agent that executes Python code in a notebook-like environment for browser automation.
53
+
54
+ This agent provides a Jupyter notebook-like interface where the LLM writes Python code
55
+ that gets executed in a persistent namespace with browser control functions available.
56
+ """
57
+
58
def __init__(
    self,
    task: str,
    # Optional parameters
    llm: BaseChatModel | None = None,
    browser_session: BrowserSession | None = None,
    browser: BrowserSession | None = None,  # Alias for browser_session
    tools: Tools | None = None,
    controller: Tools | None = None,  # Alias for tools
    # Agent settings
    page_extraction_llm: BaseChatModel | None = None,
    file_system: FileSystem | None = None,
    available_file_paths: list[str] | None = None,
    sensitive_data: dict[str, str | dict[str, str]] | None = None,
    max_steps: int = 100,
    max_failures: int = 8,
    max_validations: int = 0,
    use_vision: bool = True,
    calculate_cost: bool = False,
    **kwargs,
):
    """
    Initialize the code-use agent.

    Args:
        task: The task description for the agent
        llm: Optional ChatBrowserUse LLM instance (a default ChatBrowserUse is created if not provided)
        browser_session: Optional browser session (created lazily in run() if not provided)
        browser: Alias for browser_session; only one of the two may be given
        tools: Optional Tools instance (a CodeAgentTools is created if not provided)
        controller: Alias for tools; only one of the two may be given
        page_extraction_llm: Optional LLM for page extraction
        file_system: Optional file system for file operations (defaults to FileSystem(base_dir='./'))
        available_file_paths: Optional list of available file paths
        sensitive_data: Optional sensitive data dictionary
        max_steps: Maximum number of execution steps
        max_failures: Maximum consecutive errors before termination (default: 8)
        max_validations: Maximum number of times to run the validator agent (default: 0)
        use_vision: Whether to include screenshots in LLM messages (default: True)
        calculate_cost: Whether to calculate token costs (default: False)
        **kwargs: Additional keyword arguments for compatibility (logged at debug level and ignored)

    Raises:
        RuntimeError: If no llm was passed and the default ChatBrowserUse could not be created
        ValueError: If the llm is not a ChatBrowserUse model, or if both members of an
            alias pair (browser/browser_session, controller/tools) are supplied
    """
    # Function-scope import so this method does not depend on the module import block.
    import tempfile

    # Log and ignore unknown kwargs for compatibility
    if kwargs:
        logger.debug(f'Ignoring additional kwargs for CodeAgent compatibility: {list(kwargs.keys())}')

    if llm is None:
        try:
            from browser_use import ChatBrowserUse

            llm = ChatBrowserUse()
            logger.debug('CodeAgent using ChatBrowserUse')
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise RuntimeError(f'Failed to initialize CodeAgent LLM: {e}') from e

    # The model is identified by class name, so subclasses/wrappers whose name contains
    # 'ChatBrowserUse' are accepted as well.
    if 'ChatBrowserUse' not in llm.__class__.__name__:
        raise ValueError('This agent works only with ChatBrowserUse.')

    # browser / browser_session are aliases: passing both is an error, otherwise use whichever was given
    if browser is not None and browser_session is not None:
        raise ValueError('Cannot specify both "browser" and "browser_session" parameters. Use "browser" for the cleaner API.')
    if browser is not None:
        browser_session = browser

    # controller / tools are aliases: passing both is an error, otherwise use whichever was given
    if controller is not None and tools is not None:
        raise ValueError('Cannot specify both "controller" and "tools" parameters. Use "controller" for the cleaner API.')
    if controller is not None:
        tools = controller

    # Keep a default profile around only when run() will have to create the browser session itself
    self._browser_profile_for_init = BrowserProfile() if browser_session is None else None

    self.task = task
    self.llm = llm
    self.browser_session = browser_session
    self.tools = tools or CodeAgentTools()
    self.page_extraction_llm = page_extraction_llm
    self.file_system = file_system if file_system is not None else FileSystem(base_dir='./')
    self.available_file_paths = available_file_paths or []
    self.sensitive_data = sensitive_data
    self.max_steps = max_steps
    self.max_failures = max_failures
    self.max_validations = max_validations
    self.use_vision = use_vision

    self.session = NotebookSession()
    self.namespace: dict[str, Any] = {}
    self._llm_messages: list[BaseMessage] = []  # Internal LLM conversation history
    self.complete_history: list[CodeAgentHistory] = []  # Type-safe history with model_output and result
    self.dom_service: DomService | None = None
    self._last_browser_state_text: str | None = None  # Track last browser state text
    self._last_screenshot: str | None = None  # Track last screenshot (base64)
    self._consecutive_errors = 0  # Track consecutive errors for auto-termination
    self._validation_count = 0  # Track number of validator runs
    self._last_llm_usage: Any | None = None  # Track last LLM call usage stats
    self._step_start_time = 0.0  # Track step start time for duration calculation
    self.usage_summary: UsageSummary | None = None  # Track usage summary across run for history property

    # Initialize screenshot service for eval tracking.
    # Use the platform temp directory rather than a hard-coded '/tmp' so this also works
    # where '/tmp' does not exist or TMPDIR points elsewhere.
    self.id = uuid7str()
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    base_tmp = Path(tempfile.gettempdir())
    self.agent_directory = base_tmp / f'browser_use_code_agent_{self.id}_{timestamp}'
    self.screenshot_service = ScreenshotService(agent_directory=self.agent_directory)

    # Initialize token cost service for usage tracking
    self.token_cost_service = TokenCost(include_cost=calculate_cost)
    self.token_cost_service.register_llm(llm)
    if page_extraction_llm:
        self.token_cost_service.register_llm(page_extraction_llm)

    # Set version and source for telemetry: a checkout is recognised by the presence of
    # all repository marker entries three levels above this file.
    self.version = get_browser_use_version()
    try:
        package_root = Path(__file__).parent.parent.parent
        repo_markers = ('.git', 'README.md', 'docs', 'examples')
        if all((package_root / marker).exists() for marker in repo_markers):
            self.source = 'git'
        else:
            self.source = 'pip'
    except Exception:
        self.source = 'unknown'

    # Telemetry
    self.telemetry = ProductTelemetry()
182
+
183
async def run(self, max_steps: int | None = None) -> NotebookSession:
    """
    Run the agent to complete the task.

    Starts the browser if needed, builds the execution namespace, optionally navigates
    to a URL found in the task, then loops: ask the LLM for code, execute it, record the
    step, and feed the result back, until done() is accepted, a limit is hit, or a step
    raises. Afterwards the browser is closed and usage/telemetry are reported.

    Args:
        max_steps: Optional override for maximum number of steps (uses __init__ value if not provided).
            When given, it also replaces self.max_steps.

    Returns:
        The notebook session with all executed cells
    """
    # Use override if provided, otherwise keep value from __init__
    if max_steps is not None:
        self.max_steps = max_steps

    # Start browser if not provided
    if self.browser_session is None:
        assert self._browser_profile_for_init is not None
        self.browser_session = BrowserSession(browser_profile=self._browser_profile_for_init)
        await self.browser_session.start()

    # Initialize DOM service with cross-origin iframe support enabled
    self.dom_service = DomService(
        browser_session=self.browser_session,
        cross_origin_iframes=True,  # Enable for code-use agent to access forms in iframes
    )

    # Create namespace with all tools
    self.namespace = create_namespace(
        browser_session=self.browser_session,
        tools=self.tools,
        page_extraction_llm=self.page_extraction_llm,
        file_system=self.file_system,
        available_file_paths=self.available_file_paths,
        sensitive_data=self.sensitive_data,
    )

    # Initialize conversation with task
    self._llm_messages.append(UserMessage(content=f'Task: {self.task}'))

    # Error that aborted the run (if any); forwarded to telemetry at the end
    agent_run_error: str | None = None
    # True only when the loop ran out of steps (as opposed to breaking early)
    hit_step_limit = False

    # Extract URL from task and navigate if found
    initial_url = extract_url_from_task(self.task)
    if initial_url:
        await self._navigate_to_initial_url(initial_url)

    # Get initial browser state before first LLM call
    await self._refresh_browser_state('initial browser state')

    # Main execution loop
    for step in range(self.max_steps):
        logger.info(f'\n\n\n\n\n\n\nStep {step + 1}/{self.max_steps}')

        # Start timing this step
        self._step_start_time = datetime.datetime.now().timestamp()

        # Check if we're approaching the step limit or error limit and inject warning
        steps_remaining = self.max_steps - step - 1
        errors_remaining = self.max_failures - self._consecutive_errors
        should_warn = (
            steps_remaining <= 1  # Last step or next to last
            or errors_remaining <= 1  # One more error will terminate
            or (steps_remaining <= 2 and self._consecutive_errors >= 2)  # Close to both limits
        )
        if should_warn:
            self._llm_messages.append(UserMessage(content=self._limit_warning_text(steps_remaining)))

        try:
            # Fetch fresh browser state right before LLM call (only if not already set)
            if not self._last_browser_state_text:
                await self._refresh_browser_state(
                    'browser state before LLM call',
                    debug_note='🔍 Fetching browser state before LLM call...',
                )

            # Get code from LLM (this also adds to self._llm_messages)
            try:
                code, full_llm_response = await self._get_code_from_llm()
            except Exception as llm_error:
                # LLM call failed - count as consecutive error and retry
                self._consecutive_errors += 1
                logger.warning(
                    f'LLM call failed (consecutive errors: {self._consecutive_errors}/{self.max_failures}), retrying: {llm_error}'
                )

                # Check if we've hit the consecutive error limit
                if self._consecutive_errors >= self.max_failures:
                    logger.error(f'Terminating: {self.max_failures} consecutive LLM failures')
                    agent_run_error = f'{self.max_failures} consecutive LLM failures: {llm_error}'
                    break

                await asyncio.sleep(1)  # Brief pause before retry
                continue

            if not code or not code.strip():
                # If task is already done, empty code is fine (LLM explaining completion)
                if self._is_task_done():
                    logger.info('Task already marked as done, LLM provided explanation without code')
                    # Add the text response to history as a non-code step
                    await self._add_step_to_complete_history(
                        model_output_code='',
                        full_llm_response=full_llm_response,
                        output=full_llm_response,  # Treat the explanation as output
                        error=None,
                        screenshot_path=await self._capture_screenshot(step + 1),
                    )
                    break  # Exit the loop since task is done

                logger.warning('LLM returned empty code')
                self._consecutive_errors += 1

                # Empty responses count as failures, so they must honour max_failures
                # just like LLM and execution errors do.
                if self._consecutive_errors >= self.max_failures:
                    logger.error(
                        f'Terminating: {self.max_failures} consecutive errors reached. The agent is unable to make progress.'
                    )
                    break

                # new state
                await self._refresh_browser_state('new browser state')
                continue

            output, error = await self._run_code_blocks(code)

            # Track consecutive errors
            if error:
                self._consecutive_errors += 1
                logger.warning(f'Consecutive errors: {self._consecutive_errors}/{self.max_failures}')

                # Check if we've hit the consecutive error limit
                if self._consecutive_errors >= self.max_failures:
                    logger.error(
                        f'Terminating: {self.max_failures} consecutive errors reached. The agent is unable to make progress.'
                    )
                    # Add termination message to complete history before breaking
                    await self._add_step_to_complete_history(
                        model_output_code=code,
                        full_llm_response=f'[Terminated after {self.max_failures} consecutive errors]',
                        output=None,
                        error=f'Auto-terminated: {self.max_failures} consecutive errors without progress',
                        screenshot_path=None,
                    )
                    break
            else:
                # Reset consecutive error counter on success
                self._consecutive_errors = 0

            # Check if task is done - validate completion first if not at limits.
            # A rejected done() clears the done flag, so the checks below see "not done".
            if self._is_task_done():
                output = await self._settle_done_output(step, output)

            if output:
                # Check if this is the final done() output
                if self._is_task_done():
                    # Show done() output more prominently (first 300 characters)
                    logger.info(f'✓ Task completed - Final output from done():\n{output[:300]}')
                    # Also show files_to_display if they exist in namespace
                    attachments: list[str] | None = self.namespace.get('_task_attachments')  # type: ignore[assignment]
                    if attachments:
                        logger.info(f'Files displayed: {", ".join(attachments)}')
                else:
                    logger.info(f'Code output:\n{output}')

            # Take screenshot for eval tracking
            screenshot_path = await self._capture_screenshot(step + 1)

            # Add step to complete_history for eval system
            await self._add_step_to_complete_history(
                model_output_code=code,
                full_llm_response=full_llm_response,
                output=output,
                error=error,
                screenshot_path=screenshot_path,
            )

            # Check if task is done (after validation)
            if self._is_task_done():
                final_result: str | None = self.namespace.get('_task_result', output)  # type: ignore[assignment]
                logger.info('Task completed successfully')
                if final_result:
                    logger.info(f'Final result: {final_result}')
                break
            # If validation rejected done(), continue to next iteration
            # The feedback message has already been added to _llm_messages

            # Add result to LLM messages for next iteration (without browser state)
            result_message = self._format_execution_result(code, output, error, current_step=step + 1)
            self._llm_messages.append(UserMessage(content=truncate_message_content(result_message)))

        except Exception as e:
            # Route the traceback through logging and remember the failure for telemetry
            logger.error(f'Error in step {step + 1}: {e}', exc_info=True)
            agent_run_error = str(e)
            break
    else:
        # Loop completed without break - max_steps reached
        hit_step_limit = True
        logger.warning(f'Maximum steps ({self.max_steps}) reached without task completion')

    # If task is not done, capture the last step's output as partial result
    if not self._is_task_done() and self.complete_history:
        self._record_partial_result(hit_step_limit)

    # Log final summary if task was completed
    if self._is_task_done():
        self._log_completion_summary()

    # Auto-close browser if keep_alive is False
    await self.close()

    # Store usage summary for history property
    self.usage_summary = await self.token_cost_service.get_usage_summary()

    # Log token usage summary
    await self.token_cost_service.log_usage_summary()

    # Log telemetry event
    try:
        self._log_agent_event(max_steps=self.max_steps, agent_run_error=agent_run_error)
    except Exception as log_e:
        logger.error(f'Failed to log telemetry event: {log_e}', exc_info=True)

    return self.session

async def _refresh_browser_state(self, what: str, debug_note: str | None = None) -> None:
    """Fetch the browser state and cache its text/screenshot for the next LLM call.

    Does nothing when the browser session or DOM service is missing. Failures are
    logged as warnings and leave the cached values untouched.

    Args:
        what: Description used in the warning message ('Failed to get <what>: ...')
        debug_note: Optional debug message logged just before fetching
    """
    if not (self.browser_session and self.dom_service):
        return
    try:
        if debug_note:
            logger.debug(debug_note)
        state_text, screenshot = await self._get_browser_state()
        self._last_browser_state_text = state_text
        self._last_screenshot = screenshot
    except Exception as e:
        logger.warning(f'Failed to get {what}: {e}')

async def _navigate_to_initial_url(self, initial_url: str) -> None:
    """Navigate to the URL found in the task and record the attempt as a notebook cell."""
    # repr() keeps the recorded source valid Python even if the URL contains a quote
    nav_code = f'await navigate({initial_url!r})'
    try:
        logger.info(f'Extracted URL from task, navigating to: {initial_url}')
        # Use the navigate action from namespace
        await self.namespace['navigate'](initial_url)
        # Wait for page load
        await asyncio.sleep(2)

        # Record this navigation as a cell in the notebook
        cell = self.session.add_cell(source=nav_code)
        cell.status = ExecutionStatus.SUCCESS
        cell.execution_count = self.session.increment_execution_count()
        cell.output = f'Navigated to {initial_url}'

        # Get browser state after navigation for the cell
        if self.dom_service:
            try:
                browser_state_text, _ = await self._get_browser_state()
                cell.browser_state = browser_state_text
            except Exception as state_error:
                logger.debug(f'Failed to capture browser state for initial navigation cell: {state_error}')

    except Exception as e:
        logger.warning(f'Failed to navigate to extracted URL {initial_url}: {e}')
        # Record failed navigation as error cell
        cell = self.session.add_cell(source=nav_code)
        cell.status = ExecutionStatus.ERROR
        cell.execution_count = self.session.increment_execution_count()
        cell.error = str(e)

def _limit_warning_text(self, steps_remaining: int) -> str:
    """Build the warning injected into the conversation when step/error limits are close."""
    return (
        f'\n\n⚠️ CRITICAL WARNING: You are approaching execution limits!\n'
        f'- Steps remaining: {steps_remaining + 1}\n'
        f'- Consecutive errors: {self._consecutive_errors}/{self.max_failures}\n\n'
        f'YOU MUST call done() in your NEXT response, even if the task is incomplete:\n'
        f"- Set success=False if you couldn't complete the task\n"
        f'- Return EVERYTHING you found so far (partial data is better than nothing)\n'
        f"- Include any variables you've stored (products, all_data, etc.)\n"
        f"- Explain what worked and what didn't\n\n"
        f'Without done(), the user will receive NOTHING.'
    )

async def _run_code_blocks(self, code: str) -> tuple[str | None, str | None]:
    """Execute the LLM's code and return (output, error).

    When the namespace holds more than one 'python_*' block, the blocks are executed
    one after another, outputs are concatenated, and execution stops at the first
    error. Otherwise `code` is executed as a single unit.
    """
    all_blocks = self.namespace.get('_all_code_blocks', {})

    def block_order(key: str) -> tuple[int, int, str]:
        # NOTE(review): keys are presumably 'python_<index>'; numeric suffixes are ordered
        # numerically so 'python_10' runs after 'python_2'. Anything else sorts lexically.
        suffix = key[len('python_') :]
        if suffix.isdecimal():
            return (0, int(suffix), '')
        return (1, 0, suffix)

    python_blocks = sorted((k for k in all_blocks if k.startswith('python_')), key=block_order)

    if len(python_blocks) <= 1:
        # Single Python block - execute normally
        output, error, _ = await self._execute_code(code)
        return output, error

    # Multiple Python blocks - execute each sequentially
    output = None
    error = None
    for i, block_key in enumerate(python_blocks):
        logger.info(f'Executing Python block {i + 1}/{len(python_blocks)}')
        block_output, block_error, _ = await self._execute_code(all_blocks[block_key])

        # Accumulate outputs
        if block_output:
            output = (output or '') + block_output
        if block_error:
            error = block_error
            # Stop on first error
            break
    return output, error

async def _settle_done_output(self, step: int, output: str | None) -> str | None:
    """Handle a done() call: optionally validate it and return the output to record.

    Validation runs only while validations remain, at least 4 steps are left, and fewer
    than 3 consecutive errors have occurred. If the validator rejects completion, the
    done flags are cleared and feedback is queued for the LLM; the original `output`
    is returned unchanged. Otherwise the done() result (if any) replaces `output`.
    """
    # Get the final result from namespace (from done() call)
    final_result: str | None = self.namespace.get('_task_result')  # type: ignore[assignment]

    steps_remaining = self.max_steps - step - 1
    should_validate = (
        self._validation_count < self.max_validations  # Haven't exceeded max validations
        and steps_remaining >= 4  # At least 4 steps away from limit
        and self._consecutive_errors < 3  # Not in an error streak
    )

    if not should_validate:
        # At limits - skip validation and accept done()
        if self._validation_count >= self.max_validations:
            logger.info(f'Reached max validations ({self.max_validations}) - skipping validation and accepting done()')
        else:
            logger.info('At step/error limits - skipping validation')
        return final_result if final_result else output

    self._validation_count += 1
    logger.info('Validating task completion with LLM...')
    from .namespace import validate_task_completion

    is_complete, reasoning = await validate_task_completion(
        task=self.task,
        output=final_result,
        llm=self.llm,
    )

    if is_complete:
        logger.info('Validator: Task complete')
        # Override output with done message for final step
        return final_result if final_result else output

    # Task not truly complete - inject feedback and continue
    logger.warning('Validator: Task not complete, continuing...')
    validation_feedback = (
        f'\n\n⚠️ VALIDATOR FEEDBACK:\n'
        f'Your done() call was rejected. The task is NOT complete yet.\n\n'
        f'Validation reasoning:\n{reasoning}\n\n'
        f'You must continue working on the task. Analyze what is missing and complete it.\n'
        f'Do NOT call done() again until the task is truly finished.'
    )

    # Clear the done flag so execution continues
    self.namespace['_task_done'] = False
    self.namespace.pop('_task_result', None)
    self.namespace.pop('_task_success', None)

    # Add validation feedback to LLM messages
    self._llm_messages.append(UserMessage(content=validation_feedback))

    # Don't override output - let execution continue normally
    return output

def _record_partial_result(self, hit_step_limit: bool) -> None:
    """Rewrite the last history step's result as a partial-result summary.

    Called when the run ended without an accepted done(). The summary contains the last
    step's output/error plus the names of non-empty list/dict variables in the namespace.

    Args:
        hit_step_limit: True if the run used up all steps; False if it stopped early
    """
    last_step = self.complete_history[-1]
    last_result = last_step.result[0] if last_step.result else None
    last_output = last_result.extracted_content if last_result else None
    last_error = last_result.error if last_result else None

    # Only claim the step limit when that is really why the run ended
    if hit_step_limit:
        headline = f'Task incomplete - reached step limit ({self.max_steps} steps).'
    else:
        headline = 'Task incomplete - run stopped before reaching the step limit.'

    parts = [headline, 'Last step output:']
    if last_output:
        parts.append(f'\nOutput: {last_output}')
    if last_error:
        parts.append(f'\nError: {last_error}')

    # Add any accumulated variables that might contain useful data
    skipped_names = {'json', 'asyncio', 'csv', 're', 'datetime', 'Path'}
    data_vars = []
    for var_name in sorted(self.namespace):
        if var_name.startswith('_') or var_name in skipped_names:
            continue
        var_value = self.namespace[var_name]
        # Only non-empty lists/dicts are reported as potential collected data
        if isinstance(var_value, (list, dict)) and var_value:
            data_vars.append(f' - {var_name}: {type(var_value).__name__} with {len(var_value)} items')

    if data_vars:
        parts.append('\nVariables in namespace that may contain partial data:')
        parts.extend(data_vars)

    partial_result = '\n'.join(parts)

    # Update the last step's extracted_content with this partial result
    if last_result:
        last_result.extracted_content = partial_result
        last_result.is_done = False
        last_result.success = False

    logger.info(f'\nPartial result captured from last step:\n{partial_result}')

def _log_completion_summary(self) -> None:
    """Log the final banner with the done() result and any attached files."""
    logger.info('\n' + '=' * 60)
    logger.info('TASK COMPLETED SUCCESSFULLY')
    logger.info('=' * 60)
    final_result: str | None = self.namespace.get('_task_result')  # type: ignore[assignment]
    if final_result:
        logger.info(f'\nFinal Output:\n{final_result}')

    attachments: list[str] | None = self.namespace.get('_task_attachments')  # type: ignore[assignment]
    if attachments:
        logger.info(f'\nFiles Attached:\n{chr(10).join(attachments)}')
    logger.info('=' * 60 + '\n')
594
+
595
async def _get_code_from_llm(self) -> tuple[str, str]:
    """Query the LLM for the next cell and pull the Python code out of its reply.

    The pending browser state (plus the screenshot when vision is enabled) is
    attached as one extra trailing user message on a *copy* of the history and
    then cleared, so it is sent with this request only and never accumulates
    in ``self._llm_messages``.

    Returns:
        Tuple of (extracted_code, full_llm_response). When a token-limit or
        repetition problem is detected, the code is ``''`` and the response is
        a bracketed error marker; the bad completion is not added to history.
    """
    outgoing = self._llm_messages.copy()

    if self._last_browser_state_text:
        if self.use_vision and self._last_screenshot:
            # Multi-part message: state text first, then the screenshot.
            content_parts: list[ContentPartTextParam | ContentPartImageParam] = [
                ContentPartTextParam(text=self._last_browser_state_text),
                ContentPartImageParam(
                    image_url=ImageURL(
                        url=f'data:image/jpeg;base64,{self._last_screenshot}',
                        media_type='image/jpeg',
                        detail='auto',
                    ),
                ),
            ]
            outgoing.append(UserMessage(content=content_parts))
        else:
            # No usable screenshot: plain text message.
            outgoing.append(UserMessage(content=self._last_browser_state_text))

        # One-shot: consumed by this request only.
        self._last_browser_state_text = None
        self._last_screenshot = None

    response = await self.llm.ainvoke(outgoing)

    # Keep the usage numbers around for the step metadata.
    self._last_llm_usage = response.usage

    logger.info(f'LLM Response:\n{response.completion}')

    # Guard against truncated / degenerate completions before trusting them.
    max_tokens = getattr(self.llm, 'max_tokens', None)
    completion_tokens = response.usage.completion_tokens if response.usage else None
    is_problematic, issue_message = detect_token_limit_issue(
        completion=response.completion,
        completion_tokens=completion_tokens,
        max_tokens=max_tokens,
        stop_reason=response.stop_reason,
    )

    if is_problematic:
        logger.warning(f'Token limit issue detected: {issue_message}')
        # The bad completion is deliberately left out of the history; a
        # recovery prompt is appended in its place.
        recovery_prompt = (
            f'Your previous response hit a token limit or became repetitive: {issue_message}\n\n'
            'Please write a SHORT plan (2 sentences) for what to do next, then execute ONE simple action.'
        )
        self._llm_messages.append(UserMessage(content=recovery_prompt))
        return '', f'[Token limit error: {issue_message}]'

    full_response = response.completion

    # Split the reply into its fenced code blocks, keyed by block type.
    code_blocks = extract_code_blocks(response.completion)

    # Names of namespace variables that hold non-python block contents.
    self.namespace.setdefault('_code_block_vars', set())

    for block_type, block_content in code_blocks.items():
        if block_type.startswith('python'):
            continue
        # Non-python blocks become string variables the cell can reference.
        self.namespace[block_type] = block_content
        self.namespace['_code_block_vars'].add(block_type)
        print(f'→ Code block variable: {block_type} (str, {len(block_content)} chars)')
        logger.debug(f'Injected {block_type} block into namespace ({len(block_content)} chars)')

    # Every block is kept so the caller can run them in sequence.
    self.namespace['_all_code_blocks'] = code_blocks

    # Prefer the 'python' block; when the mapping has no 'python' key the
    # whole completion is used as the code.
    code = code_blocks.get('python', response.completion)

    # History stores a truncated copy to save context.
    history_text = truncate_message_content(response.completion)
    self._llm_messages.append(AssistantMessage(content=history_text))

    return code, full_response
697
+
698
def _print_variable_info(self, var_name: str, value: Any) -> None:
    """Print a one-line summary of a variable, unless it is a well-known name.

    Nothing is printed for common module/helper names or for variables that
    were injected from code blocks (tracked in ``namespace['_code_block_vars']``).
    """
    well_known_names = {
        'json',
        'asyncio',
        'csv',
        're',
        'datetime',
        'Path',
        'pd',
        'np',
        'plt',
        'requests',
        'BeautifulSoup',
        'PdfReader',
        'browser',
        'file_system',
    }
    if var_name in well_known_names:
        return

    # Code block variables were already announced when they were injected.
    if var_name in self.namespace.get('_code_block_vars', set()):
        return

    type_name = type(value).__name__
    if isinstance(value, (list, dict)):
        preview = str(value)[:100]
        summary = f'{type_name}, len={len(value)}, preview={preview}...'
    elif isinstance(value, str) and len(value) > 50:
        summary = f'str, {len(value)} chars, preview={value[:50]}...'
    elif callable(value):
        summary = 'function'
    else:
        summary = f'{type_name}, value={repr(value)[:50]}'

    print(f'→ Variable: {var_name} ({summary})')
734
+
735
async def _execute_code(self, code: str) -> tuple[str | None, str | None, str | None]:
    """
    Execute Python code in the namespace.

    Cells without ``await`` are exec'd directly against the namespace. Cells
    containing ``await`` / ``async with`` / ``async for`` are wrapped in an
    ``async def`` (similar to Jupyter's top-level await) with ``global``
    declarations injected for every name the cell rebinds that already exists
    in the namespace, so rebinding persists across cells.

    Anything printed while the cell runs is captured and returned, including
    output printed before an exception was raised.

    Args:
        code: The Python code to execute

    Returns:
        Tuple of (output, error, browser_state). ``browser_state`` is always
        None here; it is fetched separately before the next LLM call.
    """
    import ast
    import io
    import sys

    # Create new cell
    cell = self.session.add_cell(source=code)
    cell.status = ExecutionStatus.RUNNING
    cell.execution_count = self.session.increment_execution_count()

    output: str | None = None
    error: str | None = None

    try:
        # Capture stdout for the duration of the cell.
        old_stdout = sys.stdout
        captured = io.StringIO()
        sys.stdout = captured

        try:
            # Add asyncio to namespace if not already there
            if 'asyncio' not in self.namespace:
                self.namespace['asyncio'] = asyncio

            # Stored for done() validation
            self.namespace['_current_cell_code'] = code
            self.namespace['_consecutive_errors'] = self._consecutive_errors

            # Decide whether the cell needs the async wrapper.
            try:
                tree = ast.parse(code, mode='exec')
                has_await = any(isinstance(node, (ast.Await, ast.AsyncWith, ast.AsyncFor)) for node in ast.walk(tree))
            except SyntaxError:
                # If parse fails, let compile/exec below raise the error
                tree = None
                has_await = False

            if has_await and tree is not None:
                # Collect every name the cell (re)binds or deletes. Looking at
                # Name nodes in Store/Del context covers plain, augmented and
                # annotated assignment, walrus, tuple unpacking, loop targets
                # and `with ... as` - any of which would otherwise make the
                # name local to the wrapper and raise UnboundLocalError when
                # the cell also reads the existing namespace value.
                assigned_names: set[str] = set()
                user_global_names: set[str] = set()
                for node in ast.walk(tree):
                    if isinstance(node, ast.Name) and isinstance(node.ctx, (ast.Store, ast.Del)):
                        assigned_names.add(node.id)
                    elif isinstance(node, ast.Global):
                        # Track user's explicit global declarations
                        user_global_names.update(node.names)

                # Pre-define user-declared globals that don't exist yet so
                # "global foo" before "foo = ..." does not raise NameError.
                for name in user_global_names:
                    self.namespace.setdefault(name, None)

                # Only names already present in the namespace are declared global.
                existing_vars = {name for name in (assigned_names | user_global_names) if name in self.namespace}

                global_decl = ''
                has_global_decl = False
                if existing_vars:
                    vars_str = ', '.join(sorted(existing_vars))
                    global_decl = f'    global {vars_str}\n'
                    has_global_decl = True

                indented_code = '\n'.join('    ' + line if line.strip() else line for line in code.split('\n'))
                wrapped_code = (
                    'async def __code_exec__():\n'
                    f'{global_decl}{indented_code}\n'
                    '    # Return locals so we can update the namespace\n'
                    '    return locals()\n'
                    '\n'
                    '__code_exec_coro__ = __code_exec__()\n'
                )
                # Store whether we added a global declaration (needed for error line mapping)
                self.namespace['_has_global_decl'] = has_global_decl

                try:
                    # Compile and execute wrapper at module level
                    compiled_code = compile(wrapped_code, '<code>', 'exec')
                    exec(compiled_code, self.namespace, self.namespace)

                    # Await the coroutine, then copy its locals into the namespace
                    # so new variables persist across cells.
                    coro = self.namespace.get('__code_exec_coro__')
                    if coro:
                        result_locals = await coro
                        if result_locals:
                            for key, value in result_locals.items():
                                if not key.startswith('_'):
                                    self.namespace[key] = value
                finally:
                    # Clean up temporary variables even when the cell raised
                    self.namespace.pop('__code_exec_coro__', None)
                    self.namespace.pop('__code_exec__', None)
            else:
                # No await - execute directly at module level for natural variable scoping
                # This means x = x + 10 will work without needing 'global x'
                compiled_code = compile(code, '<code>', 'exec')
                exec(compiled_code, self.namespace, self.namespace)

        finally:
            sys.stdout = old_stdout
            # Read the capture here so prints made before a failure are kept.
            output = captured.getvalue() or None

        # Brief pause so the page can settle after code execution
        await asyncio.sleep(0.5)

        # Note: Browser state is now fetched right before LLM call instead of after each execution
        # This reduces unnecessary state fetches for operations that don't affect the browser

        cell.status = ExecutionStatus.SUCCESS
        cell.output = output
        cell.browser_state = None  # Will be captured in next iteration before LLM call

    except Exception as e:
        if isinstance(e, EvaluateError):
            # JavaScript execution failed - the message is already descriptive
            error = str(e)
            settle_seconds = 1
        elif isinstance(e, NameError):
            # Previously this branch left `error` as None, so the failure was
            # reported to the caller as a success.
            error = f'{type(e).__name__}: {e}'
            settle_seconds = 0.5
        elif isinstance(e, SyntaxError):
            # Show just the message plus hints, without a traceback
            error = self._format_syntax_error(e, code)
            settle_seconds = 1
        else:
            error_str = str(e)
            error = f'{type(e).__name__}: {error_str}' if error_str else f'{type(e).__name__} occurred'
            settle_seconds = 1

        cell.status = ExecutionStatus.ERROR
        cell.error = error
        logger.error(f'Code execution error: {error}')

        await asyncio.sleep(settle_seconds)

    # Browser state will be fetched before next LLM call
    return output, error, None

@staticmethod
def _format_syntax_error(e: SyntaxError, code: str) -> str:
    """Build a compact SyntaxError message with hints for unterminated strings."""
    error_msg = e.msg if e.msg else str(e)
    error = f'{type(e).__name__}: {error_msg}'
    msg_lower = error_msg.lower()
    unterminated_string = 'unterminated' in msg_lower and 'string' in msg_lower

    # Detect common f-string issues with JSON/JavaScript code
    if unterminated_string and code:
        has_fstring = bool(re.search(r'\bf["\']', code))
        has_json_pattern = bool(re.search(r'json\.dumps|"[^"]*\{[^"]*\}[^"]*"|\'[^\']*\{[^\']*\}[^\']*\'', code))
        has_js_pattern = bool(re.search(r'evaluate\(|await evaluate', code))

        if has_fstring and (has_json_pattern or has_js_pattern):
            error += (
                '\n\n💡 TIP: Detected f-string with JSON/JavaScript code containing {}.\n'
                ' Use separate ```js or ```markdown blocks instead of f-strings to avoid escaping issues.\n'
                ' If your code block needs ``` inside it, wrap with 4+ backticks: ````markdown code`\n'
            )

    # Hint describing which kind of string literal was left open
    if unterminated_string:
        is_triple = 'triple-quoted' in msg_lower

        # Detect prefix type from error message
        if 'f-string' in msg_lower and 'raw' in msg_lower:
            prefix, desc = 'rf or fr', 'raw f-string'
        elif 'f-string' in msg_lower:
            prefix, desc = 'f', 'f-string'
        elif 'raw' in msg_lower and 'bytes' in msg_lower:
            prefix, desc = 'rb or br', 'raw bytes'
        elif 'raw' in msg_lower:
            prefix, desc = 'r', 'raw string'
        elif 'bytes' in msg_lower:
            prefix, desc = 'b', 'bytes'
        else:
            prefix, desc = '', 'string'

        if is_triple:
            if prefix:
                hint = f"Hint: Unterminated {prefix}'''...''' or {prefix}\"\"\"...\"\"\" ({desc}). Check for missing closing quotes or unescaped quotes inside."
            else:
                hint = "Hint: Unterminated '''...''' or \"\"\"...\"\"\" detected. Check for missing closing quotes or unescaped quotes inside."
            hint += '\n If you need ``` inside your string, use a ````markdown varname` code block with 4+ backticks instead.'
        else:
            if prefix:
                hint = f'Hint: Unterminated {prefix}\'...\' or {prefix}"..." ({desc}). Check for missing closing quote or unescaped quotes inside.'
            else:
                hint = 'Hint: Unterminated \'...\' or "..." detected. Check for missing closing quote or unescaped quotes inside the string.'
        error += f'\n{hint}'

    # Show the problematic line from the code
    if e.text:
        error += f'\n{e.text}'
    elif e.lineno and code:
        # If e.text is empty, extract the line from the code
        lines = code.split('\n')
        if 0 < e.lineno <= len(lines):
            error += f'\n{lines[e.lineno - 1]}'

    return error
1013
+
1014
+ async def _get_browser_state(self) -> tuple[str, str | None]:
1015
+ """Get the current browser state as text with ultra-minimal DOM structure for code agents.
1016
+
1017
+ Returns:
1018
+ Tuple of (browser_state_text, screenshot_base64)
1019
+ """
1020
+ if not self.browser_session or not self.dom_service:
1021
+ return 'Browser state not available', None
1022
+
1023
+ try:
1024
+ # Get full browser state including screenshot if use_vision is enabled
1025
+ include_screenshot = True
1026
+ state = await self.browser_session.get_browser_state_summary(include_screenshot=include_screenshot)
1027
+
1028
+ # Format browser state with namespace context
1029
+ browser_state_text = await format_browser_state_for_llm(
1030
+ state=state, namespace=self.namespace, browser_session=self.browser_session
1031
+ )
1032
+
1033
+ screenshot = state.screenshot if include_screenshot else None
1034
+ return browser_state_text, screenshot
1035
+
1036
+ except Exception as e:
1037
+ logger.error(f'Failed to get browser state: {e}')
1038
+ return f'Error getting browser state: {e}', None
1039
+
1040
+ def _format_execution_result(self, code: str, output: str | None, error: str | None, current_step: int | None = None) -> str:
1041
+ """Format the execution result for the LLM (without browser state)."""
1042
+ result = []
1043
+
1044
+ # Add step progress header if step number provided
1045
+ if current_step is not None:
1046
+ progress_header = f'Step {current_step}/{self.max_steps} executed'
1047
+ # Add consecutive failure tracking if there are errors
1048
+ if error and self._consecutive_errors > 0:
1049
+ progress_header += f' | Consecutive failures: {self._consecutive_errors}/{self.max_failures}'
1050
+ result.append(progress_header)
1051
+
1052
+ if error:
1053
+ result.append(f'Error: {error}')
1054
+
1055
+ if output:
1056
+ # Truncate output if too long
1057
+ if len(output) > 10000:
1058
+ output = output[:9950] + '\n[Truncated after 10000 characters]'
1059
+ result.append(f'Output: {output}')
1060
+ if len(result) == 0:
1061
+ result.append('Executed')
1062
+ return '\n'.join(result)
1063
+
1064
def _is_task_done(self) -> bool:
    """Report whether the namespace carries the ``_task_done`` completion marker."""
    # The marker is read from the shared namespace; a missing marker means not done.
    done_marker = self.namespace.get('_task_done', False)
    return done_marker
1068
+
1069
+ async def _capture_screenshot(self, step_number: int) -> str | None:
1070
+ """Capture and store screenshot for eval tracking."""
1071
+ if not self.browser_session:
1072
+ return None
1073
+
1074
+ try:
1075
+ # Get browser state summary which includes screenshot
1076
+ state = await self.browser_session.get_browser_state_summary(include_screenshot=True)
1077
+ if state and state.screenshot:
1078
+ # Store screenshot using screenshot service
1079
+ screenshot_path = await self.screenshot_service.store_screenshot(state.screenshot, step_number)
1080
+ return str(screenshot_path) if screenshot_path else None
1081
+ except Exception as e:
1082
+ logger.warning(f'Failed to capture screenshot for step {step_number}: {e}')
1083
+ return None
1084
+
1085
async def _add_step_to_complete_history(
    self,
    model_output_code: str,
    full_llm_response: str,
    output: str | None,
    error: str | None,
    screenshot_path: str | None,
) -> None:
    """Append one step (result, state, metadata, model output) to ``complete_history``.

    The current page URL and title are looked up on a best-effort basis; a
    failure there is logged at debug level and leaves both as None.
    """
    url: str | None = None
    title: str | None = None
    session = self.browser_session
    if session:
        try:
            url = await session.get_current_page_url()
            # The title is read by evaluating document.title over CDP.
            cdp_session = await session.get_or_create_cdp_session()
            evaluation = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={'expression': 'document.title', 'returnByValue': True},
                session_id=cdp_session.session_id,
            )
            title = evaluation.get('result', {}).get('value')
        except Exception as exc:
            logger.debug(f'Failed to get browser URL/title for history: {exc}')

    is_done = self._is_task_done()

    # Self-reported success only counts once the task is done, and only when
    # it is a real bool.
    self_reported_success: bool | None = None
    if is_done:
        reported = self.namespace.get('_task_success')
        if isinstance(reported, bool):
            self_reported_success = reported

    result_entry = CodeAgentResult(
        extracted_content=output or None,
        error=error or None,
        is_done=is_done,
        success=self_reported_success,
    )

    state_entry = CodeAgentState(url=url, title=title, screenshot_path=screenshot_path)

    last_usage = self._last_llm_usage
    step_end_time = datetime.datetime.now().timestamp()
    metadata_entry = CodeAgentStepMetadata(
        input_tokens=last_usage.prompt_tokens if last_usage else None,
        output_tokens=last_usage.completion_tokens if last_usage else None,
        step_start_time=self._step_start_time,
        step_end_time=step_end_time,
    )

    # A model output entry exists only when there is code or a response to record.
    model_output_entry: CodeAgentModelOutput | None = None
    if model_output_code or full_llm_response:
        model_output_entry = CodeAgentModelOutput(
            model_output=model_output_code or '',
            full_response=full_llm_response or '',
        )

    self.complete_history.append(
        CodeAgentHistory(
            model_output=model_output_entry,
            result=[result_entry],
            state=state_entry,
            metadata=metadata_entry,
            screenshot_path=screenshot_path,  # Keep for backward compatibility
        )
    )
1157
+
1158
def _log_agent_event(self, max_steps: int, agent_run_error: str | None = None) -> None:
    """Assemble the telemetry event describing this run and hand it to ``self.telemetry``."""
    from urllib.parse import urlparse

    token_summary = self.token_cost_service.get_usage_tokens_for_model(self.llm.model)

    steps = self.complete_history

    # One entry per step: the raw LLM response wrapped as a single pseudo-action,
    # or None when the step recorded no response.
    action_history_data: list[list[dict[str, Any]] | None] = [
        [{'llm_response': step.model_output.full_response}]
        if step.model_output and step.model_output.full_response
        else None
        for step in steps
    ]

    # Only a string task result is reported.
    final_result: Any = self.namespace.get('_task_result')
    final_result_str: str | None = final_result if isinstance(final_result, str) else None

    # Distinct URLs in first-seen order.
    urls_visited: list[str] = list(dict.fromkeys(step.state.url for step in steps if step.state.url))

    # Every non-empty error across all step results.
    errors: list[str] = [result.error for step in steps for result in step.result if result.error]

    # A bool self-report wins; otherwise a finished task counts as False and
    # an unfinished one as unknown.
    is_done = self._is_task_done()
    task_success: Any = self.namespace.get('_task_success')
    self_reported_success: bool | None
    if isinstance(task_success, bool):
        self_reported_success = task_success
    elif is_done:
        self_reported_success = False
    else:
        self_reported_success = None

    cdp_host = None
    if self.browser_session and self.browser_session.cdp_url:
        cdp_host = urlparse(self.browser_session.cdp_url).hostname

    event = AgentTelemetryEvent(
        task=self.task,
        model=self.llm.model,
        model_provider=self.llm.provider,
        max_steps=max_steps,
        max_actions_per_step=1,  # CodeAgent executes one code cell per step
        use_vision=self.use_vision,
        version=self.version,
        source=self.source,
        cdp_url=cdp_host,
        agent_type='code',  # CodeAgent identifier
        action_errors=errors,
        action_history=action_history_data,
        urls_visited=urls_visited,
        steps=len(steps),
        total_input_tokens=token_summary.prompt_tokens,
        total_output_tokens=token_summary.completion_tokens,
        prompt_cached_tokens=token_summary.prompt_cached_tokens,
        total_tokens=token_summary.total_tokens,
        total_duration_seconds=sum(step.metadata.duration_seconds for step in steps if step.metadata),
        success=self_reported_success,
        final_result_response=final_result_str,
        error_message=agent_run_error,
    )
    self.telemetry.capture(event)
1226
+
1227
+ def screenshot_paths(self, n_last: int | None = None) -> list[str | None]:
1228
+ """
1229
+ Get screenshot paths from complete_history for eval system.
1230
+
1231
+ Args:
1232
+ n_last: Optional number of last screenshots to return
1233
+
1234
+ Returns:
1235
+ List of screenshot file paths (or None for missing screenshots)
1236
+ """
1237
+ paths = [step.screenshot_path for step in self.complete_history]
1238
+
1239
+ if n_last is not None:
1240
+ return paths[-n_last:] if len(paths) > n_last else paths
1241
+
1242
+ return paths
1243
+
1244
@property
def message_manager(self) -> Any:
    """
    Compatibility shim for the eval system.

    Returns a lightweight stand-in whose ``last_input_messages`` attribute is
    this agent's LLM message list.
    """

    class MockMessageManager:
        def __init__(self, llm_messages: list[BaseMessage]) -> None:
            # The eval system reads the messages from this attribute.
            self.last_input_messages = llm_messages

    shim = MockMessageManager(self._llm_messages)
    return shim
1257
+
1258
+ @property
1259
+ def history(self) -> Any:
1260
+ """
1261
+ Compatibility property for eval system.
1262
+ Returns a mock AgentHistoryList object with history attribute containing complete_history.
1263
+ This is what the eval system expects when it does: agent_history = agent.history
1264
+ """
1265
+
1266
+ class DictToObject:
1267
+ """Convert dict to object with attribute access for eval compatibility."""
1268
+
1269
+ def __init__(self, data: dict[str, Any]) -> None:
1270
+ for key, value in data.items():
1271
+ if isinstance(value, dict):
1272
+ setattr(self, key, DictToObject(value))
1273
+ elif isinstance(value, list):
1274
+ setattr(self, key, [DictToObject(item) if isinstance(item, dict) else item for item in value])
1275
+ else:
1276
+ setattr(self, key, value)
1277
+
1278
+ def __getattr__(self, name: str) -> None:
1279
+ """Provide safe attribute access with defaults for missing attributes."""
1280
+ # Return None for missing attributes instead of raising AttributeError
1281
+ # This handles cases where eval system checks attributes that CodeAgent doesn't set
1282
+ return None
1283
+
1284
+ def model_dump(self) -> dict[str, Any]:
1285
+ """Support model_dump() calls from eval system."""
1286
+ result = {}
1287
+ for key, value in self.__dict__.items():
1288
+ if isinstance(value, DictToObject):
1289
+ result[key] = value.model_dump()
1290
+ elif isinstance(value, list):
1291
+ result[key] = [item.model_dump() if isinstance(item, DictToObject) else item for item in value]
1292
+ else:
1293
+ result[key] = value
1294
+ return result
1295
+
1296
+ def get_screenshot(self) -> str | None:
1297
+ """Support get_screenshot() calls for state objects."""
1298
+ # Load screenshot from disk and return as base64 string (matching BrowserStateHistory implementation)
1299
+ if not hasattr(self, 'screenshot_path') or not self.screenshot_path:
1300
+ return None
1301
+
1302
+ import base64
1303
+ from pathlib import Path
1304
+
1305
+ path_obj = Path(self.screenshot_path)
1306
+ if not path_obj.exists():
1307
+ return None
1308
+
1309
+ try:
1310
+ with open(path_obj, 'rb') as f:
1311
+ screenshot_data = f.read()
1312
+ return base64.b64encode(screenshot_data).decode('utf-8')
1313
+ except Exception:
1314
+ return None
1315
+
1316
+ class MockAgentHistoryList:
1317
+ def __init__(self, complete_history: list[CodeAgentHistory], usage_summary: UsageSummary | None) -> None:
1318
+ # Convert each CodeAgentHistory to dict, then to object with attribute access
1319
+ self.history = [DictToObject(item.model_dump()) for item in complete_history]
1320
+ # Use the provided usage summary
1321
+ self.usage = usage_summary
1322
+
1323
+ return MockAgentHistoryList(self.complete_history, self.usage_summary)
1324
+
1325
async def close(self) -> None:
    """Shut down the browser session unless its profile asks to keep it alive."""
    session = self.browser_session
    if not session:
        return

    # keep_alive on the browser profile decides whether the browser survives.
    if session.browser_profile.keep_alive:
        logger.debug('Browser keep_alive is True, not closing browser session')
        return

    await session.kill()
1333
+
1334
async def __aenter__(self) -> 'CodeAgent':
    """Enter the async context; the agent itself is the context value."""
    agent = self
    return agent
1337
+
1338
+ async def __aexit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: Any) -> None:
1339
+ """Async context manager exit."""
1340
+ await self.close()