optexity-browser-use 0.9.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_use/__init__.py +157 -0
- browser_use/actor/__init__.py +11 -0
- browser_use/actor/element.py +1175 -0
- browser_use/actor/mouse.py +134 -0
- browser_use/actor/page.py +561 -0
- browser_use/actor/playground/flights.py +41 -0
- browser_use/actor/playground/mixed_automation.py +54 -0
- browser_use/actor/playground/playground.py +236 -0
- browser_use/actor/utils.py +176 -0
- browser_use/agent/cloud_events.py +282 -0
- browser_use/agent/gif.py +424 -0
- browser_use/agent/judge.py +170 -0
- browser_use/agent/message_manager/service.py +473 -0
- browser_use/agent/message_manager/utils.py +52 -0
- browser_use/agent/message_manager/views.py +98 -0
- browser_use/agent/prompts.py +413 -0
- browser_use/agent/service.py +2316 -0
- browser_use/agent/system_prompt.md +185 -0
- browser_use/agent/system_prompt_flash.md +10 -0
- browser_use/agent/system_prompt_no_thinking.md +183 -0
- browser_use/agent/views.py +743 -0
- browser_use/browser/__init__.py +41 -0
- browser_use/browser/cloud/cloud.py +203 -0
- browser_use/browser/cloud/views.py +89 -0
- browser_use/browser/events.py +578 -0
- browser_use/browser/profile.py +1158 -0
- browser_use/browser/python_highlights.py +548 -0
- browser_use/browser/session.py +3225 -0
- browser_use/browser/session_manager.py +399 -0
- browser_use/browser/video_recorder.py +162 -0
- browser_use/browser/views.py +200 -0
- browser_use/browser/watchdog_base.py +260 -0
- browser_use/browser/watchdogs/__init__.py +0 -0
- browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
- browser_use/browser/watchdogs/crash_watchdog.py +335 -0
- browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
- browser_use/browser/watchdogs/dom_watchdog.py +817 -0
- browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
- browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
- browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
- browser_use/browser/watchdogs/popups_watchdog.py +143 -0
- browser_use/browser/watchdogs/recording_watchdog.py +126 -0
- browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
- browser_use/browser/watchdogs/security_watchdog.py +280 -0
- browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
- browser_use/cli.py +2359 -0
- browser_use/code_use/__init__.py +16 -0
- browser_use/code_use/formatting.py +192 -0
- browser_use/code_use/namespace.py +665 -0
- browser_use/code_use/notebook_export.py +276 -0
- browser_use/code_use/service.py +1340 -0
- browser_use/code_use/system_prompt.md +574 -0
- browser_use/code_use/utils.py +150 -0
- browser_use/code_use/views.py +171 -0
- browser_use/config.py +505 -0
- browser_use/controller/__init__.py +3 -0
- browser_use/dom/enhanced_snapshot.py +161 -0
- browser_use/dom/markdown_extractor.py +169 -0
- browser_use/dom/playground/extraction.py +312 -0
- browser_use/dom/playground/multi_act.py +32 -0
- browser_use/dom/serializer/clickable_elements.py +200 -0
- browser_use/dom/serializer/code_use_serializer.py +287 -0
- browser_use/dom/serializer/eval_serializer.py +478 -0
- browser_use/dom/serializer/html_serializer.py +212 -0
- browser_use/dom/serializer/paint_order.py +197 -0
- browser_use/dom/serializer/serializer.py +1170 -0
- browser_use/dom/service.py +825 -0
- browser_use/dom/utils.py +129 -0
- browser_use/dom/views.py +906 -0
- browser_use/exceptions.py +5 -0
- browser_use/filesystem/__init__.py +0 -0
- browser_use/filesystem/file_system.py +619 -0
- browser_use/init_cmd.py +376 -0
- browser_use/integrations/gmail/__init__.py +24 -0
- browser_use/integrations/gmail/actions.py +115 -0
- browser_use/integrations/gmail/service.py +225 -0
- browser_use/llm/__init__.py +155 -0
- browser_use/llm/anthropic/chat.py +242 -0
- browser_use/llm/anthropic/serializer.py +312 -0
- browser_use/llm/aws/__init__.py +36 -0
- browser_use/llm/aws/chat_anthropic.py +242 -0
- browser_use/llm/aws/chat_bedrock.py +289 -0
- browser_use/llm/aws/serializer.py +257 -0
- browser_use/llm/azure/chat.py +91 -0
- browser_use/llm/base.py +57 -0
- browser_use/llm/browser_use/__init__.py +3 -0
- browser_use/llm/browser_use/chat.py +201 -0
- browser_use/llm/cerebras/chat.py +193 -0
- browser_use/llm/cerebras/serializer.py +109 -0
- browser_use/llm/deepseek/chat.py +212 -0
- browser_use/llm/deepseek/serializer.py +109 -0
- browser_use/llm/exceptions.py +29 -0
- browser_use/llm/google/__init__.py +3 -0
- browser_use/llm/google/chat.py +542 -0
- browser_use/llm/google/serializer.py +120 -0
- browser_use/llm/groq/chat.py +229 -0
- browser_use/llm/groq/parser.py +158 -0
- browser_use/llm/groq/serializer.py +159 -0
- browser_use/llm/messages.py +238 -0
- browser_use/llm/models.py +271 -0
- browser_use/llm/oci_raw/__init__.py +10 -0
- browser_use/llm/oci_raw/chat.py +443 -0
- browser_use/llm/oci_raw/serializer.py +229 -0
- browser_use/llm/ollama/chat.py +97 -0
- browser_use/llm/ollama/serializer.py +143 -0
- browser_use/llm/openai/chat.py +264 -0
- browser_use/llm/openai/like.py +15 -0
- browser_use/llm/openai/serializer.py +165 -0
- browser_use/llm/openrouter/chat.py +211 -0
- browser_use/llm/openrouter/serializer.py +26 -0
- browser_use/llm/schema.py +176 -0
- browser_use/llm/views.py +48 -0
- browser_use/logging_config.py +330 -0
- browser_use/mcp/__init__.py +18 -0
- browser_use/mcp/__main__.py +12 -0
- browser_use/mcp/client.py +544 -0
- browser_use/mcp/controller.py +264 -0
- browser_use/mcp/server.py +1114 -0
- browser_use/observability.py +204 -0
- browser_use/py.typed +0 -0
- browser_use/sandbox/__init__.py +41 -0
- browser_use/sandbox/sandbox.py +637 -0
- browser_use/sandbox/views.py +132 -0
- browser_use/screenshots/__init__.py +1 -0
- browser_use/screenshots/service.py +52 -0
- browser_use/sync/__init__.py +6 -0
- browser_use/sync/auth.py +357 -0
- browser_use/sync/service.py +161 -0
- browser_use/telemetry/__init__.py +51 -0
- browser_use/telemetry/service.py +112 -0
- browser_use/telemetry/views.py +101 -0
- browser_use/tokens/__init__.py +0 -0
- browser_use/tokens/custom_pricing.py +24 -0
- browser_use/tokens/mappings.py +4 -0
- browser_use/tokens/service.py +580 -0
- browser_use/tokens/views.py +108 -0
- browser_use/tools/registry/service.py +572 -0
- browser_use/tools/registry/views.py +174 -0
- browser_use/tools/service.py +1675 -0
- browser_use/tools/utils.py +82 -0
- browser_use/tools/views.py +100 -0
- browser_use/utils.py +670 -0
- optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
- optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
- optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
- optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
- optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1340 @@
|
|
|
1
|
+
"""Code-use agent service - Jupyter notebook-like code execution for browser automation."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import datetime
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
import traceback
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from uuid_extensions import uuid7str
|
|
12
|
+
|
|
13
|
+
from browser_use.browser import BrowserSession
|
|
14
|
+
from browser_use.browser.profile import BrowserProfile
|
|
15
|
+
from browser_use.dom.service import DomService
|
|
16
|
+
from browser_use.filesystem.file_system import FileSystem
|
|
17
|
+
from browser_use.llm.base import BaseChatModel
|
|
18
|
+
from browser_use.llm.messages import (
|
|
19
|
+
AssistantMessage,
|
|
20
|
+
BaseMessage,
|
|
21
|
+
ContentPartImageParam,
|
|
22
|
+
ContentPartTextParam,
|
|
23
|
+
ImageURL,
|
|
24
|
+
UserMessage,
|
|
25
|
+
)
|
|
26
|
+
from browser_use.screenshots.service import ScreenshotService
|
|
27
|
+
from browser_use.telemetry.service import ProductTelemetry
|
|
28
|
+
from browser_use.telemetry.views import AgentTelemetryEvent
|
|
29
|
+
from browser_use.tokens.service import TokenCost
|
|
30
|
+
from browser_use.tokens.views import UsageSummary
|
|
31
|
+
from browser_use.tools.service import CodeAgentTools, Tools
|
|
32
|
+
from browser_use.utils import get_browser_use_version
|
|
33
|
+
|
|
34
|
+
from .formatting import format_browser_state_for_llm
|
|
35
|
+
from .namespace import EvaluateError, create_namespace
|
|
36
|
+
from .utils import detect_token_limit_issue, extract_code_blocks, extract_url_from_task, truncate_message_content
|
|
37
|
+
from .views import (
|
|
38
|
+
CodeAgentHistory,
|
|
39
|
+
CodeAgentModelOutput,
|
|
40
|
+
CodeAgentResult,
|
|
41
|
+
CodeAgentState,
|
|
42
|
+
CodeAgentStepMetadata,
|
|
43
|
+
ExecutionStatus,
|
|
44
|
+
NotebookSession,
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class CodeAgent:
	"""
	Agent that executes Python code in a notebook-like environment for browser automation.

	This agent provides a Jupyter notebook-like interface where the LLM writes Python code
	that gets executed in a persistent namespace with browser control functions available.

	The agent keeps three parallel records of a run:
	  - ``session``: the notebook-style cell log (``NotebookSession``),
	  - ``_llm_messages``: the raw LLM conversation history,
	  - ``complete_history``: typed per-step records used by the eval system.
	"""
|
|
57
|
+
|
|
58
|
+
def __init__(
	self,
	task: str,
	# Optional parameters
	llm: BaseChatModel | None = None,
	browser_session: BrowserSession | None = None,
	browser: BrowserSession | None = None,  # Alias for browser_session
	tools: Tools | None = None,
	controller: Tools | None = None,  # Alias for tools
	# Agent settings
	page_extraction_llm: BaseChatModel | None = None,
	file_system: FileSystem | None = None,
	available_file_paths: list[str] | None = None,
	sensitive_data: dict[str, str | dict[str, str]] | None = None,
	max_steps: int = 100,
	max_failures: int = 8,
	max_validations: int = 0,
	use_vision: bool = True,
	calculate_cost: bool = False,
	**kwargs,
):
	"""
	Initialize the code-use agent.

	Args:
		task: The task description for the agent
		llm: Optional ChatBrowserUse LLM instance (will create default if not provided;
			any other LLM class is rejected with ValueError)
		browser_session: Optional browser session (will be created if not provided) [DEPRECATED: use browser]
		browser: Optional browser session (cleaner API; mutually exclusive with browser_session)
		tools: Optional Tools instance (will create default CodeAgentTools if not provided)
		controller: Optional Tools instance (cleaner API; mutually exclusive with tools)
		page_extraction_llm: Optional LLM for page extraction
		file_system: Optional file system for file operations (defaults to FileSystem rooted at './')
		available_file_paths: Optional list of available file paths
		sensitive_data: Optional sensitive data dictionary
		max_steps: Maximum number of execution steps
		max_failures: Maximum consecutive errors before termination (default: 8)
		max_validations: Maximum number of times to run the validator agent (default: 0)
		use_vision: Whether to include screenshots in LLM messages (default: True)
		calculate_cost: Whether to calculate token costs (default: False)
		**kwargs: Additional keyword arguments for compatibility (ignored)

	Raises:
		RuntimeError: If no ``llm`` was given and the default ChatBrowserUse fails to initialize.
		ValueError: If ``llm`` is not a ChatBrowserUse instance, or if both members of an
			alias pair (``browser``/``browser_session``, ``controller``/``tools``) are provided.
	"""
	# Log and ignore unknown kwargs for compatibility with the regular Agent signature
	if kwargs:
		logger.debug(f'Ignoring additional kwargs for CodeAgent compatibility: {list(kwargs.keys())}')

	if llm is None:
		try:
			# Imported lazily so module import does not require the cloud LLM to be configured
			from browser_use import ChatBrowserUse

			llm = ChatBrowserUse()
			logger.debug('CodeAgent using ChatBrowserUse')
		except Exception as e:
			raise RuntimeError(f'Failed to initialize CodeAgent LLM: {e}')

	# Name-based check (not isinstance) so wrapped/proxied ChatBrowserUse subclasses also pass
	if 'ChatBrowserUse' not in llm.__class__.__name__:
		raise ValueError('This agent works only with ChatBrowserUse.')

	# Handle browser vs browser_session parameter (browser takes precedence)
	if browser and browser_session:
		raise ValueError('Cannot specify both "browser" and "browser_session" parameters. Use "browser" for the cleaner API.')
	browser_session = browser or browser_session

	# Handle controller vs tools parameter (controller takes precedence)
	if controller and tools:
		raise ValueError('Cannot specify both "controller" and "tools" parameters. Use "controller" for the cleaner API.')
	tools = controller or tools

	# Store browser_profile for creating browser session if needed; stays None when a
	# session was supplied by the caller (run() asserts on this invariant)
	self._browser_profile_for_init = BrowserProfile() if browser_session is None else None

	self.task = task
	self.llm = llm
	self.browser_session = browser_session
	self.tools = tools or CodeAgentTools()
	self.page_extraction_llm = page_extraction_llm
	# NOTE(review): FileSystem(base_dir='./') roots file operations in the current working
	# directory — presumably intentional for notebook-style runs; confirm this is desired.
	self.file_system = file_system if file_system is not None else FileSystem(base_dir='./')
	self.available_file_paths = available_file_paths or []
	self.sensitive_data = sensitive_data
	self.max_steps = max_steps
	self.max_failures = max_failures
	self.max_validations = max_validations
	self.use_vision = use_vision

	self.session = NotebookSession()
	self.namespace: dict[str, Any] = {}  # Persistent execution namespace, populated in run()
	self._llm_messages: list[BaseMessage] = []  # Internal LLM conversation history
	self.complete_history: list[CodeAgentHistory] = []  # Type-safe history with model_output and result
	self.dom_service: DomService | None = None
	self._last_browser_state_text: str | None = None  # Track last browser state text
	self._last_screenshot: str | None = None  # Track last screenshot (base64)
	self._consecutive_errors = 0  # Track consecutive errors for auto-termination
	self._validation_count = 0  # Track number of validator runs
	self._last_llm_usage: Any | None = None  # Track last LLM call usage stats
	self._step_start_time = 0.0  # Track step start time for duration calculation
	self.usage_summary: UsageSummary | None = None  # Track usage summary across run for history property

	# Initialize screenshot service for eval tracking; directory name combines a uuid7
	# with a local-time timestamp for uniqueness and human readability
	self.id = uuid7str()
	timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
	base_tmp = Path('/tmp')
	self.agent_directory = base_tmp / f'browser_use_code_agent_{self.id}_{timestamp}'
	self.screenshot_service = ScreenshotService(agent_directory=self.agent_directory)

	# Initialize token cost service for usage tracking (cost calculation is opt-in)
	self.token_cost_service = TokenCost(include_cost=calculate_cost)
	self.token_cost_service.register_llm(llm)
	if page_extraction_llm:
		self.token_cost_service.register_llm(page_extraction_llm)

	# Set version and source for telemetry: a checkout is detected heuristically by the
	# presence of repo marker files three levels above this module
	self.version = get_browser_use_version()
	try:
		package_root = Path(__file__).parent.parent.parent
		repo_files = ['.git', 'README.md', 'docs', 'examples']
		if all(Path(package_root / file).exists() for file in repo_files):
			self.source = 'git'
		else:
			self.source = 'pip'
	except Exception:
		self.source = 'unknown'

	# Telemetry
	self.telemetry = ProductTelemetry()
|
|
182
|
+
|
|
183
|
+
async def run(self, max_steps: int | None = None) -> NotebookSession:
	"""
	Run the agent to complete the task.

	Drives the main loop: fetch browser state → ask the LLM for code → execute it →
	record the step → repeat until done() is called, limits are hit, or an
	unrecoverable error occurs. Always closes the browser and logs usage/telemetry
	before returning.

	Args:
		max_steps: Optional override for maximum number of steps (uses __init__ value if not provided)

	Returns:
		The notebook session with all executed cells
	"""
	# Use override if provided, otherwise use value from __init__
	steps_to_run = max_steps if max_steps is not None else self.max_steps
	self.max_steps = steps_to_run
	# Start browser if not provided
	if self.browser_session is None:
		# __init__ guarantees a profile was stashed whenever no session was passed in
		assert self._browser_profile_for_init is not None
		self.browser_session = BrowserSession(browser_profile=self._browser_profile_for_init)
	await self.browser_session.start()

	# Initialize DOM service with cross-origin iframe support enabled
	self.dom_service = DomService(
		browser_session=self.browser_session,
		cross_origin_iframes=True,  # Enable for code-use agent to access forms in iframes
	)

	# Create namespace with all tools
	self.namespace = create_namespace(
		browser_session=self.browser_session,
		tools=self.tools,
		page_extraction_llm=self.page_extraction_llm,
		file_system=self.file_system,
		available_file_paths=self.available_file_paths,
		sensitive_data=self.sensitive_data,
	)

	# Initialize conversation with task
	self._llm_messages.append(UserMessage(content=f'Task: {self.task}'))

	# Track agent run error for telemetry
	# NOTE(review): agent_run_error is never reassigned anywhere below, so the telemetry
	# event at the end always receives None — confirm whether step errors should set it.
	agent_run_error: str | None = None

	# Extract URL from task and navigate if found, saving the LLM one trivial step
	initial_url = extract_url_from_task(self.task)
	if initial_url:
		try:
			logger.info(f'Extracted URL from task, navigating to: {initial_url}')
			# Use the navigate action from namespace
			await self.namespace['navigate'](initial_url)
			# Wait for page load
			await asyncio.sleep(2)

			# Record this navigation as a cell in the notebook
			nav_code = f"await navigate('{initial_url}')"
			cell = self.session.add_cell(source=nav_code)
			cell.status = ExecutionStatus.SUCCESS
			cell.execution_count = self.session.increment_execution_count()
			cell.output = f'Navigated to {initial_url}'

			# Get browser state after navigation for the cell
			if self.dom_service:
				try:
					browser_state_text, _ = await self._get_browser_state()
					cell.browser_state = browser_state_text
				except Exception as state_error:
					logger.debug(f'Failed to capture browser state for initial navigation cell: {state_error}')

		except Exception as e:
			logger.warning(f'Failed to navigate to extracted URL {initial_url}: {e}')
			# Record failed navigation as error cell
			nav_code = f"await navigate('{initial_url}')"
			cell = self.session.add_cell(source=nav_code)
			cell.status = ExecutionStatus.ERROR
			cell.execution_count = self.session.increment_execution_count()
			cell.error = str(e)

	# Get initial browser state before first LLM call
	if self.browser_session and self.dom_service:
		try:
			browser_state_text, screenshot = await self._get_browser_state()
			self._last_browser_state_text = browser_state_text
			self._last_screenshot = screenshot
		except Exception as e:
			logger.warning(f'Failed to get initial browser state: {e}')

	# Main execution loop
	for step in range(self.max_steps):
		logger.info(f'\n\n\n\n\n\n\nStep {step + 1}/{self.max_steps}')

		# Start timing this step
		self._step_start_time = datetime.datetime.now().timestamp()

		# Check if we're approaching the step limit or error limit and inject warning
		steps_remaining = self.max_steps - step - 1
		errors_remaining = self.max_failures - self._consecutive_errors

		should_warn = (
			steps_remaining <= 1  # Last step or next to last
			or errors_remaining <= 1  # One more error will terminate
			or (steps_remaining <= 2 and self._consecutive_errors >= 2)  # Close to both limits
		)

		if should_warn:
			# Urge the LLM to call done() so partial results are not lost on termination.
			# Note: appended every qualifying step, so the warning may appear repeatedly.
			warning_message = (
				f'\n\n⚠️ CRITICAL WARNING: You are approaching execution limits!\n'
				f'- Steps remaining: {steps_remaining + 1}\n'
				f'- Consecutive errors: {self._consecutive_errors}/{self.max_failures}\n\n'
				f'YOU MUST call done() in your NEXT response, even if the task is incomplete:\n'
				f"- Set success=False if you couldn't complete the task\n"
				f'- Return EVERYTHING you found so far (partial data is better than nothing)\n'
				f"- Include any variables you've stored (products, all_data, etc.)\n"
				f"- Explain what worked and what didn't\n\n"
				f'Without done(), the user will receive NOTHING.'
			)
			self._llm_messages.append(UserMessage(content=warning_message))

		try:
			# Fetch fresh browser state right before LLM call (only if not already set)
			if not self._last_browser_state_text and self.browser_session and self.dom_service:
				try:
					logger.debug('🔍 Fetching browser state before LLM call...')
					browser_state_text, screenshot = await self._get_browser_state()
					self._last_browser_state_text = browser_state_text
					self._last_screenshot = screenshot
				except Exception as e:
					logger.warning(f'Failed to get browser state before LLM call: {e}')

			# Get code from LLM (this also adds to self._llm_messages)
			try:
				code, full_llm_response = await self._get_code_from_llm()
			except Exception as llm_error:
				# LLM call failed - count as consecutive error and retry
				self._consecutive_errors += 1
				logger.warning(
					f'LLM call failed (consecutive errors: {self._consecutive_errors}/{self.max_failures}), retrying: {llm_error}'
				)

				# Check if we've hit the consecutive error limit
				if self._consecutive_errors >= self.max_failures:
					logger.error(f'Terminating: {self.max_failures} consecutive LLM failures')
					break

				await asyncio.sleep(1)  # Brief pause before retry
				continue

			if not code or code.strip() == '':
				# If task is already done, empty code is fine (LLM explaining completion)
				if self._is_task_done():
					logger.info('Task already marked as done, LLM provided explanation without code')
					# Add the text response to history as a non-code step
					await self._add_step_to_complete_history(
						model_output_code='',
						full_llm_response=full_llm_response,
						output=full_llm_response,  # Treat the explanation as output
						error=None,
						screenshot_path=await self._capture_screenshot(step + 1),
					)
					break  # Exit the loop since task is done

				# Empty code without done() counts toward the consecutive-error limit
				logger.warning('LLM returned empty code')
				self._consecutive_errors += 1

				# Refresh browser state so the retry prompt reflects the current page
				if self.browser_session and self.dom_service:
					try:
						browser_state_text, screenshot = await self._get_browser_state()
						self._last_browser_state_text = browser_state_text
						self._last_screenshot = screenshot
					except Exception as e:
						logger.warning(f'Failed to get new browser state: {e}')
				continue

			# Execute code blocks sequentially if multiple python blocks exist
			# This allows JS/bash blocks to be injected into namespace before Python code uses them
			all_blocks = self.namespace.get('_all_code_blocks', {})
			python_blocks = [k for k in sorted(all_blocks.keys()) if k.startswith('python_')]

			if len(python_blocks) > 1:
				# Multiple Python blocks - execute each sequentially
				output = None
				error = None

				for i, block_key in enumerate(python_blocks):
					logger.info(f'Executing Python block {i + 1}/{len(python_blocks)}')
					block_code = all_blocks[block_key]
					block_output, block_error, _ = await self._execute_code(block_code)

					# Accumulate outputs
					if block_output:
						output = (output or '') + block_output
					if block_error:
						error = block_error
						# Stop on first error
						break
			else:
				# Single Python block - execute normally
				output, error, _ = await self._execute_code(code)

			# Track consecutive errors
			if error:
				self._consecutive_errors += 1
				logger.warning(f'Consecutive errors: {self._consecutive_errors}/{self.max_failures}')

				# Check if we've hit the consecutive error limit
				if self._consecutive_errors >= self.max_failures:
					logger.error(
						f'Terminating: {self.max_failures} consecutive errors reached. The agent is unable to make progress.'
					)
					# Add termination message to complete history before breaking
					await self._add_step_to_complete_history(
						model_output_code=code,
						full_llm_response=f'[Terminated after {self.max_failures} consecutive errors]',
						output=None,
						error=f'Auto-terminated: {self.max_failures} consecutive errors without progress',
						screenshot_path=None,
					)
					break
			else:
				# Reset consecutive error counter on success
				self._consecutive_errors = 0

			# Check if task is done - validate completion first if not at limits
			if self._is_task_done():
				# Get the final result from namespace (from done() call)
				final_result: str | None = self.namespace.get('_task_result')  # type: ignore[assignment]

				# Check if we should validate (not at step/error limits and under max validations)
				steps_remaining = self.max_steps - step - 1
				should_validate = (
					self._validation_count < self.max_validations  # Haven't exceeded max validations
					and steps_remaining >= 4  # At least 4 steps away from limit
					and self._consecutive_errors < 3  # Not close to error limit (8 consecutive)
				)

				if should_validate:
					self._validation_count += 1
					logger.info('Validating task completion with LLM...')
					from .namespace import validate_task_completion

					is_complete, reasoning = await validate_task_completion(
						task=self.task,
						output=final_result,
						llm=self.llm,
					)

					if not is_complete:
						# Task not truly complete - inject feedback and continue
						logger.warning('Validator: Task not complete, continuing...')
						validation_feedback = (
							f'\n\n⚠️ VALIDATOR FEEDBACK:\n'
							f'Your done() call was rejected. The task is NOT complete yet.\n\n'
							f'Validation reasoning:\n{reasoning}\n\n'
							f'You must continue working on the task. Analyze what is missing and complete it.\n'
							f'Do NOT call done() again until the task is truly finished.'
						)

						# Clear the done flag so execution continues
						self.namespace['_task_done'] = False
						self.namespace.pop('_task_result', None)
						self.namespace.pop('_task_success', None)

						# Add validation feedback to LLM messages
						self._llm_messages.append(UserMessage(content=validation_feedback))

						# Don't override output - let execution continue normally
					else:
						logger.info('Validator: Task complete')
						# Override output with done message for final step
						if final_result:
							output = final_result
				else:
					# At limits - skip validation and accept done()
					if self._validation_count >= self.max_validations:
						logger.info(
							f'Reached max validations ({self.max_validations}) - skipping validation and accepting done()'
						)
					else:
						logger.info('At step/error limits - skipping validation')
					if final_result:
						output = final_result

			if output:
				# Check if this is the final done() output
				if self._is_task_done():
					# Show done() output more prominently (truncated to 300 chars for logs)
					logger.info(
						f'✓ Task completed - Final output from done():\n{output[:300] if len(output) > 300 else output}'
					)
					# Also show files_to_display if they exist in namespace
					attachments: list[str] | None = self.namespace.get('_task_attachments')  # type: ignore[assignment]
					if attachments:
						logger.info(f'Files displayed: {", ".join(attachments)}')
				else:
					logger.info(f'Code output:\n{output}')

			# Browser state is now only logged when fetched before LLM call (not after execution)

			# Take screenshot for eval tracking
			screenshot_path = await self._capture_screenshot(step + 1)

			# Add step to complete_history for eval system
			await self._add_step_to_complete_history(
				model_output_code=code,
				full_llm_response=full_llm_response,
				output=output,
				error=error,
				screenshot_path=screenshot_path,
			)

			# Check if task is done (after validation)
			if self._is_task_done():
				# Get the final result from namespace
				final_result: str | None = self.namespace.get('_task_result', output)  # type: ignore[assignment]
				logger.info('Task completed successfully')
				if final_result:
					logger.info(f'Final result: {final_result}')
				break
			# If validation rejected done(), continue to next iteration
			# The feedback message has already been added to _llm_messages

			# Add result to LLM messages for next iteration (without browser state)
			result_message = self._format_execution_result(code, output, error, current_step=step + 1)
			truncated_result = truncate_message_content(result_message)
			self._llm_messages.append(UserMessage(content=truncated_result))

		except Exception as e:
			# Unexpected error anywhere in the step aborts the run (no retry)
			logger.error(f'Error in step {step + 1}: {e}')
			traceback.print_exc()
			break
	else:
		# Loop completed without break - max_steps reached
		logger.warning(f'Maximum steps ({self.max_steps}) reached without task completion')

	# If task is not done, capture the last step's output as partial result
	if not self._is_task_done() and self.complete_history:
		# Get the last step's output/error and use it as final extracted_content
		last_step = self.complete_history[-1]
		last_result = last_step.result[0] if last_step.result else None
		last_output = last_result.extracted_content if last_result else None
		last_error = last_result.error if last_result else None

		# Build a partial result message from the last step
		partial_result_parts = []
		partial_result_parts.append(f'Task incomplete - reached step limit ({self.max_steps} steps).')
		partial_result_parts.append('Last step output:')

		if last_output:
			partial_result_parts.append(f'\nOutput: {last_output}')
		if last_error:
			partial_result_parts.append(f'\nError: {last_error}')

		# Add any accumulated variables that might contain useful data
		# (skip private names and the handful of stdlib modules the namespace exposes)
		data_vars = []
		for var_name in sorted(self.namespace.keys()):
			if not var_name.startswith('_') and var_name not in {'json', 'asyncio', 'csv', 're', 'datetime', 'Path'}:
				var_value = self.namespace[var_name]
				# Check if it's a list or dict that might contain collected data
				if isinstance(var_value, (list, dict)) and var_value:
					data_vars.append(f' - {var_name}: {type(var_value).__name__} with {len(var_value)} items')

		if data_vars:
			partial_result_parts.append('\nVariables in namespace that may contain partial data:')
			partial_result_parts.extend(data_vars)

		partial_result = '\n'.join(partial_result_parts)

		# Update the last step's extracted_content with this partial result
		if last_result:
			last_result.extracted_content = partial_result
			last_result.is_done = False
			last_result.success = False

		logger.info(f'\nPartial result captured from last step:\n{partial_result}')

	# Log final summary if task was completed
	if self._is_task_done():
		logger.info('\n' + '=' * 60)
		logger.info('TASK COMPLETED SUCCESSFULLY')
		logger.info('=' * 60)
		final_result: str | None = self.namespace.get('_task_result')  # type: ignore[assignment]
		if final_result:
			logger.info(f'\nFinal Output:\n{final_result}')

		attachments: list[str] | None = self.namespace.get('_task_attachments')  # type: ignore[assignment]
		if attachments:
			logger.info(f'\nFiles Attached:\n{chr(10).join(attachments)}')
		logger.info('=' * 60 + '\n')

	# Auto-close browser if keep_alive is False
	# (presumably close() respects the profile's keep_alive setting — behavior lives in close())
	await self.close()

	# Store usage summary for history property
	self.usage_summary = await self.token_cost_service.get_usage_summary()

	# Log token usage summary
	await self.token_cost_service.log_usage_summary()

	# Log telemetry event (best-effort; never let telemetry failures mask the result)
	try:
		self._log_agent_event(max_steps=self.max_steps, agent_run_error=agent_run_error)
	except Exception as log_e:
		logger.error(f'Failed to log telemetry event: {log_e}', exc_info=True)

	return self.session
|
|
594
|
+
|
|
595
|
+
async def _get_code_from_llm(self) -> tuple[str, str]:
    """Get Python code from the LLM.

    Sends the accumulated message history to the LLM, optionally prepending
    the latest browser state (text plus screenshot when vision is enabled)
    as a one-shot message that is NOT kept in history. Extracts fenced code
    blocks from the reply, injects non-python blocks into the execution
    namespace as string variables, and appends a truncated copy of the
    reply to the message history.

    Returns:
        Tuple of (extracted_code, full_llm_response)
    """
    # Prepare messages for this request
    # Include browser state as separate message if available (not accumulated in history)
    messages_to_send = self._llm_messages.copy()

    if self._last_browser_state_text:
        # Create message with optional screenshot
        if self.use_vision and self._last_screenshot:
            # Build content with text + screenshot
            content_parts: list[ContentPartTextParam | ContentPartImageParam] = [
                ContentPartTextParam(text=self._last_browser_state_text)
            ]

            # Add screenshot (stored as a base64-encoded JPEG string)
            content_parts.append(
                ContentPartImageParam(
                    image_url=ImageURL(
                        url=f'data:image/jpeg;base64,{self._last_screenshot}',
                        media_type='image/jpeg',
                        detail='auto',
                    ),
                )
            )

            messages_to_send.append(UserMessage(content=content_parts))
        else:
            # Text only
            messages_to_send.append(UserMessage(content=self._last_browser_state_text))

        # Clear browser state after including it so it's only in this request
        self._last_browser_state_text = None
        self._last_screenshot = None

    # Call LLM with message history (including temporary browser state message)
    response = await self.llm.ainvoke(messages_to_send)

    # Store usage stats from this LLM call (read later for step metadata)
    self._last_llm_usage = response.usage

    # Log the LLM's raw output for debugging
    logger.info(f'LLM Response:\n{response.completion}')

    # Check for token limit or repetition issues
    max_tokens = getattr(self.llm, 'max_tokens', None)
    completion_tokens = response.usage.completion_tokens if response.usage else None
    is_problematic, issue_message = detect_token_limit_issue(
        completion=response.completion,
        completion_tokens=completion_tokens,
        max_tokens=max_tokens,
        stop_reason=response.stop_reason,
    )

    if is_problematic:
        logger.warning(f'Token limit issue detected: {issue_message}')
        # Don't add the bad response to history
        # Instead, inject a system message prompting recovery
        recovery_prompt = (
            f'Your previous response hit a token limit or became repetitive: {issue_message}\n\n'
            'Please write a SHORT plan (2 sentences) for what to do next, then execute ONE simple action.'
        )
        self._llm_messages.append(UserMessage(content=recovery_prompt))
        # Return a controlled error message instead of corrupted code
        return '', f'[Token limit error: {issue_message}]'

    # Store the full response
    full_response = response.completion

    # Extract code blocks from response
    # Support multiple code block types: python, js, bash, markdown
    code_blocks = extract_code_blocks(response.completion)

    # Inject non-python blocks into namespace as variables
    # Track which variables are code blocks for browser state display
    if '_code_block_vars' not in self.namespace:
        self.namespace['_code_block_vars'] = set()

    for block_type, block_content in code_blocks.items():
        if not block_type.startswith('python'):
            # Store js, bash, markdown blocks (and named variants) as variables in namespace
            self.namespace[block_type] = block_content
            self.namespace['_code_block_vars'].add(block_type)
            print(f'→ Code block variable: {block_type} (str, {len(block_content)} chars)')
            logger.debug(f'Injected {block_type} block into namespace ({len(block_content)} chars)')

    # Store all code blocks for sequential execution
    self.namespace['_all_code_blocks'] = code_blocks

    # Get Python code if it exists
    # If no python block exists and no other code blocks exist, return empty string to skip execution
    # This prevents treating plain text explanations as code
    # NOTE(review): the comment above describes falling back to an empty string,
    # but the fallback below is the FULL completion text — plain prose replies
    # would be treated as code. Confirm which behavior is intended.
    code = code_blocks.get('python', response.completion)

    # Add to LLM messages (truncate for history to save context)
    truncated_completion = truncate_message_content(response.completion)
    self._llm_messages.append(AssistantMessage(content=truncated_completion))

    return code, full_response
|
|
697
|
+
|
|
698
|
+
def _print_variable_info(self, var_name: str, value: Any) -> None:
|
|
699
|
+
"""Print compact info about a variable assignment."""
|
|
700
|
+
# Skip built-in modules and known imports
|
|
701
|
+
skip_names = {
|
|
702
|
+
'json',
|
|
703
|
+
'asyncio',
|
|
704
|
+
'csv',
|
|
705
|
+
're',
|
|
706
|
+
'datetime',
|
|
707
|
+
'Path',
|
|
708
|
+
'pd',
|
|
709
|
+
'np',
|
|
710
|
+
'plt',
|
|
711
|
+
'requests',
|
|
712
|
+
'BeautifulSoup',
|
|
713
|
+
'PdfReader',
|
|
714
|
+
'browser',
|
|
715
|
+
'file_system',
|
|
716
|
+
}
|
|
717
|
+
if var_name in skip_names:
|
|
718
|
+
return
|
|
719
|
+
|
|
720
|
+
# Skip code block variables (already printed)
|
|
721
|
+
if '_code_block_vars' in self.namespace and var_name in self.namespace.get('_code_block_vars', set()):
|
|
722
|
+
return
|
|
723
|
+
|
|
724
|
+
# Print compact variable info
|
|
725
|
+
if isinstance(value, (list, dict)):
|
|
726
|
+
preview = str(value)[:100]
|
|
727
|
+
print(f'→ Variable: {var_name} ({type(value).__name__}, len={len(value)}, preview={preview}...)')
|
|
728
|
+
elif isinstance(value, str) and len(value) > 50:
|
|
729
|
+
print(f'→ Variable: {var_name} (str, {len(value)} chars, preview={value[:50]}...)')
|
|
730
|
+
elif callable(value):
|
|
731
|
+
print(f'→ Variable: {var_name} (function)')
|
|
732
|
+
else:
|
|
733
|
+
print(f'→ Variable: {var_name} ({type(value).__name__}, value={repr(value)[:50]})')
|
|
734
|
+
|
|
735
|
+
async def _execute_code(self, code: str) -> tuple[str | None, str | None, str | None]:
    """
    Execute Python code in the namespace.

    The code runs like a Jupyter cell: top-level ``await`` is supported by
    wrapping the source in an async function, stdout is captured, and any
    variables the code creates persist in ``self.namespace`` for later cells.

    Args:
        code: The Python code to execute

    Returns:
        Tuple of (output, error, browser_state). ``browser_state`` is always
        None here — it is fetched separately right before the next LLM call.
    """
    # Create new cell to track this execution in the session history
    cell = self.session.add_cell(source=code)
    cell.status = ExecutionStatus.RUNNING
    cell.execution_count = self.session.increment_execution_count()

    output = None
    error = None
    browser_state = None  # kept for interface symmetry; always None (see docstring)
    has_await = False  # set during parsing; read again in the error handler below

    try:
        # Capture stdout produced by the executed code
        import ast
        import io
        import sys

        old_stdout = sys.stdout
        sys.stdout = io.StringIO()

        try:
            # Make asyncio available to user code if not already there
            if 'asyncio' not in self.namespace:
                self.namespace['asyncio'] = asyncio

            # Store the current code in namespace for done() validation
            self.namespace['_current_cell_code'] = code
            # Store consecutive errors count for done() validation
            self.namespace['_consecutive_errors'] = self._consecutive_errors

            # Check if code contains await expressions - if so, wrap in async function
            # This mimics how Jupyter/IPython handles top-level await
            try:
                tree = ast.parse(code, mode='exec')
                has_await = any(isinstance(node, (ast.Await, ast.AsyncWith, ast.AsyncFor)) for node in ast.walk(tree))
            except SyntaxError:
                # If parse fails, let exec surface the error below
                has_await = False

            if has_await:
                # When code has await, we must wrap in an async function.
                # To make variables persist naturally (like Jupyter, without 'global'):
                # 1. Extract all assigned variable names from the code
                # 2. Inject 'global' declarations for variables that already exist in namespace
                # 3. Extract user's explicit global declarations and pre-define those vars
                # 4. Return locals() so we can update namespace with new variables

                # Find all variable names being assigned + user's explicit globals
                try:
                    assigned_names = set()
                    user_global_names = set()

                    for node in ast.walk(tree):
                        if isinstance(node, ast.Assign):
                            for target in node.targets:
                                if isinstance(target, ast.Name):
                                    assigned_names.add(target.id)
                        elif isinstance(node, ast.AugAssign) and isinstance(node.target, ast.Name):
                            assigned_names.add(node.target.id)
                        elif isinstance(node, (ast.AnnAssign, ast.NamedExpr)):
                            if hasattr(node, 'target') and isinstance(node.target, ast.Name):
                                assigned_names.add(node.target.id)
                        elif isinstance(node, ast.Global):
                            # Track user's explicit global declarations
                            user_global_names.update(node.names)

                    # Pre-define any user-declared globals that don't exist yet.
                    # This prevents NameError when user writes "global foo" before "foo = ..."
                    for name in user_global_names:
                        if name not in self.namespace:
                            self.namespace[name] = None

                    # Filter to only existing namespace vars (like Jupyter does).
                    # Include both: assigned vars that exist + user's explicit globals
                    existing_vars = {name for name in (assigned_names | user_global_names) if name in self.namespace}
                except Exception:
                    # AST analysis is best-effort; fall back to injecting no globals
                    existing_vars = set()

                # Build global declaration if needed
                global_decl = ''
                has_global_decl = False
                if existing_vars:
                    vars_str = ', '.join(sorted(existing_vars))
                    global_decl = f'\tglobal {vars_str}\n'
                    has_global_decl = True

                indented_code = '\n'.join(('\t' + line) if line.strip() else line for line in code.split('\n'))
                wrapped_code = (
                    'async def __code_exec__():\n'
                    f'{global_decl}{indented_code}\n'
                    '\t# Return locals so we can update the namespace\n'
                    '\treturn locals()\n'
                    '\n'
                    '__code_exec_coro__ = __code_exec__()\n'
                )
                # Store whether we added a global declaration (needed for error line mapping)
                self.namespace['_has_global_decl'] = has_global_decl

                # Compile and execute wrapper at module level
                compiled_code = compile(wrapped_code, '<code>', 'exec')
                exec(compiled_code, self.namespace, self.namespace)

                # Get and await the coroutine, then update namespace with new/modified variables
                coro = self.namespace.get('__code_exec_coro__')
                if coro:
                    result_locals = await coro
                    # Update namespace with all variables from the function's locals.
                    # This makes variable assignments persist across cells
                    if result_locals:
                        for key, value in result_locals.items():
                            if not key.startswith('_'):
                                self.namespace[key] = value
                                # Variable info is tracked in "Available" section, no need for verbose inline output

                # Clean up temporary variables
                self.namespace.pop('__code_exec_coro__', None)
                self.namespace.pop('__code_exec__', None)
            else:
                # No await - execute directly at module level for natural variable scoping.
                # This means x = x + 10 will work without needing 'global x'
                compiled_code = compile(code, '<code>', 'exec')
                exec(compiled_code, self.namespace, self.namespace)

            # Collect captured stdout
            output_value = sys.stdout.getvalue()
            if output_value:
                output = output_value

        finally:
            sys.stdout = old_stdout

        # Brief pause so the page can stabilize after code execution
        # (fixed comment: the old comment claimed 2 seconds; the sleep is 0.5s)
        await asyncio.sleep(0.5)

        # Note: Browser state is now fetched right before LLM call instead of after each execution.
        # This reduces unnecessary state fetches for operations that don't affect the browser

        cell.status = ExecutionStatus.SUCCESS
        cell.output = output
        cell.browser_state = None  # Will be captured in next iteration before LLM call

    except Exception as e:
        # Handle EvaluateError specially - JavaScript execution failed
        if isinstance(e, EvaluateError):
            error = str(e)
            cell.status = ExecutionStatus.ERROR
            cell.error = error
            logger.error(f'Code execution error: {error}')

            await asyncio.sleep(1)

            # Browser state will be fetched before next LLM call.
            # Return immediately - do not continue executing code
            return output, error, None

        # Handle NameError specially (e.g. referencing an undefined code-block variable).
        # BUG FIX: previously this branch set cell.error and returned the still-None
        # `error` variable, so NameErrors were silently dropped from the result.
        if isinstance(e, NameError):
            error = f'{type(e).__name__}: {e}'
            cell.status = ExecutionStatus.ERROR
            cell.error = error
            logger.error(f'Code execution error: {error}')

            # Browser state will be fetched before next LLM call
            await asyncio.sleep(0.5)
            return output, error, None

        # For syntax errors and common parsing errors, show just the error message
        # without the full traceback to keep output clean
        if isinstance(e, SyntaxError):
            error_msg = e.msg if e.msg else str(e)
            error = f'{type(e).__name__}: {error_msg}'

            # Detect common f-string issues with JSON/JavaScript code
            if 'unterminated' in error_msg.lower() and 'string' in error_msg.lower() and code:
                # Check if code contains f-strings with potential JSON/JS content
                has_fstring = bool(re.search(r'\bf["\']', code))
                has_json_pattern = bool(re.search(r'json\.dumps|"[^"]*\{[^"]*\}[^"]*"|\'[^\']*\{[^\']*\}[^\']*\'', code))
                has_js_pattern = bool(re.search(r'evaluate\(|await evaluate', code))

                if has_fstring and (has_json_pattern or has_js_pattern):
                    error += (
                        '\n\n💡 TIP: Detected f-string with JSON/JavaScript code containing {}.\n'
                        '   Use separate ```js or ```markdown blocks instead of f-strings to avoid escaping issues.\n'
                        '   If your code block needs ``` inside it, wrap with 4+ backticks: ````markdown code`\n'
                    )

            # Detect and provide helpful hints for common string literal errors
            if 'unterminated' in error_msg.lower() and 'string' in error_msg.lower():
                # Detect what type of string literal is unterminated
                is_triple = 'triple-quoted' in error_msg.lower()
                msg_lower = error_msg.lower()

                # Detect prefix type from error message
                if 'f-string' in msg_lower and 'raw' in msg_lower:
                    prefix = 'rf or fr'
                    desc = 'raw f-string'
                elif 'f-string' in msg_lower:
                    prefix = 'f'
                    desc = 'f-string'
                elif 'raw' in msg_lower and 'bytes' in msg_lower:
                    prefix = 'rb or br'
                    desc = 'raw bytes'
                elif 'raw' in msg_lower:
                    prefix = 'r'
                    desc = 'raw string'
                elif 'bytes' in msg_lower:
                    prefix = 'b'
                    desc = 'bytes'
                else:
                    prefix = ''
                    desc = 'string'

                # Build hint based on triple-quoted vs single/double quoted.
                # BUG FIX: the closing triple quote in these hints was rendered
                # with only two quote characters ("""..."") — now three.
                if is_triple:
                    if prefix:
                        hint = f"Hint: Unterminated {prefix}'''...''' or {prefix}\"\"\"...\"\"\" ({desc}). Check for missing closing quotes or unescaped quotes inside."
                    else:
                        hint = "Hint: Unterminated '''...''' or \"\"\"...\"\"\" detected. Check for missing closing quotes or unescaped quotes inside."
                    hint += '\n   If you need ``` inside your string, use a ````markdown varname` code block with 4+ backticks instead.'
                else:
                    if prefix:
                        hint = f'Hint: Unterminated {prefix}\'...\' or {prefix}"..." ({desc}). Check for missing closing quote or unescaped quotes inside.'
                    else:
                        hint = 'Hint: Unterminated \'...\' or "..." detected. Check for missing closing quote or unescaped quotes inside the string.'
                error += f'\n{hint}'

            # Show the problematic line from the code
            if e.text:
                error += f'\n{e.text}'
            elif e.lineno and code:
                # If e.text is empty, extract the line from the code
                lines = code.split('\n')
                if 0 < e.lineno <= len(lines):
                    error += f'\n{lines[e.lineno - 1]}'

        else:
            # For other errors, try to extract useful information
            error_str = str(e)
            error = f'{type(e).__name__}: {error_str}' if error_str else f'{type(e).__name__} occurred'

            # For RuntimeError or other exceptions, walk the traceback to find
            # the frame executing user code ('<code>' filename) so we can show
            # which line in the user's code actually failed.
            if hasattr(e, '__traceback__'):
                tb = e.__traceback__
                user_code_lineno = None
                while tb is not None:
                    frame = tb.tb_frame
                    if frame.f_code.co_filename == '<code>':
                        # Found the frame executing user code
                        user_code_lineno = tb.tb_lineno
                        break
                    tb = tb.tb_next

                # BUG FIX: user_code_lineno was computed but never used.
                # Map it back to the user's source and append it to the error.
                if user_code_lineno is not None and code:
                    if has_await:
                        # The async wrapper shifts line numbers: one line for the
                        # def header, plus one more when a global decl was injected.
                        user_code_lineno -= 2 if self.namespace.get('_has_global_decl') else 1
                    lines = code.split('\n')
                    if 0 < user_code_lineno <= len(lines):
                        error += f'\n  at line {user_code_lineno}: {lines[user_code_lineno - 1].strip()}'

        cell.status = ExecutionStatus.ERROR
        cell.error = error
        logger.error(f'Code execution error: {error}')

        await asyncio.sleep(1)

        # Browser state will be fetched before next LLM call

    return output, error, None
|
|
1013
|
+
|
|
1014
|
+
async def _get_browser_state(self) -> tuple[str, str | None]:
|
|
1015
|
+
"""Get the current browser state as text with ultra-minimal DOM structure for code agents.
|
|
1016
|
+
|
|
1017
|
+
Returns:
|
|
1018
|
+
Tuple of (browser_state_text, screenshot_base64)
|
|
1019
|
+
"""
|
|
1020
|
+
if not self.browser_session or not self.dom_service:
|
|
1021
|
+
return 'Browser state not available', None
|
|
1022
|
+
|
|
1023
|
+
try:
|
|
1024
|
+
# Get full browser state including screenshot if use_vision is enabled
|
|
1025
|
+
include_screenshot = True
|
|
1026
|
+
state = await self.browser_session.get_browser_state_summary(include_screenshot=include_screenshot)
|
|
1027
|
+
|
|
1028
|
+
# Format browser state with namespace context
|
|
1029
|
+
browser_state_text = await format_browser_state_for_llm(
|
|
1030
|
+
state=state, namespace=self.namespace, browser_session=self.browser_session
|
|
1031
|
+
)
|
|
1032
|
+
|
|
1033
|
+
screenshot = state.screenshot if include_screenshot else None
|
|
1034
|
+
return browser_state_text, screenshot
|
|
1035
|
+
|
|
1036
|
+
except Exception as e:
|
|
1037
|
+
logger.error(f'Failed to get browser state: {e}')
|
|
1038
|
+
return f'Error getting browser state: {e}', None
|
|
1039
|
+
|
|
1040
|
+
def _format_execution_result(self, code: str, output: str | None, error: str | None, current_step: int | None = None) -> str:
|
|
1041
|
+
"""Format the execution result for the LLM (without browser state)."""
|
|
1042
|
+
result = []
|
|
1043
|
+
|
|
1044
|
+
# Add step progress header if step number provided
|
|
1045
|
+
if current_step is not None:
|
|
1046
|
+
progress_header = f'Step {current_step}/{self.max_steps} executed'
|
|
1047
|
+
# Add consecutive failure tracking if there are errors
|
|
1048
|
+
if error and self._consecutive_errors > 0:
|
|
1049
|
+
progress_header += f' | Consecutive failures: {self._consecutive_errors}/{self.max_failures}'
|
|
1050
|
+
result.append(progress_header)
|
|
1051
|
+
|
|
1052
|
+
if error:
|
|
1053
|
+
result.append(f'Error: {error}')
|
|
1054
|
+
|
|
1055
|
+
if output:
|
|
1056
|
+
# Truncate output if too long
|
|
1057
|
+
if len(output) > 10000:
|
|
1058
|
+
output = output[:9950] + '\n[Truncated after 10000 characters]'
|
|
1059
|
+
result.append(f'Output: {output}')
|
|
1060
|
+
if len(result) == 0:
|
|
1061
|
+
result.append('Executed')
|
|
1062
|
+
return '\n'.join(result)
|
|
1063
|
+
|
|
1064
|
+
def _is_task_done(self) -> bool:
|
|
1065
|
+
"""Check if the task is marked as done in the namespace."""
|
|
1066
|
+
# Check if 'done' was called by looking for a special marker in namespace
|
|
1067
|
+
return self.namespace.get('_task_done', False)
|
|
1068
|
+
|
|
1069
|
+
async def _capture_screenshot(self, step_number: int) -> str | None:
|
|
1070
|
+
"""Capture and store screenshot for eval tracking."""
|
|
1071
|
+
if not self.browser_session:
|
|
1072
|
+
return None
|
|
1073
|
+
|
|
1074
|
+
try:
|
|
1075
|
+
# Get browser state summary which includes screenshot
|
|
1076
|
+
state = await self.browser_session.get_browser_state_summary(include_screenshot=True)
|
|
1077
|
+
if state and state.screenshot:
|
|
1078
|
+
# Store screenshot using screenshot service
|
|
1079
|
+
screenshot_path = await self.screenshot_service.store_screenshot(state.screenshot, step_number)
|
|
1080
|
+
return str(screenshot_path) if screenshot_path else None
|
|
1081
|
+
except Exception as e:
|
|
1082
|
+
logger.warning(f'Failed to capture screenshot for step {step_number}: {e}')
|
|
1083
|
+
return None
|
|
1084
|
+
|
|
1085
|
+
async def _add_step_to_complete_history(
    self,
    model_output_code: str,
    full_llm_response: str,
    output: str | None,
    error: str | None,
    screenshot_path: str | None,
) -> None:
    """Add a step to complete_history using type-safe models.

    Args:
        model_output_code: The code extracted from the LLM response for this step.
        full_llm_response: The raw LLM completion for this step.
        output: Captured stdout from executing the code, if any.
        error: Error message from executing the code, if any.
        screenshot_path: Path of the stored screenshot for this step, if any.
    """
    # Get current browser URL and title for state
    url: str | None = None
    title: str | None = None
    if self.browser_session:
        try:
            url = await self.browser_session.get_current_page_url()
            # Get title from browser via a CDP Runtime.evaluate call
            cdp_session = await self.browser_session.get_or_create_cdp_session()
            result = await cdp_session.cdp_client.send.Runtime.evaluate(
                params={'expression': 'document.title', 'returnByValue': True},
                session_id=cdp_session.session_id,
            )
            title = result.get('result', {}).get('value')
        except Exception as e:
            # URL/title are informational only; never fail the step over them
            logger.debug(f'Failed to get browser URL/title for history: {e}')

    # Check if this is a done result
    is_done = self._is_task_done()

    # Get self-reported success from done() call if task is done
    # (only a genuine bool in the namespace counts; anything else -> None)
    self_reported_success: bool | None = None
    if is_done:
        task_success = self.namespace.get('_task_success')
        self_reported_success = task_success if isinstance(task_success, bool) else None

    # Create result entry using typed model
    result_entry = CodeAgentResult(
        extracted_content=output if output else None,
        error=error if error else None,
        is_done=is_done,
        success=self_reported_success,
    )

    # Create state entry using typed model
    state_entry = CodeAgentState(url=url, title=title, screenshot_path=screenshot_path)

    # Create metadata entry using typed model
    step_end_time = datetime.datetime.now().timestamp()
    metadata_entry = CodeAgentStepMetadata(
        input_tokens=self._last_llm_usage.prompt_tokens if self._last_llm_usage else None,
        output_tokens=self._last_llm_usage.completion_tokens if self._last_llm_usage else None,
        step_start_time=self._step_start_time,
        step_end_time=step_end_time,
    )

    # Create model output entry using typed model (if there's code to track)
    model_output_entry: CodeAgentModelOutput | None = None
    if model_output_code or full_llm_response:
        model_output_entry = CodeAgentModelOutput(
            model_output=model_output_code if model_output_code else '',
            full_response=full_llm_response if full_llm_response else '',
        )

    # Create history entry using typed model
    history_entry = CodeAgentHistory(
        model_output=model_output_entry,
        result=[result_entry],
        state=state_entry,
        metadata=metadata_entry,
        screenshot_path=screenshot_path,  # Keep for backward compatibility
    )

    self.complete_history.append(history_entry)
|
|
1157
|
+
|
|
1158
|
+
def _log_agent_event(self, max_steps: int, agent_run_error: str | None = None) -> None:
    """Send the agent event for this run to telemetry.

    Flattens the per-step history into the telemetry fields (code cells as
    pseudo-actions, ordered unique URLs, collected errors) and reports token
    usage plus the self-reported task outcome.
    """
    from urllib.parse import urlparse

    token_summary = self.token_cost_service.get_usage_tokens_for_model(self.llm.model)

    # CodeAgent has no action history like Agent does; one pass over the
    # executed code cells gathers everything telemetry needs.
    action_history_data: list[list[dict[str, Any]] | None] = []
    urls_visited: list[str] = []
    errors: list[str] = []
    for step in self.complete_history:
        # Represent each code cell as a single-action entry when available
        if step.model_output and step.model_output.full_response:
            action_history_data.append([{'llm_response': step.model_output.full_response}])
        else:
            action_history_data.append(None)
        # Record each URL once, preserving first-visit order
        if step.state.url and step.state.url not in urls_visited:
            urls_visited.append(step.state.url)
        errors.extend(result.error for result in step.result if result.error)

    # The final answer only counts when done() produced a string result
    final_result: Any = self.namespace.get('_task_result')
    final_result_str: str | None = final_result if isinstance(final_result, str) else None

    # Self-reported success: a bool from done(), False when done without a
    # bool marker, None while the task never completed.
    is_done = self._is_task_done()
    task_success: Any = self.namespace.get('_task_success')
    self_reported_success: bool | None = task_success if isinstance(task_success, bool) else (False if is_done else None)

    # Only the hostname of the CDP endpoint is reported
    cdp_host = (
        urlparse(self.browser_session.cdp_url).hostname
        if self.browser_session and self.browser_session.cdp_url
        else None
    )

    self.telemetry.capture(
        AgentTelemetryEvent(
            task=self.task,
            model=self.llm.model,
            model_provider=self.llm.provider,
            max_steps=max_steps,
            max_actions_per_step=1,  # CodeAgent executes one code cell per step
            use_vision=self.use_vision,
            version=self.version,
            source=self.source,
            cdp_url=cdp_host,
            agent_type='code',  # CodeAgent identifier
            action_errors=errors,
            action_history=action_history_data,
            urls_visited=urls_visited,
            steps=len(self.complete_history),
            total_input_tokens=token_summary.prompt_tokens,
            total_output_tokens=token_summary.completion_tokens,
            prompt_cached_tokens=token_summary.prompt_cached_tokens,
            total_tokens=token_summary.total_tokens,
            total_duration_seconds=sum(step.metadata.duration_seconds for step in self.complete_history if step.metadata),
            success=self_reported_success,
            final_result_response=final_result_str,
            error_message=agent_run_error,
        )
    )
|
|
1226
|
+
|
|
1227
|
+
def screenshot_paths(self, n_last: int | None = None) -> list[str | None]:
|
|
1228
|
+
"""
|
|
1229
|
+
Get screenshot paths from complete_history for eval system.
|
|
1230
|
+
|
|
1231
|
+
Args:
|
|
1232
|
+
n_last: Optional number of last screenshots to return
|
|
1233
|
+
|
|
1234
|
+
Returns:
|
|
1235
|
+
List of screenshot file paths (or None for missing screenshots)
|
|
1236
|
+
"""
|
|
1237
|
+
paths = [step.screenshot_path for step in self.complete_history]
|
|
1238
|
+
|
|
1239
|
+
if n_last is not None:
|
|
1240
|
+
return paths[-n_last:] if len(paths) > n_last else paths
|
|
1241
|
+
|
|
1242
|
+
return paths
|
|
1243
|
+
|
|
1244
|
+
@property
def message_manager(self) -> Any:
    """
    Compatibility shim for the eval system.

    Exposes an object whose only attribute, last_input_messages, mirrors
    the message history this agent last sent to the LLM.
    """

    class MockMessageManager:
        def __init__(self, messages: list[BaseMessage]) -> None:
            # The eval system only reads this attribute
            self.last_input_messages = messages

    return MockMessageManager(self._llm_messages)
|
|
1257
|
+
|
|
1258
|
+
@property
def history(self) -> Any:
	"""
	Compatibility property for eval system.
	Returns a mock AgentHistoryList object with history attribute containing complete_history.
	This is what the eval system expects when it does: agent_history = agent.history
	"""

	class DictToObject:
		"""Convert dict to object with attribute access for eval compatibility."""

		def __init__(self, data: dict[str, Any]) -> None:
			# Recursively wrap nested dicts (and dicts found inside lists) so the
			# eval system can use dotted attribute access instead of dict indexing.
			for key, value in data.items():
				if isinstance(value, dict):
					setattr(self, key, DictToObject(value))
				elif isinstance(value, list):
					setattr(self, key, [DictToObject(item) if isinstance(item, dict) else item for item in value])
				else:
					setattr(self, key, value)

		def __getattr__(self, name: str) -> None:
			"""Provide safe attribute access with defaults for missing attributes."""
			# Return None for missing attributes instead of raising AttributeError
			# This handles cases where eval system checks attributes that CodeAgent doesn't set
			return None

		def model_dump(self) -> dict[str, Any]:
			"""Support model_dump() calls from eval system."""
			# Inverse of __init__: recursively unwrap DictToObject values (including
			# those nested in lists) back into plain dicts.
			result = {}
			for key, value in self.__dict__.items():
				if isinstance(value, DictToObject):
					result[key] = value.model_dump()
				elif isinstance(value, list):
					result[key] = [item.model_dump() if isinstance(item, DictToObject) else item for item in value]
				else:
					result[key] = value
			return result

		def get_screenshot(self) -> str | None:
			"""Support get_screenshot() calls for state objects."""
			# Load screenshot from disk and return as base64 string (matching BrowserStateHistory implementation)
			# NOTE(review): since __getattr__ above returns None for any missing
			# attribute, hasattr() here is always True; the `not self.screenshot_path`
			# test is what actually filters out unset paths.
			if not hasattr(self, 'screenshot_path') or not self.screenshot_path:
				return None

			# Imported locally so the mock class stays self-contained.
			import base64
			from pathlib import Path

			path_obj = Path(self.screenshot_path)
			if not path_obj.exists():
				return None

			try:
				with open(path_obj, 'rb') as f:
					screenshot_data = f.read()
				return base64.b64encode(screenshot_data).decode('utf-8')
			except Exception:
				# Best-effort: an unreadable screenshot file is treated as missing.
				return None

	class MockAgentHistoryList:
		# Minimal stand-in for AgentHistoryList: exposes .history and .usage only.

		def __init__(self, complete_history: list[CodeAgentHistory], usage_summary: UsageSummary | None) -> None:
			# Convert each CodeAgentHistory to dict, then to object with attribute access
			self.history = [DictToObject(item.model_dump()) for item in complete_history]
			# Use the provided usage summary
			self.usage = usage_summary

	return MockAgentHistoryList(self.complete_history, self.usage_summary)
|
1325
|
+
async def close(self) -> None:
	"""Close the browser session, honoring the profile's keep_alive flag."""
	if not self.browser_session:
		return
	# A session configured with keep_alive must survive agent teardown.
	if self.browser_session.browser_profile.keep_alive:
		logger.debug('Browser keep_alive is True, not closing browser session')
	else:
		await self.browser_session.kill()
|
1334
|
+
async def __aenter__(self) -> 'CodeAgent':
	"""Enter the async context manager, yielding the agent itself."""
	return self
|
1338
|
+
async def __aexit__(self, exc_type: type[BaseException] | None, exc_val: BaseException | None, exc_tb: Any) -> None:
|
|
1339
|
+
"""Async context manager exit."""
|
|
1340
|
+
await self.close()
|