optexity-browser-use 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. browser_use/__init__.py +157 -0
  2. browser_use/actor/__init__.py +11 -0
  3. browser_use/actor/element.py +1175 -0
  4. browser_use/actor/mouse.py +134 -0
  5. browser_use/actor/page.py +561 -0
  6. browser_use/actor/playground/flights.py +41 -0
  7. browser_use/actor/playground/mixed_automation.py +54 -0
  8. browser_use/actor/playground/playground.py +236 -0
  9. browser_use/actor/utils.py +176 -0
  10. browser_use/agent/cloud_events.py +282 -0
  11. browser_use/agent/gif.py +424 -0
  12. browser_use/agent/judge.py +170 -0
  13. browser_use/agent/message_manager/service.py +473 -0
  14. browser_use/agent/message_manager/utils.py +52 -0
  15. browser_use/agent/message_manager/views.py +98 -0
  16. browser_use/agent/prompts.py +413 -0
  17. browser_use/agent/service.py +2316 -0
  18. browser_use/agent/system_prompt.md +185 -0
  19. browser_use/agent/system_prompt_flash.md +10 -0
  20. browser_use/agent/system_prompt_no_thinking.md +183 -0
  21. browser_use/agent/views.py +743 -0
  22. browser_use/browser/__init__.py +41 -0
  23. browser_use/browser/cloud/cloud.py +203 -0
  24. browser_use/browser/cloud/views.py +89 -0
  25. browser_use/browser/events.py +578 -0
  26. browser_use/browser/profile.py +1158 -0
  27. browser_use/browser/python_highlights.py +548 -0
  28. browser_use/browser/session.py +3225 -0
  29. browser_use/browser/session_manager.py +399 -0
  30. browser_use/browser/video_recorder.py +162 -0
  31. browser_use/browser/views.py +200 -0
  32. browser_use/browser/watchdog_base.py +260 -0
  33. browser_use/browser/watchdogs/__init__.py +0 -0
  34. browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
  35. browser_use/browser/watchdogs/crash_watchdog.py +335 -0
  36. browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
  37. browser_use/browser/watchdogs/dom_watchdog.py +817 -0
  38. browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
  39. browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
  40. browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
  41. browser_use/browser/watchdogs/popups_watchdog.py +143 -0
  42. browser_use/browser/watchdogs/recording_watchdog.py +126 -0
  43. browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
  44. browser_use/browser/watchdogs/security_watchdog.py +280 -0
  45. browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
  46. browser_use/cli.py +2359 -0
  47. browser_use/code_use/__init__.py +16 -0
  48. browser_use/code_use/formatting.py +192 -0
  49. browser_use/code_use/namespace.py +665 -0
  50. browser_use/code_use/notebook_export.py +276 -0
  51. browser_use/code_use/service.py +1340 -0
  52. browser_use/code_use/system_prompt.md +574 -0
  53. browser_use/code_use/utils.py +150 -0
  54. browser_use/code_use/views.py +171 -0
  55. browser_use/config.py +505 -0
  56. browser_use/controller/__init__.py +3 -0
  57. browser_use/dom/enhanced_snapshot.py +161 -0
  58. browser_use/dom/markdown_extractor.py +169 -0
  59. browser_use/dom/playground/extraction.py +312 -0
  60. browser_use/dom/playground/multi_act.py +32 -0
  61. browser_use/dom/serializer/clickable_elements.py +200 -0
  62. browser_use/dom/serializer/code_use_serializer.py +287 -0
  63. browser_use/dom/serializer/eval_serializer.py +478 -0
  64. browser_use/dom/serializer/html_serializer.py +212 -0
  65. browser_use/dom/serializer/paint_order.py +197 -0
  66. browser_use/dom/serializer/serializer.py +1170 -0
  67. browser_use/dom/service.py +825 -0
  68. browser_use/dom/utils.py +129 -0
  69. browser_use/dom/views.py +906 -0
  70. browser_use/exceptions.py +5 -0
  71. browser_use/filesystem/__init__.py +0 -0
  72. browser_use/filesystem/file_system.py +619 -0
  73. browser_use/init_cmd.py +376 -0
  74. browser_use/integrations/gmail/__init__.py +24 -0
  75. browser_use/integrations/gmail/actions.py +115 -0
  76. browser_use/integrations/gmail/service.py +225 -0
  77. browser_use/llm/__init__.py +155 -0
  78. browser_use/llm/anthropic/chat.py +242 -0
  79. browser_use/llm/anthropic/serializer.py +312 -0
  80. browser_use/llm/aws/__init__.py +36 -0
  81. browser_use/llm/aws/chat_anthropic.py +242 -0
  82. browser_use/llm/aws/chat_bedrock.py +289 -0
  83. browser_use/llm/aws/serializer.py +257 -0
  84. browser_use/llm/azure/chat.py +91 -0
  85. browser_use/llm/base.py +57 -0
  86. browser_use/llm/browser_use/__init__.py +3 -0
  87. browser_use/llm/browser_use/chat.py +201 -0
  88. browser_use/llm/cerebras/chat.py +193 -0
  89. browser_use/llm/cerebras/serializer.py +109 -0
  90. browser_use/llm/deepseek/chat.py +212 -0
  91. browser_use/llm/deepseek/serializer.py +109 -0
  92. browser_use/llm/exceptions.py +29 -0
  93. browser_use/llm/google/__init__.py +3 -0
  94. browser_use/llm/google/chat.py +542 -0
  95. browser_use/llm/google/serializer.py +120 -0
  96. browser_use/llm/groq/chat.py +229 -0
  97. browser_use/llm/groq/parser.py +158 -0
  98. browser_use/llm/groq/serializer.py +159 -0
  99. browser_use/llm/messages.py +238 -0
  100. browser_use/llm/models.py +271 -0
  101. browser_use/llm/oci_raw/__init__.py +10 -0
  102. browser_use/llm/oci_raw/chat.py +443 -0
  103. browser_use/llm/oci_raw/serializer.py +229 -0
  104. browser_use/llm/ollama/chat.py +97 -0
  105. browser_use/llm/ollama/serializer.py +143 -0
  106. browser_use/llm/openai/chat.py +264 -0
  107. browser_use/llm/openai/like.py +15 -0
  108. browser_use/llm/openai/serializer.py +165 -0
  109. browser_use/llm/openrouter/chat.py +211 -0
  110. browser_use/llm/openrouter/serializer.py +26 -0
  111. browser_use/llm/schema.py +176 -0
  112. browser_use/llm/views.py +48 -0
  113. browser_use/logging_config.py +330 -0
  114. browser_use/mcp/__init__.py +18 -0
  115. browser_use/mcp/__main__.py +12 -0
  116. browser_use/mcp/client.py +544 -0
  117. browser_use/mcp/controller.py +264 -0
  118. browser_use/mcp/server.py +1114 -0
  119. browser_use/observability.py +204 -0
  120. browser_use/py.typed +0 -0
  121. browser_use/sandbox/__init__.py +41 -0
  122. browser_use/sandbox/sandbox.py +637 -0
  123. browser_use/sandbox/views.py +132 -0
  124. browser_use/screenshots/__init__.py +1 -0
  125. browser_use/screenshots/service.py +52 -0
  126. browser_use/sync/__init__.py +6 -0
  127. browser_use/sync/auth.py +357 -0
  128. browser_use/sync/service.py +161 -0
  129. browser_use/telemetry/__init__.py +51 -0
  130. browser_use/telemetry/service.py +112 -0
  131. browser_use/telemetry/views.py +101 -0
  132. browser_use/tokens/__init__.py +0 -0
  133. browser_use/tokens/custom_pricing.py +24 -0
  134. browser_use/tokens/mappings.py +4 -0
  135. browser_use/tokens/service.py +580 -0
  136. browser_use/tokens/views.py +108 -0
  137. browser_use/tools/registry/service.py +572 -0
  138. browser_use/tools/registry/views.py +174 -0
  139. browser_use/tools/service.py +1675 -0
  140. browser_use/tools/utils.py +82 -0
  141. browser_use/tools/views.py +100 -0
  142. browser_use/utils.py +670 -0
  143. optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
  144. optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
  145. optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
  146. optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
  147. optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,282 @@
1
+ import base64
2
+ import os
3
+ from datetime import datetime, timezone
4
+ from pathlib import Path
5
+
6
+ import anyio
7
+ from bubus import BaseEvent
8
+ from pydantic import Field, field_validator
9
+ from uuid_extensions import uuid7str
10
+
11
# Size limits referenced by the event models in this module.
MAX_STRING_LENGTH = 100_000  # 100K chars ~ 25k tokens should be enough
MAX_URL_LENGTH = 100_000
MAX_TASK_LENGTH = 100_000
MAX_COMMENT_LENGTH = 2_000
MAX_FILE_CONTENT_SIZE = 50 * 1024**2  # 50MB
16
+
17
+
18
class UpdateAgentTaskEvent(BaseEvent):
    """Event carrying updated values for an existing agent task."""

    # Identification (required)
    id: str  # ID of the task being updated
    user_id: str = Field(max_length=255)  # used for authorization
    device_id: str | None = Field(default=None, max_length=255)  # device ID for auth lookup

    # Updatable values (all optional)
    stopped: bool | None = None
    paused: bool | None = None
    done_output: str | None = Field(default=None, max_length=MAX_STRING_LENGTH)
    finished_at: datetime | None = None
    agent_state: dict | None = None
    user_feedback_type: str | None = Field(default=None, max_length=10)  # UserFeedbackType enum value as string
    user_comment: str | None = Field(default=None, max_length=MAX_COMMENT_LENGTH)
    gif_url: str | None = Field(default=None, max_length=MAX_URL_LENGTH)

    @classmethod
    def from_agent(cls, agent) -> 'UpdateAgentTaskEvent':
        """Build an update event from the current state of ``agent``.

        ``user_id`` is left empty for the cloud handler to fill in; the
        feedback fields and ``gif_url`` are left as ``None`` (they are set
        later by the API/frontend or after GIF generation).

        Raises:
            ValueError: if ``agent`` has no ``_task_start_time`` attribute.
        """
        if not hasattr(agent, '_task_start_time'):
            raise ValueError('Agent must have _task_start_time attribute')

        history = agent.history
        final_output = history.final_result() if history else None

        # Device ID is only available when cloud sync and its auth client are configured.
        sync = getattr(agent, 'cloud_sync', None)
        device_id = sync.auth_client.device_id if sync and sync.auth_client else None

        state = agent.state
        state_snapshot = state.model_dump() if hasattr(state, 'model_dump') else {}

        # Only stamp a finish time once the history reports completion.
        finished = datetime.now(timezone.utc) if history and history.is_done() else None

        return cls(
            id=str(agent.task_id),
            user_id='',  # To be filled by cloud handler
            device_id=device_id,
            stopped=getattr(state, 'stopped', False),
            paused=getattr(state, 'paused', False),
            done_output=final_output,
            finished_at=finished,
            agent_state=state_snapshot,
            user_feedback_type=None,
            user_comment=None,
            gif_url=None,
        )
58
+
59
+
60
class CreateAgentOutputFileEvent(BaseEvent):
    """Event describing an output file produced for an agent task.

    The file body, when present, is carried inline as base64 text in
    ``file_content``.
    """

    # Model fields
    id: str = Field(default_factory=uuid7str)
    user_id: str = Field(max_length=255)
    device_id: str | None = Field(None, max_length=255)  # Device ID for auth lookup
    task_id: str
    file_name: str = Field(max_length=255)
    file_content: str | None = None  # Base64 encoded file content
    content_type: str | None = Field(None, max_length=100)  # MIME type for file uploads
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))

    @field_validator('file_content')
    @classmethod
    def validate_file_size(cls, v: str | None) -> str | None:
        """Validate base64 file content size.

        Returns the value with any data-URL prefix removed, or ``None``
        unchanged.

        Raises:
            ValueError: if the decoded size exceeds ``MAX_FILE_CONTENT_SIZE``.
        """
        if v is None:
            return v
        # Remove data URL prefix if present
        if ',' in v:
            v = v.split(',')[1]
        # Every 4 base64 characters decode to 3 bytes, minus one byte per
        # trailing '=' pad. Counting the padding matters: ignoring it
        # overestimates by up to 2 bytes, which would reject a file that
        # from_agent_and_file accepted because its byte size is under the limit.
        decoded_size = len(v) * 3 // 4 - v[-2:].count('=')
        if decoded_size > MAX_FILE_CONTENT_SIZE:
            raise ValueError(f'File content exceeds maximum size of {MAX_FILE_CONTENT_SIZE / 1024 / 1024}MB')
        return v

    @classmethod
    async def from_agent_and_file(cls, agent, output_path: str) -> 'CreateAgentOutputFileEvent':
        """Create a CreateAgentOutputFileEvent from a file path.

        The file is read and base64-encoded only when it is smaller than
        ``MAX_FILE_CONTENT_SIZE``; otherwise ``file_content`` is ``None``.
        ``content_type`` is always set to ``'image/gif'`` and ``user_id`` is
        left empty for the cloud handler to fill in.

        Raises:
            FileNotFoundError: if ``output_path`` does not exist.
        """
        file_path = Path(output_path)
        if not file_path.exists():
            raise FileNotFoundError(f'File not found: {output_path}')

        file_size = os.path.getsize(file_path)

        # Use the shared limit rather than a second hard-coded 50MB literal,
        # so this check cannot drift away from the field validator.
        encoded_content = None
        if file_size < MAX_FILE_CONTENT_SIZE:
            async with await anyio.open_file(file_path, 'rb') as f:
                raw_bytes = await f.read()
                encoded_content = base64.b64encode(raw_bytes).decode('utf-8')

        return cls(
            user_id='',  # To be filled by cloud handler
            device_id=agent.cloud_sync.auth_client.device_id
            if hasattr(agent, 'cloud_sync') and agent.cloud_sync and agent.cloud_sync.auth_client
            else None,
            task_id=str(agent.task_id),
            file_name=file_path.name,
            file_content=encoded_content,  # Base64 encoded
            content_type='image/gif',
        )
113
+
114
+
115
class CreateAgentStepEvent(BaseEvent):
    """Event recording a single step taken by an agent within a task."""

    # Model fields
    id: str = Field(default_factory=uuid7str)
    user_id: str = Field(max_length=255)  # Added for authorization checks
    device_id: str | None = Field(None, max_length=255)  # Device ID for auth lookup
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    agent_task_id: str
    step: int
    evaluation_previous_goal: str = Field(max_length=MAX_STRING_LENGTH)
    memory: str = Field(max_length=MAX_STRING_LENGTH)
    next_goal: str = Field(max_length=MAX_STRING_LENGTH)
    actions: list[dict]
    # NOTE(review): max_length counts characters, while the validator below
    # estimates decoded bytes — confirm which limit is intended to apply.
    screenshot_url: str | None = Field(None, max_length=MAX_FILE_CONTENT_SIZE)  # ~50MB for base64 images
    url: str = Field(default='', max_length=MAX_URL_LENGTH)

    @field_validator('screenshot_url')
    @classmethod
    def validate_screenshot_size(cls, v: str | None) -> str | None:
        """Validate screenshot URL or base64 content size.

        Values that are ``None`` or do not start with ``data:`` are returned
        untouched. For data URLs the part after the comma is size-checked.

        Raises:
            ValueError: if the estimated decoded size exceeds
                ``MAX_FILE_CONTENT_SIZE``.
        """
        if v is None or not v.startswith('data:'):
            return v
        # It's base64 data, check size
        if ',' in v:
            base64_part = v.split(',')[1]
            estimated_size = len(base64_part) * 3 / 4
            if estimated_size > MAX_FILE_CONTENT_SIZE:
                raise ValueError(f'Screenshot content exceeds maximum size of {MAX_FILE_CONTENT_SIZE / 1024 / 1024}MB')
        return v

    @classmethod
    def from_agent_step(
        cls, agent, model_output, result: list, actions_data: list[dict], browser_state_summary
    ) -> 'CreateAgentStepEvent':
        """Create a CreateAgentStepEvent from agent step data.

        Args:
            agent: Source of the task ID, step counter and (optional) device ID.
            model_output: Read for ``current_state`` when it has that attribute;
                the three goal/memory fields fall back to ``''`` otherwise.
            result: Accepted for signature compatibility; not read here.
            actions_data: Stored as-is in ``actions``.
            browser_state_summary: Supplies ``url`` and the optional
                ``screenshot``, which is wrapped in a JPEG data URL.
        """
        import logging

        logger = logging.getLogger(__name__)

        # Extract current state from model output
        current_state = getattr(model_output, 'current_state', None)

        # Capture screenshot as base64 data URL if available
        screenshot_url = None
        if browser_state_summary.screenshot:
            screenshot_url = f'data:image/jpeg;base64,{browser_state_summary.screenshot}'
            logger.debug(f'📸 Including screenshot in CreateAgentStepEvent, length: {len(browser_state_summary.screenshot)}')
        else:
            logger.debug('📸 No screenshot in browser_state_summary for CreateAgentStepEvent')

        return cls(
            user_id='',  # To be filled by cloud handler
            device_id=agent.cloud_sync.auth_client.device_id
            if hasattr(agent, 'cloud_sync') and agent.cloud_sync and agent.cloud_sync.auth_client
            else None,
            agent_task_id=str(agent.task_id),
            step=agent.state.n_steps,
            evaluation_previous_goal=current_state.evaluation_previous_goal if current_state else '',
            memory=current_state.memory if current_state else '',
            next_goal=current_state.next_goal if current_state else '',
            actions=actions_data,  # List of action dicts
            url=browser_state_summary.url,
            screenshot_url=screenshot_url,
        )
183
+
184
+
185
class CreateAgentTaskEvent(BaseEvent):
    """Event announcing a newly created agent task."""

    # Model fields
    id: str = Field(default_factory=uuid7str)
    user_id: str = Field(max_length=255)  # needed for authorization checks
    device_id: str | None = Field(default=None, max_length=255)  # device ID for auth lookup
    agent_session_id: str
    llm_model: str = Field(max_length=200)  # LLMModel enum value as string
    stopped: bool = False
    paused: bool = False
    task: str = Field(max_length=MAX_TASK_LENGTH)
    done_output: str | None = Field(default=None, max_length=MAX_STRING_LENGTH)
    scheduled_task_id: str | None = None
    started_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    finished_at: datetime | None = None
    agent_state: dict = Field(default_factory=dict)
    user_feedback_type: str | None = Field(default=None, max_length=10)  # UserFeedbackType enum value as string
    user_comment: str | None = Field(default=None, max_length=MAX_COMMENT_LENGTH)
    gif_url: str | None = Field(default=None, max_length=MAX_URL_LENGTH)

    @classmethod
    def from_agent(cls, agent) -> 'CreateAgentTaskEvent':
        """Build the creation event for the task ``agent`` is running.

        The start time is taken from ``agent._task_start_time`` (a POSIX
        timestamp, converted to an aware UTC datetime). ``user_id`` is left
        empty for the cloud handler to fill in.
        """
        # Device ID is only available when cloud sync and its auth client are configured.
        sync = getattr(agent, 'cloud_sync', None)
        device_id = sync.auth_client.device_id if sync and sync.auth_client else None

        state = agent.state
        state_snapshot = state.model_dump() if hasattr(state, 'model_dump') else {}

        payload = {
            'id': str(agent.task_id),
            'user_id': '',  # To be filled by cloud handler
            'device_id': device_id,
            'agent_session_id': str(agent.session_id),
            'task': agent.task,
            'llm_model': agent.llm.model_name,
            'agent_state': state_snapshot,
            'stopped': False,
            'paused': False,
            'done_output': None,
            'started_at': datetime.fromtimestamp(agent._task_start_time, tz=timezone.utc),
            'finished_at': None,
            'user_feedback_type': None,
            'user_comment': None,
            'gif_url': None,
        }
        return cls(**payload)
226
+
227
+
228
class CreateAgentSessionEvent(BaseEvent):
    """Event announcing a newly created agent session and its browser setup."""

    # Model fields
    id: str = Field(default_factory=uuid7str)
    user_id: str = Field(max_length=255)
    device_id: str | None = Field(default=None, max_length=255)  # device ID for auth lookup
    browser_session_id: str = Field(max_length=255)
    browser_session_live_url: str = Field(max_length=MAX_URL_LENGTH)
    browser_session_cdp_url: str = Field(max_length=MAX_URL_LENGTH)
    browser_session_stopped: bool = False
    browser_session_stopped_at: datetime | None = None
    is_source_api: bool | None = None
    browser_state: dict = Field(default_factory=dict)
    browser_session_data: dict | None = None

    @classmethod
    def from_agent(cls, agent) -> 'CreateAgentSessionEvent':
        """Build the creation event for the session ``agent`` belongs to.

        Browser settings come from ``agent.browser_profile`` when it is set;
        otherwise fixed defaults are used. ``user_id`` and both session URLs
        are left empty for the cloud handler to fill in.
        """
        # Device ID is only available when cloud sync and its auth client are configured.
        sync = getattr(agent, 'cloud_sync', None)
        device_id = sync.auth_client.device_id if sync and sync.auth_client else None

        profile = agent.browser_profile
        if profile:
            viewport = profile.viewport
            user_agent = profile.user_agent
            headless = profile.headless
            allowed_domains = profile.allowed_domains
        else:
            viewport = {'width': 1280, 'height': 720}
            user_agent = None
            headless = True
            allowed_domains = []

        browser_state = {
            'viewport': viewport,
            'user_agent': user_agent,
            'headless': headless,
            # The remaining entries are placeholders, updated during execution.
            'initial_url': None,
            'final_url': None,
            'total_pages_visited': 0,
            'session_duration_seconds': 0,
        }
        session_data = {
            'cookies': [],
            'secrets': {},
            # TODO: send secrets safely so tasks can be replayed on cloud seamlessly
            # 'secrets': dict(agent.sensitive_data) if agent.sensitive_data else {},
            'allowed_domains': allowed_domains,
        }

        return cls(
            id=str(agent.session_id),
            user_id='',  # To be filled by cloud handler
            device_id=device_id,
            browser_session_id=agent.browser_session.id,
            browser_session_live_url='',  # To be filled by cloud handler
            browser_session_cdp_url='',  # To be filled by cloud handler
            browser_state=browser_state,
            browser_session_data=session_data,
        )
271
+
272
+
273
class UpdateAgentSessionEvent(BaseEvent):
    """Event to update an existing agent session"""

    # Identification
    id: str  # ID of the session being updated
    user_id: str = Field(max_length=255)
    device_id: str | None = Field(default=None, max_length=255)

    # Updatable values
    browser_session_stopped: bool | None = None
    browser_session_stopped_at: datetime | None = None
    end_reason: str | None = Field(default=None, max_length=100)  # why the session ended
@@ -0,0 +1,424 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import io
5
+ import logging
6
+ import os
7
+ import platform
8
+ from typing import TYPE_CHECKING
9
+
10
+ from browser_use.agent.views import AgentHistoryList
11
+ from browser_use.browser.views import PLACEHOLDER_4PX_SCREENSHOT
12
+ from browser_use.config import CONFIG
13
+
14
+ if TYPE_CHECKING:
15
+ from PIL import Image, ImageFont
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def decode_unicode_escapes_to_utf8(text: str) -> str:
    """Decode literal unicode escape sequences embedded in ``text``.

    Needed so that non-ASCII languages (e.g. Chinese or Arabic) render
    correctly in the GIF overlay text. Strings without a ``\\u`` marker, and
    strings that cannot be round-tripped through latin1/unicode_escape, are
    returned unchanged.
    """
    if r'\u' in text:
        try:
            raw = text.encode('latin1')
            decoded = raw.decode('unicode_escape')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Best effort only: keep the original text when decoding fails.
            decoded = text
        return decoded

    # No escape sequences present, nothing to decode.
    return text
33
+
34
+
35
def create_history_gif(
    task: str,
    history: AgentHistoryList,
    #
    output_path: str = 'agent_history.gif',
    duration: int = 3000,
    show_goals: bool = True,
    show_task: bool = True,
    show_logo: bool = False,
    font_size: int = 40,
    title_font_size: int = 56,
    goal_font_size: int = 44,
    margin: int = 40,
    line_spacing: float = 1.5,
) -> None:
    """Create a GIF from the agent's history with overlaid task and goal text.

    Nothing is written (a warning is logged instead) when the history is
    empty, has no screenshots, or has only placeholder screenshots.

    Args:
        task: Task description; drawn on an intro frame when ``show_task`` is
            true and the text is non-empty.
        history: Agent history supplying the steps and their screenshots.
        output_path: Destination path of the GIF.
        duration: Passed through as the ``duration`` option when saving.
        show_goals: Overlay step number and next goal on steps that have a
            model output.
        show_task: Prepend the task intro frame.
        show_logo: Try to load ``./static/browser-use.png`` and draw it on
            every frame; a load failure is logged and otherwise ignored.
        font_size: Size of the regular font.
        title_font_size: Size of the title font.
        goal_font_size: Accepted for backward compatibility; currently has no
            effect because no font of this size is used for drawing.
        margin: Margin passed to the per-step overlay.
        line_spacing: Line spacing passed to the task intro frame.
    """
    if not history.history:
        logger.warning('No history to create GIF from')
        return

    from PIL import Image, ImageFont

    images = []

    # Get all screenshots from history (including None placeholders)
    screenshots = history.screenshots(return_none_if_not_screenshot=True)

    if not screenshots:
        logger.warning('No screenshots found in history')
        return

    # Bail out early unless at least one screenshot is real, i.e. neither
    # missing nor the exact 4px placeholder used for about:blank pages.
    if not any(shot and shot != PLACEHOLDER_4PX_SCREENSHOT for shot in screenshots):
        logger.warning('No valid screenshots found (all are placeholders or from new tab pages)')
        return

    # Try different font options in order of preference
    font_options = [
        'PingFang',
        'STHeiti Medium',
        'Microsoft YaHei',  # 微软雅黑
        'SimHei',  # 黑体
        'SimSun',  # 宋体
        'Noto Sans CJK SC',  # 思源黑体
        'WenQuanYi Micro Hei',  # 文泉驿微米黑
        'Helvetica',
        'Arial',
        'DejaVuSans',
        'Verdana',
    ]
    is_windows = platform.system() == 'Windows'
    for font_name in font_options:
        # Need to specify the abs font path on Windows
        font_file = os.path.join(CONFIG.WIN_FONT_DIR, font_name + '.ttf') if is_windows else font_name
        try:
            regular_font = ImageFont.truetype(font_file, font_size)
            title_font = ImageFont.truetype(font_file, title_font_size)
            break
        except OSError:
            continue
    else:
        # None of the preferred fonts could be loaded
        regular_font = ImageFont.load_default()
        title_font = ImageFont.load_default()

    # Load logo if requested
    logo = None
    if show_logo:
        try:
            logo = Image.open('./static/browser-use.png')
            # Scale the logo to a fixed height, keeping its aspect ratio
            logo_height = 150
            aspect_ratio = logo.width / logo.height
            logo_width = int(logo_height * aspect_ratio)
            logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
        except Exception as e:
            logger.warning(f'Could not load logo: {e}')

    # Create task frame if requested
    if show_task and task:
        # Find the first non-placeholder screenshot for the task frame
        first_real_screenshot = None
        for item in history.history:
            screenshot_b64 = item.state.get_screenshot()
            if screenshot_b64 and screenshot_b64 != PLACEHOLDER_4PX_SCREENSHOT:
                first_real_screenshot = screenshot_b64
                break

        if first_real_screenshot:
            task_frame = _create_task_frame(
                task,
                first_real_screenshot,
                title_font,  # type: ignore
                regular_font,  # type: ignore
                logo,
                line_spacing,
            )
            images.append(task_frame)
        else:
            logger.warning('No real screenshots found for task frame, skipping task frame')

    # Imported once here rather than on every loop iteration
    from browser_use.utils import is_new_tab_page

    # Process each history item with its corresponding screenshot
    for i, (item, screenshot) in enumerate(zip(history.history, screenshots), 1):
        if not screenshot:
            continue

        # Skip placeholder screenshots from about:blank pages
        if screenshot == PLACEHOLDER_4PX_SCREENSHOT:
            logger.debug(f'Skipping placeholder screenshot from about:blank page at step {i}')
            continue

        # Skip screenshots from new tab pages
        if is_new_tab_page(item.state.url):
            logger.debug(f'Skipping screenshot from new tab page ({item.state.url}) at step {i}')
            continue

        # Convert base64 screenshot to PIL Image
        img_data = base64.b64decode(screenshot)
        image = Image.open(io.BytesIO(img_data))

        if show_goals and item.model_output:
            image = _add_overlay_to_image(
                image=image,
                step_number=i,
                goal_text=item.model_output.current_state.next_goal,
                regular_font=regular_font,  # type: ignore
                title_font=title_font,  # type: ignore
                margin=margin,
                logo=logo,
            )

        images.append(image)

    if images:
        # Save the GIF
        images[0].save(
            output_path,
            save_all=True,
            append_images=images[1:],
            duration=duration,
            loop=0,
            optimize=False,
        )
        logger.info(f'Created GIF at {output_path}')
    else:
        logger.warning('No images found in history to create GIF')
210
+
211
+
212
def _create_task_frame(
    task: str,
    first_screenshot: str,
    title_font: ImageFont.FreeTypeFont,
    regular_font: ImageFont.FreeTypeFont,
    logo: Image.Image | None = None,
    line_spacing: float = 1.5,
) -> Image.Image:
    """Create initial frame showing the task.

    Args:
        task: Task text, drawn wrapped and centered in white on black.
        first_screenshot: Base64-encoded image; only its dimensions are used.
        title_font: Accepted for signature compatibility; not read here.
        regular_font: Font whose size (and file, when available) is the basis
            for the task text font.
        logo: Optional image pasted into the top right corner.
        line_spacing: Multiplier applied to the font size to get line height.

    Returns:
        A new RGB image the same size as the screenshot.
    """
    from PIL import Image, ImageDraw, ImageFont

    img_data = base64.b64decode(first_screenshot)
    template = Image.open(io.BytesIO(img_data))
    image = Image.new('RGB', template.size, (0, 0, 0))
    draw = ImageDraw.Draw(image)

    # Calculate vertical center of image
    center_y = image.height // 2

    margin = 140
    max_width = image.width - (2 * margin)

    # The caller may pass Pillow's built-in default font. Depending on the
    # Pillow version that can be a bitmap font that may not expose `.size`,
    # so fall back to a nominal size instead of raising AttributeError.
    fallback_size = 16
    regular_size = getattr(regular_font, 'size', fallback_size)

    # Start from regular + 16 and never go below regular - 10 (or 16pt)
    base_font_size = regular_size + 16
    min_font_size = max(regular_size - 10, 16)

    # Longer texts get smaller fonts: above 200 characters the size drops
    # linearly by 10pt per 200 characters, floored at min_font_size.
    text_length = len(task)
    if text_length > 200:
        font_size = max(base_font_size - int(10 * (text_length / 200)), min_font_size)
    else:
        font_size = base_font_size

    # Try to create a larger font, but fall back to regular font if it fails
    try:
        larger_font = ImageFont.truetype(regular_font.path, font_size)  # type: ignore
    except (OSError, AttributeError):
        # Fall back to regular font if .path is not available or font loading fails
        larger_font = regular_font

    # Generate wrapped text with the calculated font size
    wrapped_text = _wrap_text(task, larger_font, max_width)

    # Calculate line height with spacing
    line_height = getattr(larger_font, 'size', regular_size) * line_spacing

    # Split text into lines and draw with custom spacing
    lines = wrapped_text.split('\n')
    total_height = line_height * len(lines)

    # Start position for first line
    text_y = center_y - (total_height / 2) + 50  # Shifted down slightly

    for line in lines:
        # Get line width for centering
        line_bbox = draw.textbbox((0, 0), line, font=larger_font)
        text_x = (image.width - (line_bbox[2] - line_bbox[0])) // 2

        draw.text(
            (text_x, text_y),
            line,
            font=larger_font,
            fill=(255, 255, 255),
        )
        text_y += line_height

    # Add logo if provided (top right corner)
    if logo:
        logo_margin = 20
        logo_x = image.width - logo.width - logo_margin
        # Use the logo's own alpha channel as the paste mask when it has one
        image.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)

    return image
290
+
291
+
292
def _add_overlay_to_image(
    image: Image.Image,
    step_number: int,
    goal_text: str,
    regular_font: ImageFont.FreeTypeFont,
    title_font: ImageFont.FreeTypeFont,
    margin: int,
    logo: Image.Image | None = None,
    display_step: bool = True,
    text_color: tuple[int, int, int, int] = (255, 255, 255, 255),
    text_box_color: tuple[int, int, int, int] = (0, 0, 0, 255),
) -> Image.Image:
    """Add step number and goal overlay to an image.

    Args:
        image: Frame to decorate; it is converted, not modified in place.
        step_number: Number shown in the bottom-left badge.
        goal_text: Goal text, wrapped and drawn centered above the badge row.
        regular_font: Accepted for signature compatibility; not read here.
        title_font: Font used for both the step number and the goal text.
        margin: Distance of the badge from the image edges; also limits the
            goal text width.
        logo: Optional image drawn in the top right corner.
        display_step: Whether to draw the step-number badge. The goal text
            keeps the same position either way.
        text_color: RGBA fill for the text.
        text_box_color: RGBA fill for the rounded background boxes.

    Returns:
        A new RGB image with the overlay composited on top.
    """

    from PIL import Image, ImageDraw

    goal_text = decode_unicode_escapes_to_utf8(goal_text)
    image = image.convert('RGBA')
    txt_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(txt_layer)

    # The badge geometry is measured unconditionally because the goal box is
    # positioned relative to it. Measuring it only when display_step was true
    # left y_step/padding unbound and crashed for display_step=False.
    step_text = str(step_number)
    step_bbox = draw.textbbox((0, 0), step_text, font=title_font)
    step_width = step_bbox[2] - step_bbox[0]
    step_height = step_bbox[3] - step_bbox[1]

    # Position step number in bottom left
    x_step = margin + 10  # Slight additional offset from edge
    y_step = image.height - margin - step_height - 10  # Slight offset from bottom
    padding = 20

    if display_step:
        # Draw rounded rectangle background for step number
        step_bg_bbox = (
            x_step - padding,
            y_step - padding,
            x_step + step_width + padding,
            y_step + step_height + padding,
        )
        draw.rounded_rectangle(
            step_bg_bbox,
            radius=15,
            fill=text_box_color,
        )

        # Draw step number
        draw.text(
            (x_step, y_step),
            step_text,
            font=title_font,
            fill=text_color,
        )

    # Draw goal text (centered, bottom)
    max_width = image.width - (4 * margin)
    wrapped_goal = _wrap_text(goal_text, title_font, max_width)
    goal_bbox = draw.multiline_textbbox((0, 0), wrapped_goal, font=title_font)
    goal_width = goal_bbox[2] - goal_bbox[0]
    goal_height = goal_bbox[3] - goal_bbox[1]

    # Center goal text horizontally, place above step number
    x_goal = (image.width - goal_width) // 2
    y_goal = y_step - goal_height - padding * 4  # More space between step and goal

    # Draw rounded rectangle background for goal
    padding_goal = 25
    goal_bg_bbox = (
        x_goal - padding_goal,
        y_goal - padding_goal,
        x_goal + goal_width + padding_goal,
        y_goal + goal_height + padding_goal,
    )
    draw.rounded_rectangle(
        goal_bg_bbox,
        radius=15,
        fill=text_box_color,
    )

    # Draw goal text
    draw.multiline_text(
        (x_goal, y_goal),
        wrapped_goal,
        font=title_font,
        fill=text_color,
        align='center',
    )

    # Add logo if provided (top right corner)
    if logo:
        logo_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
        logo_margin = 20
        logo_x = image.width - logo.width - logo_margin
        logo_layer.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
        # Text layer goes on top of the logo layer
        txt_layer = Image.alpha_composite(logo_layer, txt_layer)

    # Composite and convert
    result = Image.alpha_composite(image, txt_layer)
    return result.convert('RGB')
390
+
391
+
392
def _wrap_text(text: str, font: ImageFont.FreeTypeFont, max_width: int) -> str:
    """
    Break text into lines that fit a pixel width.

    Unicode escape sequences in the text are decoded first. Words are split
    on whitespace and packed greedily; a single word wider than the limit is
    placed on a line of its own.

    Args:
        text: Text to wrap
        font: Font used to measure each candidate line
        max_width: Maximum width in pixels

    Returns:
        The wrapped text, lines joined with newlines
    """
    decoded = decode_unicode_escapes_to_utf8(text)
    wrapped: list[str] = []
    pending: list[str] = []

    for token in decoded.split():
        candidate = ' '.join([*pending, token])
        if font.getbbox(candidate)[2] <= max_width:
            # Still fits: keep growing the current line.
            pending.append(token)
            continue

        if pending:
            # Flush what fit so far and start a new line with this word.
            wrapped.append(' '.join(pending))
            pending = [token]
        else:
            # The word alone is too wide: emit it as its own line.
            wrapped.append(token)

    if pending:
        wrapped.append(' '.join(pending))

    return '\n'.join(wrapped)