optexity-browser-use 0.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. browser_use/__init__.py +157 -0
  2. browser_use/actor/__init__.py +11 -0
  3. browser_use/actor/element.py +1175 -0
  4. browser_use/actor/mouse.py +134 -0
  5. browser_use/actor/page.py +561 -0
  6. browser_use/actor/playground/flights.py +41 -0
  7. browser_use/actor/playground/mixed_automation.py +54 -0
  8. browser_use/actor/playground/playground.py +236 -0
  9. browser_use/actor/utils.py +176 -0
  10. browser_use/agent/cloud_events.py +282 -0
  11. browser_use/agent/gif.py +424 -0
  12. browser_use/agent/judge.py +170 -0
  13. browser_use/agent/message_manager/service.py +473 -0
  14. browser_use/agent/message_manager/utils.py +52 -0
  15. browser_use/agent/message_manager/views.py +98 -0
  16. browser_use/agent/prompts.py +413 -0
  17. browser_use/agent/service.py +2316 -0
  18. browser_use/agent/system_prompt.md +185 -0
  19. browser_use/agent/system_prompt_flash.md +10 -0
  20. browser_use/agent/system_prompt_no_thinking.md +183 -0
  21. browser_use/agent/views.py +743 -0
  22. browser_use/browser/__init__.py +41 -0
  23. browser_use/browser/cloud/cloud.py +203 -0
  24. browser_use/browser/cloud/views.py +89 -0
  25. browser_use/browser/events.py +578 -0
  26. browser_use/browser/profile.py +1158 -0
  27. browser_use/browser/python_highlights.py +548 -0
  28. browser_use/browser/session.py +3225 -0
  29. browser_use/browser/session_manager.py +399 -0
  30. browser_use/browser/video_recorder.py +162 -0
  31. browser_use/browser/views.py +200 -0
  32. browser_use/browser/watchdog_base.py +260 -0
  33. browser_use/browser/watchdogs/__init__.py +0 -0
  34. browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
  35. browser_use/browser/watchdogs/crash_watchdog.py +335 -0
  36. browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
  37. browser_use/browser/watchdogs/dom_watchdog.py +817 -0
  38. browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
  39. browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
  40. browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
  41. browser_use/browser/watchdogs/popups_watchdog.py +143 -0
  42. browser_use/browser/watchdogs/recording_watchdog.py +126 -0
  43. browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
  44. browser_use/browser/watchdogs/security_watchdog.py +280 -0
  45. browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
  46. browser_use/cli.py +2359 -0
  47. browser_use/code_use/__init__.py +16 -0
  48. browser_use/code_use/formatting.py +192 -0
  49. browser_use/code_use/namespace.py +665 -0
  50. browser_use/code_use/notebook_export.py +276 -0
  51. browser_use/code_use/service.py +1340 -0
  52. browser_use/code_use/system_prompt.md +574 -0
  53. browser_use/code_use/utils.py +150 -0
  54. browser_use/code_use/views.py +171 -0
  55. browser_use/config.py +505 -0
  56. browser_use/controller/__init__.py +3 -0
  57. browser_use/dom/enhanced_snapshot.py +161 -0
  58. browser_use/dom/markdown_extractor.py +169 -0
  59. browser_use/dom/playground/extraction.py +312 -0
  60. browser_use/dom/playground/multi_act.py +32 -0
  61. browser_use/dom/serializer/clickable_elements.py +200 -0
  62. browser_use/dom/serializer/code_use_serializer.py +287 -0
  63. browser_use/dom/serializer/eval_serializer.py +478 -0
  64. browser_use/dom/serializer/html_serializer.py +212 -0
  65. browser_use/dom/serializer/paint_order.py +197 -0
  66. browser_use/dom/serializer/serializer.py +1170 -0
  67. browser_use/dom/service.py +825 -0
  68. browser_use/dom/utils.py +129 -0
  69. browser_use/dom/views.py +906 -0
  70. browser_use/exceptions.py +5 -0
  71. browser_use/filesystem/__init__.py +0 -0
  72. browser_use/filesystem/file_system.py +619 -0
  73. browser_use/init_cmd.py +376 -0
  74. browser_use/integrations/gmail/__init__.py +24 -0
  75. browser_use/integrations/gmail/actions.py +115 -0
  76. browser_use/integrations/gmail/service.py +225 -0
  77. browser_use/llm/__init__.py +155 -0
  78. browser_use/llm/anthropic/chat.py +242 -0
  79. browser_use/llm/anthropic/serializer.py +312 -0
  80. browser_use/llm/aws/__init__.py +36 -0
  81. browser_use/llm/aws/chat_anthropic.py +242 -0
  82. browser_use/llm/aws/chat_bedrock.py +289 -0
  83. browser_use/llm/aws/serializer.py +257 -0
  84. browser_use/llm/azure/chat.py +91 -0
  85. browser_use/llm/base.py +57 -0
  86. browser_use/llm/browser_use/__init__.py +3 -0
  87. browser_use/llm/browser_use/chat.py +201 -0
  88. browser_use/llm/cerebras/chat.py +193 -0
  89. browser_use/llm/cerebras/serializer.py +109 -0
  90. browser_use/llm/deepseek/chat.py +212 -0
  91. browser_use/llm/deepseek/serializer.py +109 -0
  92. browser_use/llm/exceptions.py +29 -0
  93. browser_use/llm/google/__init__.py +3 -0
  94. browser_use/llm/google/chat.py +542 -0
  95. browser_use/llm/google/serializer.py +120 -0
  96. browser_use/llm/groq/chat.py +229 -0
  97. browser_use/llm/groq/parser.py +158 -0
  98. browser_use/llm/groq/serializer.py +159 -0
  99. browser_use/llm/messages.py +238 -0
  100. browser_use/llm/models.py +271 -0
  101. browser_use/llm/oci_raw/__init__.py +10 -0
  102. browser_use/llm/oci_raw/chat.py +443 -0
  103. browser_use/llm/oci_raw/serializer.py +229 -0
  104. browser_use/llm/ollama/chat.py +97 -0
  105. browser_use/llm/ollama/serializer.py +143 -0
  106. browser_use/llm/openai/chat.py +264 -0
  107. browser_use/llm/openai/like.py +15 -0
  108. browser_use/llm/openai/serializer.py +165 -0
  109. browser_use/llm/openrouter/chat.py +211 -0
  110. browser_use/llm/openrouter/serializer.py +26 -0
  111. browser_use/llm/schema.py +176 -0
  112. browser_use/llm/views.py +48 -0
  113. browser_use/logging_config.py +330 -0
  114. browser_use/mcp/__init__.py +18 -0
  115. browser_use/mcp/__main__.py +12 -0
  116. browser_use/mcp/client.py +544 -0
  117. browser_use/mcp/controller.py +264 -0
  118. browser_use/mcp/server.py +1114 -0
  119. browser_use/observability.py +204 -0
  120. browser_use/py.typed +0 -0
  121. browser_use/sandbox/__init__.py +41 -0
  122. browser_use/sandbox/sandbox.py +637 -0
  123. browser_use/sandbox/views.py +132 -0
  124. browser_use/screenshots/__init__.py +1 -0
  125. browser_use/screenshots/service.py +52 -0
  126. browser_use/sync/__init__.py +6 -0
  127. browser_use/sync/auth.py +357 -0
  128. browser_use/sync/service.py +161 -0
  129. browser_use/telemetry/__init__.py +51 -0
  130. browser_use/telemetry/service.py +112 -0
  131. browser_use/telemetry/views.py +101 -0
  132. browser_use/tokens/__init__.py +0 -0
  133. browser_use/tokens/custom_pricing.py +24 -0
  134. browser_use/tokens/mappings.py +4 -0
  135. browser_use/tokens/service.py +580 -0
  136. browser_use/tokens/views.py +108 -0
  137. browser_use/tools/registry/service.py +572 -0
  138. browser_use/tools/registry/views.py +174 -0
  139. browser_use/tools/service.py +1675 -0
  140. browser_use/tools/utils.py +82 -0
  141. browser_use/tools/views.py +100 -0
  142. browser_use/utils.py +670 -0
  143. optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
  144. optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
  145. optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
  146. optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
  147. optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,542 @@
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import time
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Literal, TypeVar, overload
7
+
8
+ from google import genai
9
+ from google.auth.credentials import Credentials
10
+ from google.genai import types
11
+ from google.genai.types import MediaModality
12
+ from pydantic import BaseModel
13
+
14
+ from browser_use.llm.base import BaseChatModel
15
+ from browser_use.llm.exceptions import ModelProviderError
16
+ from browser_use.llm.google.serializer import GoogleMessageSerializer
17
+ from browser_use.llm.messages import BaseMessage
18
+ from browser_use.llm.schema import SchemaOptimizer
19
+ from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage
20
+
21
+ T = TypeVar('T', bound=BaseModel)
22
+
23
+
24
+ VerifiedGeminiModels = Literal[
25
+ 'gemini-2.0-flash',
26
+ 'gemini-2.0-flash-exp',
27
+ 'gemini-2.0-flash-lite-preview-02-05',
28
+ 'Gemini-2.0-exp',
29
+ 'gemini-2.5-flash',
30
+ 'gemini-2.5-flash-lite',
31
+ 'gemini-flash-latest',
32
+ 'gemini-flash-lite-latest',
33
+ 'gemini-2.5-pro',
34
+ 'gemma-3-27b-it',
35
+ 'gemma-3-4b',
36
+ 'gemma-3-12b',
37
+ 'gemma-3n-e2b',
38
+ 'gemma-3n-e4b',
39
+ ]
40
+
41
+
42
+ @dataclass
43
+ class ChatGoogle(BaseChatModel):
44
+ """
45
+ A wrapper around Google's Gemini chat model using the genai client.
46
+
47
+ This class accepts all genai.Client parameters while adding model,
48
+ temperature, and config parameters for the LLM interface.
49
+
50
+ Args:
51
+ model: The Gemini model to use
52
+ temperature: Temperature for response generation
53
+ config: Additional configuration parameters to pass to generate_content
54
+ (e.g., tools, safety_settings, etc.).
55
+ api_key: Google API key
56
+ vertexai: Whether to use Vertex AI
57
+ credentials: Google credentials object
58
+ project: Google Cloud project ID
59
+ location: Google Cloud location
60
+ http_options: HTTP options for the client
61
+ include_system_in_user: If True, system messages are included in the first user message
62
+ supports_structured_output: If True, uses native JSON mode; if False, uses prompt-based fallback
63
+ max_retries: Number of retries for retryable errors (default: 3)
64
+ retryable_status_codes: List of HTTP status codes to retry on (default: [403, 503])
65
+ retry_delay: Delay in seconds between retries (default: 0.01)
66
+
67
+ Example:
68
+ from google.genai import types
69
+
70
+ llm = ChatGoogle(
71
+ model='gemini-2.0-flash-exp',
72
+ config={
73
+ 'tools': [types.Tool(code_execution=types.ToolCodeExecution())]
74
+ },
75
+ max_retries=5,
76
+ retryable_status_codes=[403, 503],
77
+ retry_delay=0.02
78
+ )
79
+ """
80
+
81
+ # Model configuration
82
+ model: VerifiedGeminiModels | str
83
+ temperature: float | None = 0.5
84
+ top_p: float | None = None
85
+ seed: int | None = None
86
+ thinking_budget: int | None = None # for gemini-2.5 flash and flash-lite models, default will be set to 0
87
+ max_output_tokens: int | None = 8096
88
+ config: types.GenerateContentConfigDict | None = None
89
+ include_system_in_user: bool = False
90
+ supports_structured_output: bool = True # New flag
91
+ max_retries: int = 3 # Number of retries for retryable errors
92
+ retryable_status_codes: list[int] = field(default_factory=lambda: [403, 503]) # Status codes to retry on
93
+ retry_delay: float = 0.01 # Delay in seconds between retries
94
+
95
+ # Client initialization parameters
96
+ api_key: str | None = None
97
+ vertexai: bool | None = None
98
+ credentials: Credentials | None = None
99
+ project: str | None = None
100
+ location: str | None = None
101
+ http_options: types.HttpOptions | types.HttpOptionsDict | None = None
102
+
103
+ # Internal client cache to prevent connection issues
104
+ _client: genai.Client | None = None
105
+
106
+ # Static
107
+ @property
108
+ def provider(self) -> str:
109
+ return 'google'
110
+
111
+ @property
112
+ def logger(self) -> logging.Logger:
113
+ """Get logger for this chat instance"""
114
+ return logging.getLogger(f'browser_use.llm.google.{self.model}')
115
+
116
+ def _get_client_params(self) -> dict[str, Any]:
117
+ """Prepare client parameters dictionary."""
118
+ # Define base client params
119
+ base_params = {
120
+ 'api_key': self.api_key,
121
+ 'vertexai': self.vertexai,
122
+ 'credentials': self.credentials,
123
+ 'project': self.project,
124
+ 'location': self.location,
125
+ 'http_options': self.http_options,
126
+ }
127
+
128
+ # Create client_params dict with non-None values
129
+ client_params = {k: v for k, v in base_params.items() if v is not None}
130
+
131
+ return client_params
132
+
133
+ def get_client(self) -> genai.Client:
134
+ """
135
+ Returns a genai.Client instance.
136
+
137
+ Returns:
138
+ genai.Client: An instance of the Google genai client.
139
+ """
140
+ if self._client is not None:
141
+ return self._client
142
+
143
+ client_params = self._get_client_params()
144
+ self._client = genai.Client(**client_params)
145
+ return self._client
146
+
147
+ @property
148
+ def name(self) -> str:
149
+ return str(self.model)
150
+
151
+ def _get_stop_reason(self, response: types.GenerateContentResponse) -> str | None:
152
+ """Extract stop_reason from Google response."""
153
+ if hasattr(response, 'candidates') and response.candidates:
154
+ return str(response.candidates[0].finish_reason) if hasattr(response.candidates[0], 'finish_reason') else None
155
+ return None
156
+
157
+ def _get_usage(self, response: types.GenerateContentResponse) -> ChatInvokeUsage | None:
158
+ usage: ChatInvokeUsage | None = None
159
+
160
+ if response.usage_metadata is not None:
161
+ image_tokens = 0
162
+ if response.usage_metadata.prompt_tokens_details is not None:
163
+ image_tokens = sum(
164
+ detail.token_count or 0
165
+ for detail in response.usage_metadata.prompt_tokens_details
166
+ if detail.modality == MediaModality.IMAGE
167
+ )
168
+
169
+ usage = ChatInvokeUsage(
170
+ prompt_tokens=response.usage_metadata.prompt_token_count or 0,
171
+ completion_tokens=(response.usage_metadata.candidates_token_count or 0)
172
+ + (response.usage_metadata.thoughts_token_count or 0),
173
+ total_tokens=response.usage_metadata.total_token_count or 0,
174
+ prompt_cached_tokens=response.usage_metadata.cached_content_token_count,
175
+ prompt_cache_creation_tokens=None,
176
+ prompt_image_tokens=image_tokens,
177
+ )
178
+
179
+ return usage
180
+
181
+ @overload
182
+ async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ...
183
+
184
+ @overload
185
+ async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ...
186
+
187
+ async def ainvoke(
188
+ self, messages: list[BaseMessage], output_format: type[T] | None = None
189
+ ) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]:
190
+ """
191
+ Invoke the model with the given messages.
192
+
193
+ Args:
194
+ messages: List of chat messages
195
+ output_format: Optional Pydantic model class for structured output
196
+
197
+ Returns:
198
+ Either a string response or an instance of output_format
199
+ """
200
+
201
+ # Serialize messages to Google format with the include_system_in_user flag
202
+ contents, system_instruction = GoogleMessageSerializer.serialize_messages(
203
+ messages, include_system_in_user=self.include_system_in_user
204
+ )
205
+
206
+ # Build config dictionary starting with user-provided config
207
+ config: types.GenerateContentConfigDict = {}
208
+ if self.config:
209
+ config = self.config.copy()
210
+
211
+ # Apply model-specific configuration (these can override config)
212
+ if self.temperature is not None:
213
+ config['temperature'] = self.temperature
214
+
215
+ # Add system instruction if present
216
+ if system_instruction:
217
+ config['system_instruction'] = system_instruction
218
+
219
+ if self.top_p is not None:
220
+ config['top_p'] = self.top_p
221
+
222
+ if self.seed is not None:
223
+ config['seed'] = self.seed
224
+
225
+ # set default for flash, flash-lite, gemini-flash-lite-latest, and gemini-flash-latest models
226
+ if self.thinking_budget is None and ('gemini-2.5-flash' in self.model or 'gemini-flash' in self.model):
227
+ self.thinking_budget = 0
228
+
229
+ if self.thinking_budget is not None:
230
+ thinking_config_dict: types.ThinkingConfigDict = {'thinking_budget': self.thinking_budget}
231
+ config['thinking_config'] = thinking_config_dict
232
+
233
+ if self.max_output_tokens is not None:
234
+ config['max_output_tokens'] = self.max_output_tokens
235
+
236
+ async def _make_api_call():
237
+ start_time = time.time()
238
+ self.logger.debug(f'🚀 Starting API call to {self.model}')
239
+
240
+ try:
241
+ if output_format is None:
242
+ # Return string response
243
+ self.logger.debug('📄 Requesting text response')
244
+
245
+ response = await self.get_client().aio.models.generate_content(
246
+ model=self.model,
247
+ contents=contents, # type: ignore
248
+ config=config,
249
+ )
250
+
251
+ elapsed = time.time() - start_time
252
+ self.logger.debug(f'✅ Got text response in {elapsed:.2f}s')
253
+
254
+ # Handle case where response.text might be None
255
+ text = response.text or ''
256
+ if not text:
257
+ self.logger.warning('⚠️ Empty text response received')
258
+
259
+ usage = self._get_usage(response)
260
+
261
+ return ChatInvokeCompletion(
262
+ completion=text,
263
+ usage=usage,
264
+ stop_reason=self._get_stop_reason(response),
265
+ )
266
+
267
+ else:
268
+ # Handle structured output
269
+ if self.supports_structured_output:
270
+ # Use native JSON mode
271
+ self.logger.debug(f'🔧 Requesting structured output for {output_format.__name__}')
272
+ config['response_mime_type'] = 'application/json'
273
+ # Convert Pydantic model to Gemini-compatible schema
274
+ optimized_schema = SchemaOptimizer.create_gemini_optimized_schema(output_format)
275
+
276
+ gemini_schema = self._fix_gemini_schema(optimized_schema)
277
+ config['response_schema'] = gemini_schema
278
+
279
+ response = await self.get_client().aio.models.generate_content(
280
+ model=self.model,
281
+ contents=contents,
282
+ config=config,
283
+ )
284
+
285
+ elapsed = time.time() - start_time
286
+ self.logger.debug(f'✅ Got structured response in {elapsed:.2f}s')
287
+
288
+ usage = self._get_usage(response)
289
+
290
+ # Handle case where response.parsed might be None
291
+ if response.parsed is None:
292
+ self.logger.debug('📝 Parsing JSON from text response')
293
+ # When using response_schema, Gemini returns JSON as text
294
+ if response.text:
295
+ try:
296
+ # Handle JSON wrapped in markdown code blocks (common Gemini behavior)
297
+ text = response.text.strip()
298
+ if text.startswith('```json') and text.endswith('```'):
299
+ text = text[7:-3].strip()
300
+ self.logger.debug('🔧 Stripped ```json``` wrapper from response')
301
+ elif text.startswith('```') and text.endswith('```'):
302
+ text = text[3:-3].strip()
303
+ self.logger.debug('🔧 Stripped ``` wrapper from response')
304
+
305
+ # Parse the JSON text and validate with the Pydantic model
306
+ parsed_data = json.loads(text)
307
+ return ChatInvokeCompletion(
308
+ completion=output_format.model_validate(parsed_data),
309
+ usage=usage,
310
+ stop_reason=self._get_stop_reason(response),
311
+ )
312
+ except (json.JSONDecodeError, ValueError) as e:
313
+ self.logger.error(f'❌ Failed to parse JSON response: {str(e)}')
314
+ self.logger.debug(f'Raw response text: {response.text[:200]}...')
315
+ raise ModelProviderError(
316
+ message=f'Failed to parse or validate response {response}: {str(e)}',
317
+ status_code=500,
318
+ model=self.model,
319
+ ) from e
320
+ else:
321
+ self.logger.error('❌ No response text received')
322
+ raise ModelProviderError(
323
+ message=f'No response from model {response}',
324
+ status_code=500,
325
+ model=self.model,
326
+ )
327
+
328
+ # Ensure we return the correct type
329
+ if isinstance(response.parsed, output_format):
330
+ return ChatInvokeCompletion(
331
+ completion=response.parsed,
332
+ usage=usage,
333
+ stop_reason=self._get_stop_reason(response),
334
+ )
335
+ else:
336
+ # If it's not the expected type, try to validate it
337
+ return ChatInvokeCompletion(
338
+ completion=output_format.model_validate(response.parsed),
339
+ usage=usage,
340
+ stop_reason=self._get_stop_reason(response),
341
+ )
342
+ else:
343
+ # Fallback: Request JSON in the prompt for models without native JSON mode
344
+ self.logger.debug(f'🔄 Using fallback JSON mode for {output_format.__name__}')
345
+ # Create a copy of messages to modify
346
+ modified_messages = [m.model_copy(deep=True) for m in messages]
347
+
348
+ # Add JSON instruction to the last message
349
+ if modified_messages and isinstance(modified_messages[-1].content, str):
350
+ json_instruction = f'\n\nPlease respond with a valid JSON object that matches this schema: {SchemaOptimizer.create_optimized_json_schema(output_format)}'
351
+ modified_messages[-1].content += json_instruction
352
+
353
+ # Re-serialize with modified messages
354
+ fallback_contents, fallback_system = GoogleMessageSerializer.serialize_messages(
355
+ modified_messages, include_system_in_user=self.include_system_in_user
356
+ )
357
+
358
+ # Update config with fallback system instruction if present
359
+ fallback_config = config.copy()
360
+ if fallback_system:
361
+ fallback_config['system_instruction'] = fallback_system
362
+
363
+ response = await self.get_client().aio.models.generate_content(
364
+ model=self.model,
365
+ contents=fallback_contents, # type: ignore
366
+ config=fallback_config,
367
+ )
368
+
369
+ elapsed = time.time() - start_time
370
+ self.logger.debug(f'✅ Got fallback response in {elapsed:.2f}s')
371
+
372
+ usage = self._get_usage(response)
373
+
374
+ # Try to extract JSON from the text response
375
+ if response.text:
376
+ try:
377
+ # Try to find JSON in the response
378
+ text = response.text.strip()
379
+
380
+ # Common patterns: JSON wrapped in markdown code blocks
381
+ if text.startswith('```json') and text.endswith('```'):
382
+ text = text[7:-3].strip()
383
+ elif text.startswith('```') and text.endswith('```'):
384
+ text = text[3:-3].strip()
385
+
386
+ # Parse and validate
387
+ parsed_data = json.loads(text)
388
+ return ChatInvokeCompletion(
389
+ completion=output_format.model_validate(parsed_data),
390
+ usage=usage,
391
+ stop_reason=self._get_stop_reason(response),
392
+ )
393
+ except (json.JSONDecodeError, ValueError) as e:
394
+ self.logger.error(f'❌ Failed to parse fallback JSON: {str(e)}')
395
+ self.logger.debug(f'Raw response text: {response.text[:200]}...')
396
+ raise ModelProviderError(
397
+ message=f'Model does not support JSON mode and failed to parse JSON from text response: {str(e)}',
398
+ status_code=500,
399
+ model=self.model,
400
+ ) from e
401
+ else:
402
+ self.logger.error('❌ No response text in fallback mode')
403
+ raise ModelProviderError(
404
+ message='No response from model',
405
+ status_code=500,
406
+ model=self.model,
407
+ )
408
+ except Exception as e:
409
+ elapsed = time.time() - start_time
410
+ self.logger.error(f'💥 API call failed after {elapsed:.2f}s: {type(e).__name__}: {e}')
411
+ # Re-raise the exception
412
+ raise
413
+
414
+ # Retry logic for certain errors
415
+ assert self.max_retries >= 1, 'max_retries must be at least 1'
416
+
417
+ for attempt in range(self.max_retries):
418
+ try:
419
+ return await _make_api_call()
420
+ except ModelProviderError as e:
421
+ # Retry if status code is in retryable list and we have attempts left
422
+ if e.status_code in self.retryable_status_codes and attempt < self.max_retries - 1:
423
+ self.logger.warning(f'⚠️ Got {e.status_code} error, retrying... (attempt {attempt + 1}/{self.max_retries})')
424
+ await asyncio.sleep(self.retry_delay)
425
+ continue
426
+ # Otherwise raise
427
+ raise
428
+ except Exception as e:
429
+ # For non-ModelProviderError, wrap and raise
430
+ error_message = str(e)
431
+ status_code: int | None = None
432
+
433
+ # Try to extract status code if available
434
+ if hasattr(e, 'response'):
435
+ response_obj = getattr(e, 'response', None)
436
+ if response_obj and hasattr(response_obj, 'status_code'):
437
+ status_code = getattr(response_obj, 'status_code', None)
438
+
439
+ # Enhanced timeout error handling
440
+ if 'timeout' in error_message.lower() or 'cancelled' in error_message.lower():
441
+ if isinstance(e, asyncio.CancelledError) or 'CancelledError' in str(type(e)):
442
+ error_message = 'Gemini API request was cancelled (likely timeout). Consider: 1) Reducing input size, 2) Using a different model, 3) Checking network connectivity.'
443
+ status_code = 504
444
+ else:
445
+ status_code = 408
446
+ elif any(indicator in error_message.lower() for indicator in ['forbidden', '403']):
447
+ status_code = 403
448
+ elif any(
449
+ indicator in error_message.lower()
450
+ for indicator in ['rate limit', 'resource exhausted', 'quota exceeded', 'too many requests', '429']
451
+ ):
452
+ status_code = 429
453
+ elif any(
454
+ indicator in error_message.lower()
455
+ for indicator in ['service unavailable', 'internal server error', 'bad gateway', '503', '502', '500']
456
+ ):
457
+ status_code = 503
458
+
459
+ raise ModelProviderError(
460
+ message=error_message,
461
+ status_code=status_code or 502,
462
+ model=self.name,
463
+ ) from e
464
+
465
+ raise RuntimeError('Retry loop completed without return or exception')
466
+
467
+ def _fix_gemini_schema(self, schema: dict[str, Any]) -> dict[str, Any]:
468
+ """
469
+ Convert a Pydantic model to a Gemini-compatible schema.
470
+
471
+ This function removes unsupported properties like 'additionalProperties' and resolves
472
+ $ref references that Gemini doesn't support.
473
+ """
474
+
475
+ # Handle $defs and $ref resolution
476
+ if '$defs' in schema:
477
+ defs = schema.pop('$defs')
478
+
479
+ def resolve_refs(obj: Any) -> Any:
480
+ if isinstance(obj, dict):
481
+ if '$ref' in obj:
482
+ ref = obj.pop('$ref')
483
+ ref_name = ref.split('/')[-1]
484
+ if ref_name in defs:
485
+ # Replace the reference with the actual definition
486
+ resolved = defs[ref_name].copy()
487
+ # Merge any additional properties from the reference
488
+ for key, value in obj.items():
489
+ if key != '$ref':
490
+ resolved[key] = value
491
+ return resolve_refs(resolved)
492
+ return obj
493
+ else:
494
+ # Recursively process all dictionary values
495
+ return {k: resolve_refs(v) for k, v in obj.items()}
496
+ elif isinstance(obj, list):
497
+ return [resolve_refs(item) for item in obj]
498
+ return obj
499
+
500
+ schema = resolve_refs(schema)
501
+
502
+ # Remove unsupported properties
503
+ def clean_schema(obj: Any) -> Any:
504
+ if isinstance(obj, dict):
505
+ # Remove unsupported properties
506
+ cleaned = {}
507
+ for key, value in obj.items():
508
+ if key not in ['additionalProperties', 'title', 'default']:
509
+ cleaned_value = clean_schema(value)
510
+ # Handle empty object properties - Gemini doesn't allow empty OBJECT types
511
+ if (
512
+ key == 'properties'
513
+ and isinstance(cleaned_value, dict)
514
+ and len(cleaned_value) == 0
515
+ and isinstance(obj.get('type', ''), str)
516
+ and obj.get('type', '').upper() == 'OBJECT'
517
+ ):
518
+ # Convert empty object to have at least one property
519
+ cleaned['properties'] = {'_placeholder': {'type': 'string'}}
520
+ else:
521
+ cleaned[key] = cleaned_value
522
+
523
+ # If this is an object type with empty properties, add a placeholder
524
+ if (
525
+ isinstance(cleaned.get('type', ''), str)
526
+ and cleaned.get('type', '').upper() == 'OBJECT'
527
+ and 'properties' in cleaned
528
+ and isinstance(cleaned['properties'], dict)
529
+ and len(cleaned['properties']) == 0
530
+ ):
531
+ cleaned['properties'] = {'_placeholder': {'type': 'string'}}
532
+
533
+ # Also remove 'title' from the required list if it exists
534
+ if 'required' in cleaned and isinstance(cleaned.get('required'), list):
535
+ cleaned['required'] = [p for p in cleaned['required'] if p != 'title']
536
+
537
+ return cleaned
538
+ elif isinstance(obj, list):
539
+ return [clean_schema(item) for item in obj]
540
+ return obj
541
+
542
+ return clean_schema(schema)
@@ -0,0 +1,120 @@
1
+ import base64
2
+
3
+ from google.genai.types import Content, ContentListUnion, Part
4
+
5
+ from browser_use.llm.messages import (
6
+ AssistantMessage,
7
+ BaseMessage,
8
+ SystemMessage,
9
+ UserMessage,
10
+ )
11
+
12
+
13
+ class GoogleMessageSerializer:
14
+ """Serializer for converting messages to Google Gemini format."""
15
+
16
+ @staticmethod
17
+ def serialize_messages(
18
+ messages: list[BaseMessage], include_system_in_user: bool = False
19
+ ) -> tuple[ContentListUnion, str | None]:
20
+ """
21
+ Convert a list of BaseMessages to Google format, extracting system message.
22
+
23
+ Google handles system instructions separately from the conversation, so we need to:
24
+ 1. Extract any system messages and return them separately as a string (or include in first user message if flag is set)
25
+ 2. Convert the remaining messages to Content objects
26
+
27
+ Args:
28
+ messages: List of messages to convert
29
+ include_system_in_user: If True, system/developer messages are prepended to the first user message
30
+
31
+ Returns:
32
+ A tuple of (formatted_messages, system_message) where:
33
+ - formatted_messages: List of Content objects for the conversation
34
+ - system_message: System instruction string or None
35
+ """
36
+
37
+ messages = [m.model_copy(deep=True) for m in messages]
38
+
39
+ formatted_messages: ContentListUnion = []
40
+ system_message: str | None = None
41
+ system_parts: list[str] = []
42
+
43
+ for i, message in enumerate(messages):
44
+ role = message.role if hasattr(message, 'role') else None
45
+
46
+ # Handle system/developer messages
47
+ if isinstance(message, SystemMessage) or role in ['system', 'developer']:
48
+ # Extract system message content as string
49
+ if isinstance(message.content, str):
50
+ if include_system_in_user:
51
+ system_parts.append(message.content)
52
+ else:
53
+ system_message = message.content
54
+ elif message.content is not None:
55
+ # Handle Iterable of content parts
56
+ parts = []
57
+ for part in message.content:
58
+ if part.type == 'text':
59
+ parts.append(part.text)
60
+ combined_text = '\n'.join(parts)
61
+ if include_system_in_user:
62
+ system_parts.append(combined_text)
63
+ else:
64
+ system_message = combined_text
65
+ continue
66
+
67
+ # Determine the role for non-system messages
68
+ if isinstance(message, UserMessage):
69
+ role = 'user'
70
+ elif isinstance(message, AssistantMessage):
71
+ role = 'model'
72
+ else:
73
+ # Default to user for any unknown message types
74
+ role = 'user'
75
+
76
+ # Initialize message parts
77
+ message_parts: list[Part] = []
78
+
79
+ # If this is the first user message and we have system parts, prepend them
80
+ if include_system_in_user and system_parts and role == 'user' and not formatted_messages:
81
+ system_text = '\n\n'.join(system_parts)
82
+ if isinstance(message.content, str):
83
+ message_parts.append(Part.from_text(text=f'{system_text}\n\n{message.content}'))
84
+ else:
85
+ # Add system text as the first part
86
+ message_parts.append(Part.from_text(text=system_text))
87
+ system_parts = [] # Clear after using
88
+ else:
89
+ # Extract content and create parts normally
90
+ if isinstance(message.content, str):
91
+ # Regular text content
92
+ message_parts = [Part.from_text(text=message.content)]
93
+ elif message.content is not None:
94
+ # Handle Iterable of content parts
95
+ for part in message.content:
96
+ if part.type == 'text':
97
+ message_parts.append(Part.from_text(text=part.text))
98
+ elif part.type == 'refusal':
99
+ message_parts.append(Part.from_text(text=f'[Refusal] {part.refusal}'))
100
+ elif part.type == 'image_url':
101
+ # Handle images
102
+ url = part.image_url.url
103
+
104
+ # Format: data:image/jpeg;base64,<data>
105
+ header, data = url.split(',', 1)
106
+ # Decode base64 to bytes
107
+ image_bytes = base64.b64decode(data)
108
+
109
+ # Add image part
110
+ image_part = Part.from_bytes(data=image_bytes, mime_type='image/jpeg')
111
+
112
+ message_parts.append(image_part)
113
+
114
+ # Create the Content object
115
+ if message_parts:
116
+ final_message = Content(role=role, parts=message_parts)
117
+ # for some reason, the type checker is not able to infer the type of formatted_messages
118
+ formatted_messages.append(final_message) # type: ignore
119
+
120
+ return formatted_messages, system_message