optexity-browser-use 0.9.5 (optexity_browser_use-0.9.5-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_use/__init__.py +157 -0
- browser_use/actor/__init__.py +11 -0
- browser_use/actor/element.py +1175 -0
- browser_use/actor/mouse.py +134 -0
- browser_use/actor/page.py +561 -0
- browser_use/actor/playground/flights.py +41 -0
- browser_use/actor/playground/mixed_automation.py +54 -0
- browser_use/actor/playground/playground.py +236 -0
- browser_use/actor/utils.py +176 -0
- browser_use/agent/cloud_events.py +282 -0
- browser_use/agent/gif.py +424 -0
- browser_use/agent/judge.py +170 -0
- browser_use/agent/message_manager/service.py +473 -0
- browser_use/agent/message_manager/utils.py +52 -0
- browser_use/agent/message_manager/views.py +98 -0
- browser_use/agent/prompts.py +413 -0
- browser_use/agent/service.py +2316 -0
- browser_use/agent/system_prompt.md +185 -0
- browser_use/agent/system_prompt_flash.md +10 -0
- browser_use/agent/system_prompt_no_thinking.md +183 -0
- browser_use/agent/views.py +743 -0
- browser_use/browser/__init__.py +41 -0
- browser_use/browser/cloud/cloud.py +203 -0
- browser_use/browser/cloud/views.py +89 -0
- browser_use/browser/events.py +578 -0
- browser_use/browser/profile.py +1158 -0
- browser_use/browser/python_highlights.py +548 -0
- browser_use/browser/session.py +3225 -0
- browser_use/browser/session_manager.py +399 -0
- browser_use/browser/video_recorder.py +162 -0
- browser_use/browser/views.py +200 -0
- browser_use/browser/watchdog_base.py +260 -0
- browser_use/browser/watchdogs/__init__.py +0 -0
- browser_use/browser/watchdogs/aboutblank_watchdog.py +253 -0
- browser_use/browser/watchdogs/crash_watchdog.py +335 -0
- browser_use/browser/watchdogs/default_action_watchdog.py +2729 -0
- browser_use/browser/watchdogs/dom_watchdog.py +817 -0
- browser_use/browser/watchdogs/downloads_watchdog.py +1277 -0
- browser_use/browser/watchdogs/local_browser_watchdog.py +461 -0
- browser_use/browser/watchdogs/permissions_watchdog.py +43 -0
- browser_use/browser/watchdogs/popups_watchdog.py +143 -0
- browser_use/browser/watchdogs/recording_watchdog.py +126 -0
- browser_use/browser/watchdogs/screenshot_watchdog.py +62 -0
- browser_use/browser/watchdogs/security_watchdog.py +280 -0
- browser_use/browser/watchdogs/storage_state_watchdog.py +335 -0
- browser_use/cli.py +2359 -0
- browser_use/code_use/__init__.py +16 -0
- browser_use/code_use/formatting.py +192 -0
- browser_use/code_use/namespace.py +665 -0
- browser_use/code_use/notebook_export.py +276 -0
- browser_use/code_use/service.py +1340 -0
- browser_use/code_use/system_prompt.md +574 -0
- browser_use/code_use/utils.py +150 -0
- browser_use/code_use/views.py +171 -0
- browser_use/config.py +505 -0
- browser_use/controller/__init__.py +3 -0
- browser_use/dom/enhanced_snapshot.py +161 -0
- browser_use/dom/markdown_extractor.py +169 -0
- browser_use/dom/playground/extraction.py +312 -0
- browser_use/dom/playground/multi_act.py +32 -0
- browser_use/dom/serializer/clickable_elements.py +200 -0
- browser_use/dom/serializer/code_use_serializer.py +287 -0
- browser_use/dom/serializer/eval_serializer.py +478 -0
- browser_use/dom/serializer/html_serializer.py +212 -0
- browser_use/dom/serializer/paint_order.py +197 -0
- browser_use/dom/serializer/serializer.py +1170 -0
- browser_use/dom/service.py +825 -0
- browser_use/dom/utils.py +129 -0
- browser_use/dom/views.py +906 -0
- browser_use/exceptions.py +5 -0
- browser_use/filesystem/__init__.py +0 -0
- browser_use/filesystem/file_system.py +619 -0
- browser_use/init_cmd.py +376 -0
- browser_use/integrations/gmail/__init__.py +24 -0
- browser_use/integrations/gmail/actions.py +115 -0
- browser_use/integrations/gmail/service.py +225 -0
- browser_use/llm/__init__.py +155 -0
- browser_use/llm/anthropic/chat.py +242 -0
- browser_use/llm/anthropic/serializer.py +312 -0
- browser_use/llm/aws/__init__.py +36 -0
- browser_use/llm/aws/chat_anthropic.py +242 -0
- browser_use/llm/aws/chat_bedrock.py +289 -0
- browser_use/llm/aws/serializer.py +257 -0
- browser_use/llm/azure/chat.py +91 -0
- browser_use/llm/base.py +57 -0
- browser_use/llm/browser_use/__init__.py +3 -0
- browser_use/llm/browser_use/chat.py +201 -0
- browser_use/llm/cerebras/chat.py +193 -0
- browser_use/llm/cerebras/serializer.py +109 -0
- browser_use/llm/deepseek/chat.py +212 -0
- browser_use/llm/deepseek/serializer.py +109 -0
- browser_use/llm/exceptions.py +29 -0
- browser_use/llm/google/__init__.py +3 -0
- browser_use/llm/google/chat.py +542 -0
- browser_use/llm/google/serializer.py +120 -0
- browser_use/llm/groq/chat.py +229 -0
- browser_use/llm/groq/parser.py +158 -0
- browser_use/llm/groq/serializer.py +159 -0
- browser_use/llm/messages.py +238 -0
- browser_use/llm/models.py +271 -0
- browser_use/llm/oci_raw/__init__.py +10 -0
- browser_use/llm/oci_raw/chat.py +443 -0
- browser_use/llm/oci_raw/serializer.py +229 -0
- browser_use/llm/ollama/chat.py +97 -0
- browser_use/llm/ollama/serializer.py +143 -0
- browser_use/llm/openai/chat.py +264 -0
- browser_use/llm/openai/like.py +15 -0
- browser_use/llm/openai/serializer.py +165 -0
- browser_use/llm/openrouter/chat.py +211 -0
- browser_use/llm/openrouter/serializer.py +26 -0
- browser_use/llm/schema.py +176 -0
- browser_use/llm/views.py +48 -0
- browser_use/logging_config.py +330 -0
- browser_use/mcp/__init__.py +18 -0
- browser_use/mcp/__main__.py +12 -0
- browser_use/mcp/client.py +544 -0
- browser_use/mcp/controller.py +264 -0
- browser_use/mcp/server.py +1114 -0
- browser_use/observability.py +204 -0
- browser_use/py.typed +0 -0
- browser_use/sandbox/__init__.py +41 -0
- browser_use/sandbox/sandbox.py +637 -0
- browser_use/sandbox/views.py +132 -0
- browser_use/screenshots/__init__.py +1 -0
- browser_use/screenshots/service.py +52 -0
- browser_use/sync/__init__.py +6 -0
- browser_use/sync/auth.py +357 -0
- browser_use/sync/service.py +161 -0
- browser_use/telemetry/__init__.py +51 -0
- browser_use/telemetry/service.py +112 -0
- browser_use/telemetry/views.py +101 -0
- browser_use/tokens/__init__.py +0 -0
- browser_use/tokens/custom_pricing.py +24 -0
- browser_use/tokens/mappings.py +4 -0
- browser_use/tokens/service.py +580 -0
- browser_use/tokens/views.py +108 -0
- browser_use/tools/registry/service.py +572 -0
- browser_use/tools/registry/views.py +174 -0
- browser_use/tools/service.py +1675 -0
- browser_use/tools/utils.py +82 -0
- browser_use/tools/views.py +100 -0
- browser_use/utils.py +670 -0
- optexity_browser_use-0.9.5.dist-info/METADATA +344 -0
- optexity_browser_use-0.9.5.dist-info/RECORD +147 -0
- optexity_browser_use-0.9.5.dist-info/WHEEL +4 -0
- optexity_browser_use-0.9.5.dist-info/entry_points.txt +3 -0
- optexity_browser_use-0.9.5.dist-info/licenses/LICENSE +21 -0
browser_use/llm/google/chat.py

@@ -0,0 +1,542 @@

```python
import asyncio
import json
import logging
import time
from dataclasses import dataclass, field
from typing import Any, Literal, TypeVar, overload

from google import genai
from google.auth.credentials import Credentials
from google.genai import types
from google.genai.types import MediaModality
from pydantic import BaseModel

from browser_use.llm.base import BaseChatModel
from browser_use.llm.exceptions import ModelProviderError
from browser_use.llm.google.serializer import GoogleMessageSerializer
from browser_use.llm.messages import BaseMessage
from browser_use.llm.schema import SchemaOptimizer
from browser_use.llm.views import ChatInvokeCompletion, ChatInvokeUsage

T = TypeVar('T', bound=BaseModel)


VerifiedGeminiModels = Literal[
	'gemini-2.0-flash',
	'gemini-2.0-flash-exp',
	'gemini-2.0-flash-lite-preview-02-05',
	'Gemini-2.0-exp',
	'gemini-2.5-flash',
	'gemini-2.5-flash-lite',
	'gemini-flash-latest',
	'gemini-flash-lite-latest',
	'gemini-2.5-pro',
	'gemma-3-27b-it',
	'gemma-3-4b',
	'gemma-3-12b',
	'gemma-3n-e2b',
	'gemma-3n-e4b',
]


@dataclass
class ChatGoogle(BaseChatModel):
	"""
	A wrapper around Google's Gemini chat model using the genai client.

	This class accepts all genai.Client parameters while adding model,
	temperature, and config parameters for the LLM interface.

	Args:
		model: The Gemini model to use
		temperature: Temperature for response generation
		config: Additional configuration parameters to pass to generate_content
			(e.g., tools, safety_settings, etc.)
		api_key: Google API key
		vertexai: Whether to use Vertex AI
		credentials: Google credentials object
		project: Google Cloud project ID
		location: Google Cloud location
		http_options: HTTP options for the client
		include_system_in_user: If True, system messages are included in the first user message
		supports_structured_output: If True, uses native JSON mode; if False, uses prompt-based fallback
		max_retries: Number of retries for retryable errors (default: 3)
		retryable_status_codes: List of HTTP status codes to retry on (default: [403, 503])
		retry_delay: Delay in seconds between retries (default: 0.01)

	Example:
		from google.genai import types

		llm = ChatGoogle(
			model='gemini-2.0-flash-exp',
			config={
				'tools': [types.Tool(code_execution=types.ToolCodeExecution())]
			},
			max_retries=5,
			retryable_status_codes=[403, 503],
			retry_delay=0.02
		)
	"""

	# Model configuration
	model: VerifiedGeminiModels | str
	temperature: float | None = 0.5
	top_p: float | None = None
	seed: int | None = None
	thinking_budget: int | None = None  # for gemini-2.5 flash and flash-lite models, default will be set to 0
	max_output_tokens: int | None = 8096
	config: types.GenerateContentConfigDict | None = None
	include_system_in_user: bool = False
	supports_structured_output: bool = True  # New flag
	max_retries: int = 3  # Number of retries for retryable errors
	retryable_status_codes: list[int] = field(default_factory=lambda: [403, 503])  # Status codes to retry on
	retry_delay: float = 0.01  # Delay in seconds between retries

	# Client initialization parameters
	api_key: str | None = None
	vertexai: bool | None = None
	credentials: Credentials | None = None
	project: str | None = None
	location: str | None = None
	http_options: types.HttpOptions | types.HttpOptionsDict | None = None

	# Internal client cache to prevent connection issues
	_client: genai.Client | None = None

	# Static
	@property
	def provider(self) -> str:
		return 'google'

	@property
	def logger(self) -> logging.Logger:
		"""Get logger for this chat instance"""
		return logging.getLogger(f'browser_use.llm.google.{self.model}')

	def _get_client_params(self) -> dict[str, Any]:
		"""Prepare client parameters dictionary."""
		# Define base client params
		base_params = {
			'api_key': self.api_key,
			'vertexai': self.vertexai,
			'credentials': self.credentials,
			'project': self.project,
			'location': self.location,
			'http_options': self.http_options,
		}

		# Create client_params dict with non-None values
		client_params = {k: v for k, v in base_params.items() if v is not None}

		return client_params

	def get_client(self) -> genai.Client:
		"""
		Returns a genai.Client instance.

		Returns:
			genai.Client: An instance of the Google genai client.
		"""
		if self._client is not None:
			return self._client

		client_params = self._get_client_params()
		self._client = genai.Client(**client_params)
		return self._client

	@property
	def name(self) -> str:
		return str(self.model)

	def _get_stop_reason(self, response: types.GenerateContentResponse) -> str | None:
		"""Extract stop_reason from Google response."""
		if hasattr(response, 'candidates') and response.candidates:
			return str(response.candidates[0].finish_reason) if hasattr(response.candidates[0], 'finish_reason') else None
		return None

	def _get_usage(self, response: types.GenerateContentResponse) -> ChatInvokeUsage | None:
		usage: ChatInvokeUsage | None = None

		if response.usage_metadata is not None:
			image_tokens = 0
			if response.usage_metadata.prompt_tokens_details is not None:
				image_tokens = sum(
					detail.token_count or 0
					for detail in response.usage_metadata.prompt_tokens_details
					if detail.modality == MediaModality.IMAGE
				)

			usage = ChatInvokeUsage(
				prompt_tokens=response.usage_metadata.prompt_token_count or 0,
				completion_tokens=(response.usage_metadata.candidates_token_count or 0)
				+ (response.usage_metadata.thoughts_token_count or 0),
				total_tokens=response.usage_metadata.total_token_count or 0,
				prompt_cached_tokens=response.usage_metadata.cached_content_token_count,
				prompt_cache_creation_tokens=None,
				prompt_image_tokens=image_tokens,
			)

		return usage

	@overload
	async def ainvoke(self, messages: list[BaseMessage], output_format: None = None) -> ChatInvokeCompletion[str]: ...

	@overload
	async def ainvoke(self, messages: list[BaseMessage], output_format: type[T]) -> ChatInvokeCompletion[T]: ...

	async def ainvoke(
		self, messages: list[BaseMessage], output_format: type[T] | None = None
	) -> ChatInvokeCompletion[T] | ChatInvokeCompletion[str]:
		"""
		Invoke the model with the given messages.

		Args:
			messages: List of chat messages
			output_format: Optional Pydantic model class for structured output

		Returns:
			Either a string response or an instance of output_format
		"""

		# Serialize messages to Google format with the include_system_in_user flag
		contents, system_instruction = GoogleMessageSerializer.serialize_messages(
			messages, include_system_in_user=self.include_system_in_user
		)

		# Build config dictionary starting with user-provided config
		config: types.GenerateContentConfigDict = {}
		if self.config:
			config = self.config.copy()

		# Apply model-specific configuration (these can override config)
		if self.temperature is not None:
			config['temperature'] = self.temperature

		# Add system instruction if present
		if system_instruction:
			config['system_instruction'] = system_instruction

		if self.top_p is not None:
			config['top_p'] = self.top_p

		if self.seed is not None:
			config['seed'] = self.seed

		# set default for flash, flash-lite, gemini-flash-lite-latest, and gemini-flash-latest models
		if self.thinking_budget is None and ('gemini-2.5-flash' in self.model or 'gemini-flash' in self.model):
			self.thinking_budget = 0

		if self.thinking_budget is not None:
			thinking_config_dict: types.ThinkingConfigDict = {'thinking_budget': self.thinking_budget}
			config['thinking_config'] = thinking_config_dict

		if self.max_output_tokens is not None:
			config['max_output_tokens'] = self.max_output_tokens

		async def _make_api_call():
			start_time = time.time()
			self.logger.debug(f'🚀 Starting API call to {self.model}')

			try:
				if output_format is None:
					# Return string response
					self.logger.debug('📄 Requesting text response')

					response = await self.get_client().aio.models.generate_content(
						model=self.model,
						contents=contents,  # type: ignore
						config=config,
					)

					elapsed = time.time() - start_time
					self.logger.debug(f'✅ Got text response in {elapsed:.2f}s')

					# Handle case where response.text might be None
					text = response.text or ''
					if not text:
						self.logger.warning('⚠️ Empty text response received')

					usage = self._get_usage(response)

					return ChatInvokeCompletion(
						completion=text,
						usage=usage,
						stop_reason=self._get_stop_reason(response),
					)

				else:
					# Handle structured output
					if self.supports_structured_output:
						# Use native JSON mode
						self.logger.debug(f'🔧 Requesting structured output for {output_format.__name__}')
						config['response_mime_type'] = 'application/json'
						# Convert Pydantic model to Gemini-compatible schema
						optimized_schema = SchemaOptimizer.create_gemini_optimized_schema(output_format)

						gemini_schema = self._fix_gemini_schema(optimized_schema)
						config['response_schema'] = gemini_schema

						response = await self.get_client().aio.models.generate_content(
							model=self.model,
							contents=contents,
							config=config,
						)

						elapsed = time.time() - start_time
						self.logger.debug(f'✅ Got structured response in {elapsed:.2f}s')

						usage = self._get_usage(response)

						# Handle case where response.parsed might be None
						if response.parsed is None:
							self.logger.debug('📝 Parsing JSON from text response')
							# When using response_schema, Gemini returns JSON as text
							if response.text:
								try:
									# Handle JSON wrapped in markdown code blocks (common Gemini behavior)
									text = response.text.strip()
									if text.startswith('```json') and text.endswith('```'):
										text = text[7:-3].strip()
										self.logger.debug('🔧 Stripped ```json``` wrapper from response')
									elif text.startswith('```') and text.endswith('```'):
										text = text[3:-3].strip()
										self.logger.debug('🔧 Stripped ``` wrapper from response')

									# Parse the JSON text and validate with the Pydantic model
									parsed_data = json.loads(text)
									return ChatInvokeCompletion(
										completion=output_format.model_validate(parsed_data),
										usage=usage,
										stop_reason=self._get_stop_reason(response),
									)
								except (json.JSONDecodeError, ValueError) as e:
									self.logger.error(f'❌ Failed to parse JSON response: {str(e)}')
									self.logger.debug(f'Raw response text: {response.text[:200]}...')
									raise ModelProviderError(
										message=f'Failed to parse or validate response {response}: {str(e)}',
										status_code=500,
										model=self.model,
									) from e
							else:
								self.logger.error('❌ No response text received')
								raise ModelProviderError(
									message=f'No response from model {response}',
									status_code=500,
									model=self.model,
								)

						# Ensure we return the correct type
						if isinstance(response.parsed, output_format):
							return ChatInvokeCompletion(
								completion=response.parsed,
								usage=usage,
								stop_reason=self._get_stop_reason(response),
							)
						else:
							# If it's not the expected type, try to validate it
							return ChatInvokeCompletion(
								completion=output_format.model_validate(response.parsed),
								usage=usage,
								stop_reason=self._get_stop_reason(response),
							)
					else:
						# Fallback: Request JSON in the prompt for models without native JSON mode
						self.logger.debug(f'🔄 Using fallback JSON mode for {output_format.__name__}')
						# Create a copy of messages to modify
						modified_messages = [m.model_copy(deep=True) for m in messages]

						# Add JSON instruction to the last message
						if modified_messages and isinstance(modified_messages[-1].content, str):
							json_instruction = f'\n\nPlease respond with a valid JSON object that matches this schema: {SchemaOptimizer.create_optimized_json_schema(output_format)}'
							modified_messages[-1].content += json_instruction

						# Re-serialize with modified messages
						fallback_contents, fallback_system = GoogleMessageSerializer.serialize_messages(
							modified_messages, include_system_in_user=self.include_system_in_user
						)

						# Update config with fallback system instruction if present
						fallback_config = config.copy()
						if fallback_system:
							fallback_config['system_instruction'] = fallback_system

						response = await self.get_client().aio.models.generate_content(
							model=self.model,
							contents=fallback_contents,  # type: ignore
							config=fallback_config,
						)

						elapsed = time.time() - start_time
						self.logger.debug(f'✅ Got fallback response in {elapsed:.2f}s')

						usage = self._get_usage(response)

						# Try to extract JSON from the text response
						if response.text:
							try:
								# Try to find JSON in the response
								text = response.text.strip()

								# Common patterns: JSON wrapped in markdown code blocks
								if text.startswith('```json') and text.endswith('```'):
									text = text[7:-3].strip()
								elif text.startswith('```') and text.endswith('```'):
									text = text[3:-3].strip()

								# Parse and validate
								parsed_data = json.loads(text)
								return ChatInvokeCompletion(
									completion=output_format.model_validate(parsed_data),
									usage=usage,
									stop_reason=self._get_stop_reason(response),
								)
							except (json.JSONDecodeError, ValueError) as e:
								self.logger.error(f'❌ Failed to parse fallback JSON: {str(e)}')
								self.logger.debug(f'Raw response text: {response.text[:200]}...')
								raise ModelProviderError(
									message=f'Model does not support JSON mode and failed to parse JSON from text response: {str(e)}',
									status_code=500,
									model=self.model,
								) from e
						else:
							self.logger.error('❌ No response text in fallback mode')
							raise ModelProviderError(
								message='No response from model',
								status_code=500,
								model=self.model,
							)
			except Exception as e:
				elapsed = time.time() - start_time
				self.logger.error(f'💥 API call failed after {elapsed:.2f}s: {type(e).__name__}: {e}')
				# Re-raise the exception
				raise

		# Retry logic for certain errors
		assert self.max_retries >= 1, 'max_retries must be at least 1'

		for attempt in range(self.max_retries):
			try:
				return await _make_api_call()
			except ModelProviderError as e:
				# Retry if status code is in retryable list and we have attempts left
				if e.status_code in self.retryable_status_codes and attempt < self.max_retries - 1:
					self.logger.warning(f'⚠️ Got {e.status_code} error, retrying... (attempt {attempt + 1}/{self.max_retries})')
					await asyncio.sleep(self.retry_delay)
					continue
				# Otherwise raise
				raise
			except Exception as e:
				# For non-ModelProviderError, wrap and raise
				error_message = str(e)
				status_code: int | None = None

				# Try to extract status code if available
				if hasattr(e, 'response'):
					response_obj = getattr(e, 'response', None)
					if response_obj and hasattr(response_obj, 'status_code'):
						status_code = getattr(response_obj, 'status_code', None)

				# Enhanced timeout error handling
				if 'timeout' in error_message.lower() or 'cancelled' in error_message.lower():
					if isinstance(e, asyncio.CancelledError) or 'CancelledError' in str(type(e)):
						error_message = 'Gemini API request was cancelled (likely timeout). Consider: 1) Reducing input size, 2) Using a different model, 3) Checking network connectivity.'
						status_code = 504
					else:
						status_code = 408
				elif any(indicator in error_message.lower() for indicator in ['forbidden', '403']):
					status_code = 403
				elif any(
					indicator in error_message.lower()
					for indicator in ['rate limit', 'resource exhausted', 'quota exceeded', 'too many requests', '429']
				):
					status_code = 429
				elif any(
					indicator in error_message.lower()
					for indicator in ['service unavailable', 'internal server error', 'bad gateway', '503', '502', '500']
				):
					status_code = 503

				raise ModelProviderError(
					message=error_message,
					status_code=status_code or 502,
					model=self.name,
				) from e

		raise RuntimeError('Retry loop completed without return or exception')

	def _fix_gemini_schema(self, schema: dict[str, Any]) -> dict[str, Any]:
		"""
		Convert a Pydantic model to a Gemini-compatible schema.

		This function removes unsupported properties like 'additionalProperties' and resolves
		$ref references that Gemini doesn't support.
		"""

		# Handle $defs and $ref resolution
		if '$defs' in schema:
			defs = schema.pop('$defs')

			def resolve_refs(obj: Any) -> Any:
				if isinstance(obj, dict):
					if '$ref' in obj:
						ref = obj.pop('$ref')
						ref_name = ref.split('/')[-1]
						if ref_name in defs:
							# Replace the reference with the actual definition
							resolved = defs[ref_name].copy()
							# Merge any additional properties from the reference
							for key, value in obj.items():
								if key != '$ref':
									resolved[key] = value
							return resolve_refs(resolved)
						return obj
					else:
						# Recursively process all dictionary values
						return {k: resolve_refs(v) for k, v in obj.items()}
				elif isinstance(obj, list):
					return [resolve_refs(item) for item in obj]
				return obj

			schema = resolve_refs(schema)

		# Remove unsupported properties
		def clean_schema(obj: Any) -> Any:
			if isinstance(obj, dict):
				# Remove unsupported properties
				cleaned = {}
				for key, value in obj.items():
					if key not in ['additionalProperties', 'title', 'default']:
						cleaned_value = clean_schema(value)
						# Handle empty object properties - Gemini doesn't allow empty OBJECT types
						if (
							key == 'properties'
							and isinstance(cleaned_value, dict)
							and len(cleaned_value) == 0
							and isinstance(obj.get('type', ''), str)
							and obj.get('type', '').upper() == 'OBJECT'
						):
							# Convert empty object to have at least one property
							cleaned['properties'] = {'_placeholder': {'type': 'string'}}
						else:
							cleaned[key] = cleaned_value

				# If this is an object type with empty properties, add a placeholder
				if (
					isinstance(cleaned.get('type', ''), str)
					and cleaned.get('type', '').upper() == 'OBJECT'
					and 'properties' in cleaned
					and isinstance(cleaned['properties'], dict)
					and len(cleaned['properties']) == 0
				):
					cleaned['properties'] = {'_placeholder': {'type': 'string'}}

				# Also remove 'title' from the required list if it exists
				if 'required' in cleaned and isinstance(cleaned.get('required'), list):
					cleaned['required'] = [p for p in cleaned['required'] if p != 'title']

				return cleaned
			elif isinstance(obj, list):
				return [clean_schema(item) for item in obj]
			return obj

		return clean_schema(schema)
```
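In short, `ChatGoogle.ainvoke` returns a plain-text completion when `output_format` is omitted and a validated Pydantic instance when one is supplied, with the retry loop wrapping both paths. A minimal usage sketch follows; the model name, placeholder API key, and `CityInfo` schema are illustrative, not part of the package:

```python
import asyncio

from pydantic import BaseModel

from browser_use.llm.google.chat import ChatGoogle
from browser_use.llm.messages import UserMessage


class CityInfo(BaseModel):  # hypothetical schema, for illustration only
	city: str
	country: str


async def main() -> None:
	# ChatGoogle is a dataclass; retry behavior is configured per instance
	llm = ChatGoogle(model='gemini-2.5-flash', api_key='...', max_retries=5)

	# Text mode: no output_format -> ChatInvokeCompletion[str]
	text = await llm.ainvoke([UserMessage(content='Name one city in France.')])
	print(text.completion, text.usage)

	# Structured mode: native JSON schema path (supports_structured_output=True)
	result = await llm.ainvoke(
		[UserMessage(content='Name one city in France.')],
		output_format=CityInfo,
	)
	print(result.completion.city, result.stop_reason)


asyncio.run(main())
```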
browser_use/llm/google/serializer.py

@@ -0,0 +1,120 @@

```python
import base64

from google.genai.types import Content, ContentListUnion, Part

from browser_use.llm.messages import (
	AssistantMessage,
	BaseMessage,
	SystemMessage,
	UserMessage,
)


class GoogleMessageSerializer:
	"""Serializer for converting messages to Google Gemini format."""

	@staticmethod
	def serialize_messages(
		messages: list[BaseMessage], include_system_in_user: bool = False
	) -> tuple[ContentListUnion, str | None]:
		"""
		Convert a list of BaseMessages to Google format, extracting system message.

		Google handles system instructions separately from the conversation, so we need to:
		1. Extract any system messages and return them separately as a string (or include in first user message if flag is set)
		2. Convert the remaining messages to Content objects

		Args:
			messages: List of messages to convert
			include_system_in_user: If True, system/developer messages are prepended to the first user message

		Returns:
			A tuple of (formatted_messages, system_message) where:
			- formatted_messages: List of Content objects for the conversation
			- system_message: System instruction string or None
		"""

		messages = [m.model_copy(deep=True) for m in messages]

		formatted_messages: ContentListUnion = []
		system_message: str | None = None
		system_parts: list[str] = []

		for i, message in enumerate(messages):
			role = message.role if hasattr(message, 'role') else None

			# Handle system/developer messages
			if isinstance(message, SystemMessage) or role in ['system', 'developer']:
				# Extract system message content as string
				if isinstance(message.content, str):
					if include_system_in_user:
						system_parts.append(message.content)
					else:
						system_message = message.content
				elif message.content is not None:
					# Handle Iterable of content parts
					parts = []
					for part in message.content:
						if part.type == 'text':
							parts.append(part.text)
					combined_text = '\n'.join(parts)
					if include_system_in_user:
						system_parts.append(combined_text)
					else:
						system_message = combined_text
				continue

			# Determine the role for non-system messages
			if isinstance(message, UserMessage):
				role = 'user'
			elif isinstance(message, AssistantMessage):
				role = 'model'
			else:
				# Default to user for any unknown message types
				role = 'user'

			# Initialize message parts
			message_parts: list[Part] = []

			# If this is the first user message and we have system parts, prepend them
			if include_system_in_user and system_parts and role == 'user' and not formatted_messages:
				system_text = '\n\n'.join(system_parts)
				if isinstance(message.content, str):
					message_parts.append(Part.from_text(text=f'{system_text}\n\n{message.content}'))
				else:
					# Add system text as the first part
					message_parts.append(Part.from_text(text=system_text))
				system_parts = []  # Clear after using
			else:
				# Extract content and create parts normally
				if isinstance(message.content, str):
					# Regular text content
					message_parts = [Part.from_text(text=message.content)]
				elif message.content is not None:
					# Handle Iterable of content parts
					for part in message.content:
						if part.type == 'text':
							message_parts.append(Part.from_text(text=part.text))
						elif part.type == 'refusal':
							message_parts.append(Part.from_text(text=f'[Refusal] {part.refusal}'))
						elif part.type == 'image_url':
							# Handle images
							url = part.image_url.url

							# Format: data:image/jpeg;base64,<data>
							header, data = url.split(',', 1)
							# Decode base64 to bytes
							image_bytes = base64.b64decode(data)

							# Add image part
							image_part = Part.from_bytes(data=image_bytes, mime_type='image/jpeg')

							message_parts.append(image_part)

			# Create the Content object
			if message_parts:
				final_message = Content(role=role, parts=message_parts)
				# for some reason, the type checker is not able to infer the type of formatted_messages
				formatted_messages.append(final_message)  # type: ignore

		return formatted_messages, system_message
```
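A short sketch of the serializer's contract under the two `include_system_in_user` settings; the message texts are illustrative, and message construction is assumed to follow `browser_use.llm.messages`:

```python
from browser_use.llm.google.serializer import GoogleMessageSerializer
from browser_use.llm.messages import SystemMessage, UserMessage

msgs = [SystemMessage(content='Be terse.'), UserMessage(content='Hi')]

# Default: system text is returned separately, for config['system_instruction']
contents, system = GoogleMessageSerializer.serialize_messages(msgs)
assert system == 'Be terse.'
assert contents[0].role == 'user'  # single Content whose only text Part is 'Hi'

# include_system_in_user=True folds the system text into the first user turn instead
contents2, system2 = GoogleMessageSerializer.serialize_messages(msgs, include_system_in_user=True)
assert system2 is None
assert contents2[0].parts[0].text == 'Be terse.\n\nHi'
```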