droidrun 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- droidrun/__init__.py +22 -10
- droidrun/__main__.py +1 -2
- droidrun/adb/__init__.py +3 -3
- droidrun/adb/device.py +2 -2
- droidrun/adb/manager.py +2 -2
- droidrun/agent/__init__.py +5 -15
- droidrun/agent/codeact/__init__.py +11 -0
- droidrun/agent/codeact/codeact_agent.py +420 -0
- droidrun/agent/codeact/events.py +28 -0
- droidrun/agent/codeact/prompts.py +26 -0
- droidrun/agent/common/default.py +5 -0
- droidrun/agent/common/events.py +4 -0
- droidrun/agent/context/__init__.py +23 -0
- droidrun/agent/context/agent_persona.py +15 -0
- droidrun/agent/context/context_injection_manager.py +66 -0
- droidrun/agent/context/episodic_memory.py +15 -0
- droidrun/agent/context/personas/__init__.py +11 -0
- droidrun/agent/context/personas/app_starter.py +44 -0
- droidrun/agent/context/personas/default.py +95 -0
- droidrun/agent/context/personas/extractor.py +52 -0
- droidrun/agent/context/personas/ui_expert.py +107 -0
- droidrun/agent/context/reflection.py +20 -0
- droidrun/agent/context/task_manager.py +124 -0
- droidrun/agent/context/todo.txt +4 -0
- droidrun/agent/droid/__init__.py +13 -0
- droidrun/agent/droid/droid_agent.py +357 -0
- droidrun/agent/droid/events.py +28 -0
- droidrun/agent/oneflows/reflector.py +265 -0
- droidrun/agent/planner/__init__.py +13 -0
- droidrun/agent/planner/events.py +16 -0
- droidrun/agent/planner/planner_agent.py +268 -0
- droidrun/agent/planner/prompts.py +124 -0
- droidrun/agent/utils/__init__.py +3 -0
- droidrun/agent/utils/async_utils.py +17 -0
- droidrun/agent/utils/chat_utils.py +312 -0
- droidrun/agent/utils/executer.py +132 -0
- droidrun/agent/utils/llm_picker.py +147 -0
- droidrun/agent/utils/trajectory.py +184 -0
- droidrun/cli/__init__.py +1 -1
- droidrun/cli/logs.py +283 -0
- droidrun/cli/main.py +358 -149
- droidrun/run.py +105 -0
- droidrun/tools/__init__.py +4 -30
- droidrun/tools/adb.py +879 -0
- droidrun/tools/ios.py +594 -0
- droidrun/tools/tools.py +99 -0
- droidrun-0.3.0.dist-info/METADATA +149 -0
- droidrun-0.3.0.dist-info/RECORD +52 -0
- droidrun/agent/llm_reasoning.py +0 -567
- droidrun/agent/react_agent.py +0 -556
- droidrun/llm/__init__.py +0 -24
- droidrun/tools/actions.py +0 -854
- droidrun/tools/device.py +0 -29
- droidrun-0.1.0.dist-info/METADATA +0 -276
- droidrun-0.1.0.dist-info/RECORD +0 -20
- {droidrun-0.1.0.dist-info → droidrun-0.3.0.dist-info}/WHEEL +0 -0
- {droidrun-0.1.0.dist-info → droidrun-0.3.0.dist-info}/entry_points.txt +0 -0
- {droidrun-0.1.0.dist-info → droidrun-0.3.0.dist-info}/licenses/LICENSE +0 -0
droidrun/agent/llm_reasoning.py
DELETED
@@ -1,567 +0,0 @@
"""
LLM Reasoning - Provides reasoning capabilities for the ReAct agent.

This module handles the integration with LLM providers to generate reasoning steps.
"""

import asyncio
import json
import os
import re
import logging
from typing import Any, Dict, List, Optional

# Import OpenAI for LLM integration
try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False

# Import Anthropic for Claude integration
try:
    import anthropic
    ANTHROPIC_AVAILABLE = True
except ImportError:
    ANTHROPIC_AVAILABLE = False

# Set up logger
logger = logging.getLogger("droidrun")

# Simple token estimator (very rough approximation)
def estimate_tokens(text: str) -> int:
    """Estimate number of tokens in a string.

    This is a very rough approximation based on the rule of thumb that
    1 token is approximately 4 characters for English text.

    Args:
        text: Input text

    Returns:
        Estimated token count
    """
    if not text:
        return 0
    return len(text) // 4 + 1  # Add 1 to be safe

class LLMReasoner:
    """LLM-based reasoner for ReAct agent."""

    def __init__(
        self,
        llm_provider: str = "openai",
        model_name: Optional[str] = None,
        api_key: Optional[str] = None,
        temperature: float = 0.2,
        max_tokens: int = 2000,
        vision: bool = False
    ):
        """Initialize the LLM reasoner.

        Args:
            llm_provider: LLM provider ('openai', 'anthropic', or 'gemini').
                If model_name starts with 'gemini-', provider will be set to 'gemini' automatically.
            model_name: Model name to use
            api_key: API key for the LLM provider
            temperature: Temperature for generation
            max_tokens: Maximum tokens to generate
            vision: Whether vision capabilities (screenshot) are enabled
        """
        # Auto-detect Gemini models
        if model_name and model_name.startswith("gemini-"):
            llm_provider = "gemini"

        self.llm_provider = llm_provider.lower()
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.vision = vision

        # Token usage tracking
        self.total_prompt_tokens = 0
        self.total_completion_tokens = 0
        self.total_tokens = 0
        self.api_calls = 0

        # Set up model and client based on provider
        if self.llm_provider == "gemini":
            if not OPENAI_AVAILABLE:
                raise ImportError("OpenAI package not installed. Install with 'pip install openai'")

            # Set default model if not specified
            self.model_name = model_name or "gemini-2.0-flash"

            # Get API key from env var if not provided
            self.api_key = api_key or os.environ.get("GEMINI_API_KEY")
            if not self.api_key:
                raise ValueError("Gemini API key not provided and not found in environment (GEMINI_API_KEY)")

            # Initialize client with Gemini configuration
            self.client = OpenAI(
                api_key=self.api_key,
                base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
            )
            logger.info(f"Initialized Gemini client with model {self.model_name}")

        elif self.llm_provider == "openai":
            if not OPENAI_AVAILABLE:
                raise ImportError("OpenAI package not installed. Install with 'pip install openai'")

            # If vision is enabled, verify we're using a vision-capable model without auto-switching
            if vision:
                if model_name and not (model_name.startswith("gpt-4-vision") or model_name.startswith("gpt-4o") or model_name.endswith("-vision")):
                    # Instead of auto-switching, raise an error
                    raise ValueError(f"The selected model '{model_name}' does not support vision. Please manually specify a vision-capable model like gpt-4o or gpt-4-vision.")
                elif not model_name:
                    # Only set default model if none was specified
                    model_name = "gpt-4o"  # Default vision model
                    logger.info(f"Using vision-capable model: {model_name}")

            # Set default model if not specified
            self.model_name = model_name or "gpt-4o-mini"

            # Get API key from env var if not provided
            self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
            if not self.api_key:
                raise ValueError("OpenAI API key not provided and not found in environment")

            # Initialize client
            self.client = OpenAI(api_key=self.api_key)

        elif self.llm_provider == "anthropic":
            if not ANTHROPIC_AVAILABLE:
                raise ImportError("Anthropic package not installed. Install with 'pip install anthropic'")

            # If vision is enabled, verify we're using a vision-capable model without auto-switching
            if vision:
                if model_name and not ("claude-3" in model_name):
                    # Instead of auto-switching, raise an error
                    raise ValueError(f"The selected model '{model_name}' does not support vision. Please manually specify a Claude 3 model which supports vision capabilities.")
                elif not model_name:
                    # Only set default model if none was specified
                    model_name = "claude-3-opus-20240229"  # Default vision model
                    logger.info(f"Using vision-capable Claude model: {model_name}")

            # Set default model if not specified
            self.model_name = model_name or "claude-3-opus-20240229"

            # Get API key from env var if not provided
            self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
            if not self.api_key:
                raise ValueError("Anthropic API key not provided and not found in environment")

            # Initialize client
            self.client = anthropic.Anthropic(api_key=self.api_key)

        else:
            raise ValueError(f"Unsupported LLM provider: {llm_provider}")

    def get_token_usage_stats(self) -> Dict[str, int]:
        """Get current token usage statistics.

        Returns:
            Dictionary with token usage statistics
        """
        return {
            "prompt_tokens": self.total_prompt_tokens,
            "completion_tokens": self.total_completion_tokens,
            "total_tokens": self.total_tokens,
            "api_calls": self.api_calls
        }

    async def reason(
        self,
        goal: str,
        history: List[Dict[str, Any]],
        available_tools: Optional[List[str]] = None,
        screenshot_data: Optional[bytes] = None
    ) -> Dict[str, Any]:
        """Generate a reasoning step using the LLM.

        Args:
            goal: The automation goal
            history: List of previous steps as dictionaries
            available_tools: Optional list of available tool names
            screenshot_data: Optional bytes containing the latest screenshot

        Returns:
            Dictionary with next reasoning step, including thought,
            action, and any parameters
        """
        # Print current token usage stats before making the call
        logger.info(f"Token usage before API call: {self.get_token_usage_stats()}")

        # Construct the prompt
        system_prompt = self._create_system_prompt(available_tools)
        user_prompt = self._create_user_prompt(goal, history)

        try:
            # Call the LLM based on provider
            if self.llm_provider in ["openai", "gemini"]:  # Handle both OpenAI and Gemini with OpenAI client
                response = await self._call_openai(system_prompt, user_prompt, screenshot_data)
            elif self.llm_provider == "anthropic":
                response = await self._call_anthropic(system_prompt, user_prompt, screenshot_data)
            else:
                raise ValueError(f"Unsupported LLM provider: {self.llm_provider}")

            # Parse the response
            result = self._parse_response(response)

            # Print updated token usage stats after the call
            logger.info(f"Token usage after API call: {self.get_token_usage_stats()}")

            return result

        except Exception as e:
            error_str = str(e)
            if "content[1].type" in error_str and "image" in error_str and self.vision:
                logger.error(f"Vision error with {self.llm_provider} API: {e}")
                logger.error("The selected model does not support image inputs. Please use a vision-capable model like gpt-4o or gpt-4-vision.")
                return {
                    "thought": f"Error: The selected model '{self.model_name}' does not support vision. Please use a vision-capable model like gpt-4o.",
                    "action": "error",
                    "parameters": {}
                }
            else:
                logger.error(f"Error in LLM reasoning: {e}")
                # Return a fallback response
                return {
                    "thought": f"LLM reasoning error: {e}",
                    "action": "error",
                    "parameters": {}
                }

    def _create_system_prompt(self, available_tools: Optional[List[str]] = None) -> str:
        """Create the system prompt for the LLM.

        Args:
            available_tools: Optional list of available tool names

        Returns:
            System prompt string
        """
        # Base system prompt
        prompt = """
You are an user assitant for an Android phone. Your task is to control an Android device to achieve a specified goal the user is asking for.
Follow these guidelines:

1. Analyze the current screen state from the UI state getting all UI elements
2. Think step-by-step to plan your actions
3. Choose the most appropriate tool for each step
4. Return your response in JSON format with the following fields:
   - thought: Your detailed reasoning about the current state and what to do next
   - action: The name of the tool to execute (use EXACT tool name without any parentheses)
   - parameters: A dictionary of parameters to pass to the tool

IMPORTANT: When specifying the action field:
- Never add parentheses to the tool name
- Common mistakes to avoid:
  ❌ "get_clickables()"
  ✅ "get_clickables"

You have two very important tools for your observations.
1. You can get all UI elements to get a better understanding of the current screen including all texts container on the screen. Use this to to analyze the current ui context.
2. If you want to take action, after you analyzed the context, you can get all the clickable elements for your next interactive step. Only use this tool if you know about your current ui context.

"""

        # Add vision-specific instructions if vision is enabled
        if self.vision:
            prompt += """
You have access to screenshots through the take_screenshot tool. Use it when visual context is needed.
"""
        else:
            prompt += """
Vision is disabled. Rely solely on text-based UI element data from get_clickables.
"""

        # Tool documentation with exact parameter names
        tool_docs = {
            "tap": "tap(index: int) - Tap on the element with the given index on the device",

            "swipe": "swipe(start_x: int, start_y: int, end_x: int, end_y: int, duration_ms: int = 300) - Swipe from (start_x,start_y) to (end_x,end_y) over duration_ms milliseconds",

            "input_text": "input_text(text: str) - Input text on the device - this works only if an input is focused. Always make sure that an edit field was tapped before inserting text",

            "press_key": "press_key(keycode: int) - Press a key on the device using keycode",

            "start_app": "start_app(package: str, activity: str = '') - Start an app using its package name (e.g., 'com.android.settings')",

            "list_packages": "list_packages(include_system_apps: bool = False) - List installed packages on the device, returns detailed package information",

            "get_clickables": "get_clickables() - Get only the clickable UI elements from the device screen. Returns a dictionary containing interactive elements with their properties",

            "complete": "complete(result: str) - IMPORTANT: This tool should ONLY be called after you have ACTUALLY completed all necessary actions for the goal. It does not perform any actions itself - it only signals that you have already achieved the goal through other actions. Include a summary of what was accomplished as the result parameter.",
        }

        # Add take_screenshot tool only if vision is enabled
        if self.vision:
            tool_docs["take_screenshot"] = "take_screenshot() - Take a screenshot to better understand the current UI. Use when you need visual context."

        # Add available tools information if provided
        if available_tools:
            prompt += "\n\nAvailable tools and their parameters:\n"

            # Only include docs for available tools
            for tool in available_tools:
                if tool in tool_docs:
                    prompt += f"- {tool_docs[tool]}\n"
                else:
                    prompt += f"- {tool} (parameters unknown)\n"

        return prompt

    def _create_user_prompt(
        self,
        goal: str,
        history: List[Dict[str, Any]],
    ) -> str:
        """Create the user prompt for the LLM.

        Args:
            goal: The automation goal
            history: List of previous steps

        Returns:
            User prompt string
        """
        prompt = f"Goal: {goal}\n\n"

        # Add truncated history if available
        if history:
            # Start with a budget for tokens (very rough approximation)
            total_budget = 100000  # Conservative limit to leave room for response

            # Estimate tokens for the goal and other parts
            goal_tokens = estimate_tokens(goal) * 2  # Account for repetition

            # Calculate remaining budget for history
            history_budget = total_budget - goal_tokens

            # Start with most recent history and work backwards
            truncated_history = []
            current_size = 0

            # Copy and reverse history to process most recent first
            reversed_history = list(reversed(history))

            for step in reversed_history:
                step_type = step.get("type", "").upper()
                content = step.get("content", "")
                step_text = f"{step_type}: {content}\n"
                step_tokens = estimate_tokens(step_text)

                # If this step would exceed our budget, stop adding
                if current_size + step_tokens > history_budget:
                    # Add a note about truncation
                    truncated_history.insert(0, "... (earlier history truncated)")
                    break

                # Otherwise, add this step and update our current size
                truncated_history.insert(0, step_text)
                current_size += step_tokens

            # Add the truncated history to the prompt
            prompt += "History:\n"
            for step_text in truncated_history:
                prompt += step_text
            prompt += "\n"

        prompt += "Based on the current state, what's your next action? Return your response in JSON format."

        # Final sanity check - if prompt is still too large, truncate aggressively
        if estimate_tokens(prompt) > 100000:
            logger.warning("Prompt still too large after normal truncation. Applying emergency truncation.")
            # Keep the beginning (goal) and end (instructions) but truncate the middle
            beginning = prompt[:2000]  # Keep goal
            end = prompt[-1000:]  # Keep final instructions
            prompt = beginning + "\n... (content truncated to fit token limits) ...\n" + end

        return prompt

    async def _call_openai(self, system_prompt: str, user_prompt: str, screenshot_data: Optional[bytes] = None) -> str:
        """Call OpenAI or Gemini API to generate a response.

        Args:
            system_prompt: System prompt string
            user_prompt: User prompt string
            screenshot_data: Optional bytes containing the latest screenshot

        Returns:
            Generated response string
        """
        try:
            messages = [
                {"role": "system", "content": system_prompt},
            ]

            # If we have a screenshot, add it as a message with the image
            if screenshot_data:
                import base64
                base64_image = base64.b64encode(screenshot_data).decode('utf-8')

                # Different image format for different providers
                if self.llm_provider == "gemini":
                    # Gemini format
                    image_content = {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                else:
                    # OpenAI and others format
                    image_content = {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }

                messages.append({
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Here's the current screenshot of the device. Please analyze it to help with the next action."
                        },
                        image_content
                    ]
                })

            # Add the main user prompt
            messages.append({"role": "user", "content": user_prompt})

            response = await asyncio.to_thread(
                self.client.chat.completions.create,
                model=self.model_name,
                messages=messages,
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                response_format={"type": "json_object"}
            )

            # Extract token usage statistics
            usage = response.usage
            prompt_tokens = usage.prompt_tokens
            completion_tokens = usage.completion_tokens
            total_tokens = usage.total_tokens

            # Update token usage counters
            self.total_prompt_tokens += prompt_tokens
            self.total_completion_tokens += completion_tokens
            self.total_tokens += total_tokens
            self.api_calls += 1

            # Print token usage information
            logger.info("===== Token Usage Statistics =====")
            logger.info(f"API Call #{self.api_calls}")
            logger.info(f"This call: {prompt_tokens} prompt + {completion_tokens} completion = {total_tokens} tokens")
            logger.info(f"Cumulative: {self.total_prompt_tokens} prompt + {self.total_completion_tokens} completion = {self.total_tokens} tokens")
            logger.info("=================================")

            return response.choices[0].message.content
        except Exception as e:
            logger.error(f"Error calling {'Gemini' if self.llm_provider == 'gemini' else 'OpenAI'} API: {e}")
            raise

    async def _call_anthropic(self, system_prompt: str, user_prompt: str, screenshot_data: Optional[bytes] = None) -> str:
        """Call Anthropic API to generate a response.

        Args:
            system_prompt: System prompt string
            user_prompt: User prompt string
            screenshot_data: Optional bytes containing the latest screenshot

        Returns:
            Generated response string
        """
        try:
            messages = []

            # If we have a screenshot, add it as a message with the image
            if screenshot_data:
                import base64
                # Convert the image bytes to base64
                base64_image = base64.b64encode(screenshot_data).decode('utf-8')
                messages.append({
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": base64_image
                            }
                        },
                        {
                            "type": "text",
                            "text": "Here's the current screenshot of the device. Please analyze it to help with the next action."
                        }
                    ]
                })

            # Add the main user prompt
            messages.append({"role": "user", "content": user_prompt})

            response = await asyncio.to_thread(
                self.client.messages.create,
                model=self.model_name,
                system=system_prompt,
                messages=messages,
                temperature=self.temperature,
                max_tokens=self.max_tokens
            )
            return response.content[0].text
        except Exception as e:
            logger.error(f"Error calling Anthropic API: {e}")
            raise

    def _parse_response(self, response: str) -> Dict[str, Any]:
        """Parse the LLM response into a structured format.

        Args:
            response: LLM response string

        Returns:
            Dictionary with parsed response
        """
        try:
            # Try to parse as JSON
            data = json.loads(response)

            # Ensure required fields are present
            if "thought" not in data:
                data["thought"] = "No thought provided"
            if "action" not in data:
                data["action"] = "no_action"
            if "parameters" not in data:
                data["parameters"] = {}

            return data
        except json.JSONDecodeError:
            # If not valid JSON, try to extract fields using regex
            thought_match = re.search(r'thought["\s:]+([^"]+)', response)
            action_match = re.search(r'action["\s:]+([^",\n]+)', response)
            params_match = re.search(r'parameters["\s:]+({.+})', response, re.DOTALL)

            thought = thought_match.group(1) if thought_match else "Failed to parse thought"
            action = action_match.group(1) if action_match else "no_action"

            # Try to parse parameters
            params = {}
            if params_match:
                try:
                    params_str = params_match.group(1)
                    # Replace single quotes with double quotes for valid JSON
                    params_str = params_str.replace("'", "\"")
                    params = json.loads(params_str)
                except json.JSONDecodeError:
                    logger.warning("Failed to parse parameters JSON")

            return {
                "thought": thought,
                "action": action,
                "parameters": params
            }