khoj 1.41.1.dev43__py3-none-any.whl → 1.41.1.dev90__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to their respective public registries. It is provided for informational purposes only.
- khoj/database/adapters/__init__.py +1 -1
- khoj/interface/compiled/404/index.html +2 -2
- khoj/interface/compiled/_next/static/chunks/{2327-f03b2a77f67b8f8c.js → 2327-aa22697ed9c8d54a.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/4986-9ddd694756d03aa1.js +1 -0
- khoj/interface/compiled/_next/static/chunks/{8515-010dd769c584b672.js → 8515-f305779d95dd5780.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-4e2a134ec26aa606.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/agents/{page-ceeb9a91edea74ce.js → page-996513ae80f8720c.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/automations/{page-e3cb78747ab98cc7.js → page-2320231573aa9a49.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-ad4d1792ab1a4108.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/chat/{page-14ac9d1ad5cb84c5.js → page-6257055246cdebd5.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/{page-a4053e1bb578b2ce.js → page-d9a2e44bbcf49f82.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-f5881c7ae3ba0795.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/search/{page-8973da2f4c076fe1.js → page-31452bbda0e0a56f.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/settings/{page-375136dbb400525b.js → page-fdb72b15ca908b43.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-abb6c5f4239ad7be.js +1 -0
- khoj/interface/compiled/_next/static/chunks/app/share/chat/{page-384b54fc953b18f2.js → page-5b7cb35d835af900.js} +1 -1
- khoj/interface/compiled/_next/static/chunks/{webpack-1169ca6e9e7e6247.js → webpack-e091508620cb8aef.js} +1 -1
- khoj/interface/compiled/_next/static/css/{fca983d49c3dd1a3.css → 0db53bacf81896f5.css} +1 -1
- khoj/interface/compiled/_next/static/css/55d4a822f8d94b67.css +1 -0
- khoj/interface/compiled/agents/index.html +2 -2
- khoj/interface/compiled/agents/index.txt +2 -2
- khoj/interface/compiled/automations/index.html +2 -2
- khoj/interface/compiled/automations/index.txt +3 -3
- khoj/interface/compiled/chat/index.html +2 -2
- khoj/interface/compiled/chat/index.txt +2 -2
- khoj/interface/compiled/index.html +2 -2
- khoj/interface/compiled/index.txt +2 -2
- khoj/interface/compiled/search/index.html +2 -2
- khoj/interface/compiled/search/index.txt +2 -2
- khoj/interface/compiled/settings/index.html +2 -2
- khoj/interface/compiled/settings/index.txt +4 -4
- khoj/interface/compiled/share/chat/index.html +2 -2
- khoj/interface/compiled/share/chat/index.txt +2 -2
- khoj/processor/conversation/anthropic/anthropic_chat.py +5 -0
- khoj/processor/conversation/google/gemini_chat.py +5 -0
- khoj/processor/conversation/openai/gpt.py +5 -0
- khoj/processor/conversation/prompts.py +12 -1
- khoj/processor/conversation/utils.py +12 -0
- khoj/processor/operator/grounding_agent.py +345 -0
- khoj/processor/operator/grounding_agent_uitars.py +973 -0
- khoj/processor/operator/operate_browser.py +152 -0
- khoj/processor/operator/operator_actions.py +149 -0
- khoj/processor/operator/operator_agent_anthropic.py +383 -0
- khoj/processor/operator/operator_agent_base.py +80 -0
- khoj/processor/operator/operator_agent_binary.py +336 -0
- khoj/processor/operator/operator_agent_openai.py +349 -0
- khoj/processor/operator/operator_environment_base.py +37 -0
- khoj/processor/operator/operator_environment_browser.py +395 -0
- khoj/routers/api_chat.py +42 -3
- khoj/routers/helpers.py +14 -3
- khoj/routers/research.py +48 -1
- khoj/utils/helpers.py +17 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/METADATA +3 -1
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/RECORD +65 -55
- khoj/interface/compiled/_next/static/chunks/4986-14ea63faad1615a4.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/agents/layout-e00fb81dca656a10.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/chat/layout-33934fc2d6ae6838.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/search/layout-c02531d586972d7d.js +0 -1
- khoj/interface/compiled/_next/static/chunks/app/share/chat/layout-e8e5db7830bf3f47.js +0 -1
- khoj/interface/compiled/_next/static/css/f29752d6e1be7624.css +0 -1
- /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → WLmcH2J-wz36GlS6O8HSL}/_buildManifest.js +0 -0
- /khoj/interface/compiled/_next/static/{doKtSKC0j2ECO8K8viDKD → WLmcH2J-wz36GlS6O8HSL}/_ssgManifest.js +0 -0
- /khoj/interface/compiled/_next/static/chunks/{1915-ab4353eaca76f690.js → 1915-1943ee8a628b893c.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{2117-1c18aa2098982bf9.js → 2117-5a41630a2bd2eae8.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4363-4efaf12abe696251.js → 4363-e6ac2203564d1a3b.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{4447-5d44807c40355b1a.js → 4447-e038b251d626c340.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{8667-adbe6017a66cef10.js → 8667-8136f74e9a086fca.js} +0 -0
- /khoj/interface/compiled/_next/static/chunks/{9259-d8bcd9da9e80c81e.js → 9259-640fdd77408475df.js} +0 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/WHEEL +0 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/entry_points.txt +0 -0
- {khoj-1.41.1.dev43.dist-info → khoj-1.41.1.dev90.dist-info}/licenses/LICENSE +0 -0
khoj/processor/operator/operate_browser.py
@@ -0,0 +1,152 @@
+import asyncio
+import json
+import logging
+import os
+from typing import Callable, List, Optional
+
+import requests
+
+from khoj.database.adapters import AgentAdapters, ConversationAdapters
+from khoj.database.models import Agent, ChatModel, KhojUser
+from khoj.processor.operator.operator_actions import *
+from khoj.processor.operator.operator_agent_anthropic import AnthropicOperatorAgent
+from khoj.processor.operator.operator_agent_base import OperatorAgent
+from khoj.processor.operator.operator_agent_binary import BinaryOperatorAgent
+from khoj.processor.operator.operator_agent_openai import OpenAIOperatorAgent
+from khoj.processor.operator.operator_environment_base import EnvStepResult
+from khoj.processor.operator.operator_environment_browser import BrowserEnvironment
+from khoj.routers.helpers import ChatEvent
+from khoj.utils.helpers import timer
+from khoj.utils.rawconfig import LocationData
+
+logger = logging.getLogger(__name__)
+
+
+# --- Browser Operator Function ---
+async def operate_browser(
+    query: str,
+    user: KhojUser,
+    conversation_log: dict,
+    location_data: LocationData,
+    send_status_func: Optional[Callable] = None,
+    query_images: Optional[List[str]] = None,  # TODO: Handle query images
+    agent: Agent = None,
+    query_files: str = None,  # TODO: Handle query files
+    cancellation_event: Optional[asyncio.Event] = None,
+    tracer: dict = {},
+):
+    response, summary_message, user_input_message = None, None, None
+    environment: Optional[BrowserEnvironment] = None
+
+    # Get the agent chat model
+    agent_chat_model = await AgentAdapters.aget_agent_chat_model(agent, user) if agent else None
+    reasoning_model: ChatModel = await ConversationAdapters.aget_default_chat_model(user, agent_chat_model)
+    if not reasoning_model or not reasoning_model.vision_enabled:
+        reasoning_model = await ConversationAdapters.aget_vision_enabled_config()
+    if not reasoning_model:
+        raise ValueError(f"No vision enabled chat model found. Configure a vision chat model to operate browser.")
+
+    # Initialize Agent
+    max_iterations = int(os.getenv("KHOJ_OPERATOR_ITERATIONS", 40))
+    operator_agent: OperatorAgent
+    if reasoning_model.name.startswith("gpt-4o"):
+        operator_agent = OpenAIOperatorAgent(query, reasoning_model, max_iterations, tracer)
+    elif reasoning_model.name.startswith("claude-3-7-sonnet"):
+        operator_agent = AnthropicOperatorAgent(query, reasoning_model, max_iterations, tracer)
+    else:
+        grounding_model_name = "ui-tars-1.5"
+        grounding_model = await ConversationAdapters.aget_chat_model_by_name(grounding_model_name)
+        if (
+            not grounding_model
+            or not grounding_model.vision_enabled
+            or not grounding_model.model_type == ChatModel.ModelType.OPENAI
+        ):
+            raise ValueError("No supported visual grounding model for binary operator agent found.")
+        operator_agent = BinaryOperatorAgent(query, reasoning_model, grounding_model, max_iterations, tracer)
+
+    # Initialize Environment
+    if send_status_func:
+        async for event in send_status_func(f"**Launching Browser**"):
+            yield {ChatEvent.STATUS: event}
+    environment = BrowserEnvironment()
+    await environment.start(width=1024, height=768)
+
+    # Start Operator Loop
+    try:
+        summarize_prompt = f"Use the results of our research to provide a comprehensive, self-contained answer for the target query:\n{query}."
+        task_completed = False
+        iterations = 0
+
+        with timer(f"Operating browser with {reasoning_model.model_type} {reasoning_model.name}", logger):
+            while iterations < max_iterations and not task_completed:
+                if cancellation_event and cancellation_event.is_set():
+                    logger.debug(f"Browser operator cancelled by client disconnect")
+                    break
+
+                iterations += 1
+
+                # 1. Get current environment state
+                browser_state = await environment.get_state()
+
+                # 2. Agent decides action(s)
+                agent_result = await operator_agent.act(browser_state)
+
+                # 3. Execute actions in the environment
+                env_steps: List[EnvStepResult] = []
+                for action in agent_result.actions:
+                    if cancellation_event and cancellation_event.is_set():
+                        logger.debug(f"Browser operator cancelled by client disconnect")
+                        break
+                    # Handle request for user action and break the loop
+                    if isinstance(action, RequestUserAction):
+                        user_input_message = action.request
+                        if send_status_func:
+                            async for event in send_status_func(f"**Requesting User Input**:\n{action.request}"):
+                                yield {ChatEvent.STATUS: event}
+                        break
+                    env_step = await environment.step(action)
+                    env_steps.append(env_step)
+
+                # Render status update
+                latest_screenshot = f"data:image/webp;base64,{env_steps[-1].screenshot_base64 if env_steps else browser_state.screenshot}"
+                render_payload = agent_result.rendered_response
+                render_payload["image"] = latest_screenshot
+                render_content = f"**Action**: {json.dumps(render_payload)}"
+                if send_status_func:
+                    async for event in send_status_func(f"**Operating Browser**:\n{render_content}"):
+                        yield {ChatEvent.STATUS: event}
+
+                # Check if termination conditions are met
+                task_completed = not agent_result.actions  # No actions requested by agent
+                trigger_iteration_limit = iterations == max_iterations
+                if user_input_message:
+                    logger.info(f"User input requested: {user_input_message}")
+                    break
+                if task_completed or trigger_iteration_limit:
+                    # Summarize results of operator run on last iteration
+                    operator_agent.add_action_results(env_steps, agent_result)
+                    summary_message = await operator_agent.summarize(summarize_prompt, browser_state)
+                    logger.info(f"Task completed: {task_completed}, Iteration limit: {trigger_iteration_limit}")
+                    break
+
+                # 4. Update agent on the results of its action on the environment
+                operator_agent.add_action_results(env_steps, agent_result)
+
+        # Determine final response message
+        if user_input_message:
+            response = user_input_message
+        elif task_completed:
+            response = summary_message
+        else:  # Hit iteration limit
+            response = f"Operator hit iteration limit ({max_iterations}). If the results seem incomplete try again, assign a smaller task or try a different approach.\nThese were the results till now:\n{summary_message}"
+    finally:
+        if environment and not user_input_message:  # Don't close browser if user input required
+            await environment.close()
+        if operator_agent:
+            operator_agent.reset()
+
+    yield {
+        "query": query,
+        "result": user_input_message or response,
+        "webpages": [{"link": url, "snippet": ""} for url in environment.visited_urls],
+    }
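For orientation, `operate_browser` is an async generator: it yields intermediate `{ChatEvent.STATUS: ...}` events only when a `send_status_func` is supplied, and normally ends by yielding a final payload containing the query, the result, and the visited webpages. Below is a minimal consumer sketch; the `run_browser_task` helper and its argument values are illustrative assumptions, not the package's actual call site.

```python
import asyncio

from khoj.processor.operator.operate_browser import operate_browser
from khoj.routers.helpers import ChatEvent


async def run_browser_task(query, user, location):
    final_payload = None
    # Without a send_status_func, only the terminal payload dict is yielded.
    async for event in operate_browser(query, user, conversation_log={}, location_data=location):
        if isinstance(event, dict) and ChatEvent.STATUS in event:
            print(event[ChatEvent.STATUS])  # intermediate status updates, if enabled
        else:
            final_payload = event  # {"query": ..., "result": ..., "webpages": [...]}
    return final_payload


# Example (hypothetical user/location objects): asyncio.run(run_browser_task("find the docs page", user, location))
```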
khoj/processor/operator/operator_actions.py
@@ -0,0 +1,149 @@
+# --- Standardized Action Models ---
+from typing import List, Literal, Optional, Union
+
+from pydantic import BaseModel
+
+
+class Point(BaseModel):
+    x: float
+    y: float
+
+
+class BaseAction(BaseModel):
+    type: str
+
+
+class ClickAction(BaseAction):
+    type: Literal["click"] = "click"
+    x: float
+    y: float
+    button: Literal["left", "right", "middle", "wheel"] = "left"
+    modifiers: str = None
+
+
+class DoubleClickAction(BaseAction):
+    type: Literal["double_click"] = "double_click"
+    x: float
+    y: float
+
+
+class TripleClickAction(BaseAction):
+    type: Literal["triple_click"] = "triple_click"
+    x: float
+    y: float
+
+
+class ScrollAction(BaseAction):
+    type: Literal["scroll"] = "scroll"
+    x: Optional[float] = None
+    y: Optional[float] = None
+    scroll_x: Optional[float] = None
+    scroll_y: Optional[float] = None
+    scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None
+    scroll_amount: Optional[float] = 2.0
+
+
+class KeypressAction(BaseAction):
+    type: Literal["keypress"] = "keypress"
+    keys: List[str]  # Standardized on list of keys
+
+
+class TypeAction(BaseAction):
+    type: Literal["type"] = "type"
+    text: str
+
+
+class WaitAction(BaseAction):
+    type: Literal["wait"] = "wait"
+    duration: float = 1.0
+
+
+class ScreenshotAction(BaseAction):
+    type: Literal["screenshot"] = "screenshot"
+
+
+class MoveAction(BaseAction):
+    type: Literal["move"] = "move"
+    x: float
+    y: float
+
+
+class DragAction(BaseAction):
+    type: Literal["drag"] = "drag"
+    path: List[Point]
+
+
+class MouseDownAction(BaseAction):
+    type: Literal["mouse_down"] = "mouse_down"
+    button: Literal["left", "right", "middle"] = "left"
+
+
+class MouseUpAction(BaseAction):
+    type: Literal["mouse_up"] = "mouse_up"
+    button: Literal["left", "right", "middle"] = "left"
+
+
+class HoldKeyAction(BaseAction):
+    type: Literal["hold_key"] = "hold_key"
+    text: str  # xdotool style key combination string
+    duration: float = 1.0
+
+
+class KeyUpAction(BaseAction):
+    type: Literal["key_up"] = "key_up"
+    key: str
+
+
+class KeyDownAction(BaseAction):
+    type: Literal["key_down"] = "key_down"
+    key: str
+
+
+class CursorPositionAction(BaseAction):
+    type: Literal["cursor_position"] = "cursor_position"
+
+
+class GotoAction(BaseAction):
+    type: Literal["goto"] = "goto"
+    url: str
+
+
+class BackAction(BaseAction):
+    type: Literal["back"] = "back"
+
+
+class RequestUserAction(BaseAction):
+    """Request user action to confirm or provide input."""
+
+    type: Literal["request_user"] = "request_user"
+    request: str
+
+
+class NoopAction(BaseAction):
+    """No operation action."""
+
+    type: Literal["noop"] = "noop"
+
+
+OperatorAction = Union[
+    ClickAction,
+    DoubleClickAction,
+    TripleClickAction,
+    ScrollAction,
+    KeypressAction,
+    TypeAction,
+    WaitAction,
+    ScreenshotAction,
+    MoveAction,
+    DragAction,
+    MouseDownAction,
+    MouseUpAction,
+    HoldKeyAction,
+    KeyDownAction,
+    KeyUpAction,
+    CursorPositionAction,
+    GotoAction,
+    BackAction,
+    RequestUserAction,
+    NoopAction,
+]
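These models form a tagged union keyed on each model's `type` literal. As a rough illustration of how such a union can be materialized from a plain dict, here is a sketch assuming pydantic v2 and its `TypeAdapter`; the package's own agents construct the action classes directly, so this wiring is an assumption for demonstration only.

```python
from pydantic import TypeAdapter

from khoj.processor.operator.operator_actions import ClickAction, OperatorAction

action_adapter = TypeAdapter(OperatorAction)

# The "type" literal on each model lets the union resolve to the matching class.
action = action_adapter.validate_python({"type": "click", "x": 120, "y": 48, "button": "left"})
assert isinstance(action, ClickAction)

# Serialize back to a plain dict, e.g. for logging or tracing.
print(action_adapter.dump_python(action))
```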
khoj/processor/operator/operator_agent_anthropic.py
@@ -0,0 +1,383 @@
+import ast
+import json
+import logging
+from copy import deepcopy
+from datetime import datetime
+from typing import Any, List, Optional, cast
+
+from anthropic.types.beta import BetaContentBlock
+
+from khoj.processor.operator.operator_actions import *
+from khoj.processor.operator.operator_agent_base import (
+    AgentActResult,
+    AgentMessage,
+    OperatorAgent,
+)
+from khoj.processor.operator.operator_environment_base import EnvState, EnvStepResult
+from khoj.utils.helpers import get_anthropic_async_client, is_none_or_empty
+
+logger = logging.getLogger(__name__)
+
+
+# --- Anthropic Operator Agent ---
+class AnthropicOperatorAgent(OperatorAgent):
+    async def act(self, current_state: EnvState) -> AgentActResult:
+        client = get_anthropic_async_client(
+            self.vision_model.ai_model_api.api_key, self.vision_model.ai_model_api.api_base_url
+        )
+        tool_version = "2025-01-24"
+        betas = [f"computer-use-{tool_version}", "token-efficient-tools-2025-02-19"]
+        temperature = 1.0
+        actions: List[OperatorAction] = []
+        action_results: List[dict] = []
+        self._commit_trace()  # Commit trace before next action
+
+        system_prompt = f"""<SYSTEM_CAPABILITY>
+* You are Khoj, a smart web browser operating assistant. You help the users accomplish tasks using a web browser.
+* You operate a Chromium browser using Playwright via the 'computer' tool.
+* You cannot access the OS or filesystem.
+* You can interact with the web browser to perform tasks like clicking, typing, scrolling, and more.
+* You can use the additional back() and goto() helper functions to ease navigating the browser. If you see nothing, try goto duckduckgo.com
+* When viewing a webpage it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
+* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
+* Perform web searches using DuckDuckGo. Don't use Google even if requested as the query will fail.
+* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
+* The current URL is {current_state.url}.
+</SYSTEM_CAPABILITY>
+
+<IMPORTANT>
+* You are allowed upto {self.max_iterations} iterations to complete the task.
+* Do not loop on wait, screenshot for too many turns without taking any action.
+* After initialization if the browser is blank, enter a website URL using the goto() function instead of waiting
+</IMPORTANT>
+"""
+        if is_none_or_empty(self.messages):
+            self.messages = [AgentMessage(role="user", content=self.query)]
+
+        tools = [
+            {
+                "type": f"computer_20250124",
+                "name": "computer",
+                "display_width_px": 1024,
+                "display_height_px": 768,
+            },  # TODO: Get from env
+            {
+                "name": "back",
+                "description": "Go back to the previous page.",
+                "input_schema": {"type": "object", "properties": {}},
+            },
+            {
+                "name": "goto",
+                "description": "Go to a specific URL.",
+                "input_schema": {
+                    "type": "object",
+                    "properties": {"url": {"type": "string", "description": "Fully qualified URL to navigate to."}},
+                    "required": ["url"],
+                },
+            },
+        ]
+
+        thinking: dict[str, str | int] = {"type": "disabled"}
+        if self.vision_model.name.startswith("claude-3-7"):
+            thinking = {"type": "enabled", "budget_tokens": 1024}
+
+        messages_for_api = self._format_message_for_api(self.messages)
+        response = await client.beta.messages.create(
+            messages=messages_for_api,
+            model=self.vision_model.name,
+            system=system_prompt,
+            tools=tools,
+            betas=betas,
+            thinking=thinking,
+            max_tokens=4096,  # TODO: Make configurable?
+            temperature=temperature,
+        )
+
+        logger.debug(f"Anthropic response: {response.model_dump_json()}")
+        self.messages.append(AgentMessage(role="assistant", content=response.content))
+        rendered_response = self._render_response(response.content, current_state.screenshot)
+
+        for block in response.content:
+            if block.type == "tool_use":
+                content = None
+                is_error = False
+
+                action_to_run: Optional[OperatorAction] = None
+                tool_input = block.input
+                tool_name = block.input.get("action") if block.name == "computer" else block.name
+                tool_use_id = block.id
+
+                try:
+                    if tool_name == "mouse_move":
+                        coord = self.get_coordinates(tool_input)
+                        if coord:
+                            action_to_run = MoveAction(x=coord[0], y=coord[1])
+                    elif tool_name == "left_click":
+                        coord = self.get_coordinates(tool_input)
+                        if coord:
+                            action_to_run = ClickAction(
+                                x=coord[0], y=coord[1], button="left", modifier=tool_input.get("text")
+                            )
+                    elif tool_name == "right_click":
+                        coord = self.get_coordinates(tool_input)
+                        if coord:
+                            action_to_run = ClickAction(x=coord[0], y=coord[1], button="right")
+                    elif tool_name == "middle_click":
+                        coord = self.get_coordinates(tool_input)
+                        if coord:
+                            action_to_run = ClickAction(x=coord[0], y=coord[1], button="middle")
+                    elif tool_name == "double_click":
+                        coord = self.get_coordinates(tool_input)
+                        if coord:
+                            action_to_run = DoubleClickAction(x=coord[0], y=coord[1])
+                    elif tool_name == "triple_click":
+                        coord = self.get_coordinates(tool_input)
+                        if coord:
+                            action_to_run = TripleClickAction(x=coord[0], y=coord[1])
+                    elif tool_name == "left_click_drag":
+                        start_coord = self.get_coordinates(tool_input, key="start_coordinate")
+                        end_coord = self.get_coordinates(tool_input)
+                        if start_coord and end_coord:
+                            action_to_run = DragAction(path=[Point(x=p[0], y=p[1]) for p in [start_coord, end_coord]])
+                    elif tool_name == "left_mouse_down":
+                        action_to_run = MouseDownAction(button="left")
+                    elif tool_name == "left_mouse_up":
+                        action_to_run = MouseUpAction(button="left")
+                    elif tool_name == "type":
+                        text: str = tool_input.get("text")
+                        if text:
+                            action_to_run = TypeAction(text=text)
+                    elif tool_name == "scroll":
+                        direction = tool_input.get("scroll_direction")
+                        amount = int(tool_input.get("scroll_amount", 5))
+                        coord = self.get_coordinates(tool_input)
+                        x = coord[0] if coord else None
+                        y = coord[1] if coord else None
+                        if direction:
+                            action_to_run = ScrollAction(scroll_direction=direction, scroll_amount=amount, x=x, y=y)
+                    elif tool_name == "key":
+                        text = tool_input.get("text")
+                        if text:
+                            action_to_run = KeypressAction(keys=text.split("+"))  # Split xdotool style
+                    elif tool_name == "hold_key":
+                        text = tool_input.get("text")
+                        duration = tool_input.get("duration", 1.0)
+                        if text:
+                            action_to_run = HoldKeyAction(text=text, duration=duration)
+                    elif tool_name == "wait":
+                        duration = tool_input.get("duration", 1.0)
+                        action_to_run = WaitAction(duration=duration)
+                    elif tool_name == "screenshot":
+                        action_to_run = ScreenshotAction()
+                    elif tool_name == "cursor_position":
+                        action_to_run = CursorPositionAction()
+                    elif tool_name == "goto":
+                        url = tool_input.get("url")
+                        if url:
+                            action_to_run = GotoAction(url=url)
+                        else:
+                            logger.warning("Goto tool called without URL.")
+                    elif tool_name == "back":
+                        action_to_run = BackAction()
+                    else:
+                        logger.warning(f"Unsupported Anthropic computer action type: {tool_name}")
+
+                except Exception as e:
+                    error_msg = f"Error converting Anthropic action {tool_name} ({tool_input}): {e}"
+                    logger.error(error_msg)
+                    content = error_msg
+                    is_error = True
+                    action_to_run = NoopAction()
+
+                if action_to_run:
+                    actions.append(action_to_run)
+                    action_results.append(
+                        {
+                            "type": "tool_result",
+                            "tool_use_id": tool_use_id,
+                            "content": content,  # Updated after environment step
+                            "is_error": is_error,  # Updated after environment step
+                        }
+                    )
+
+        self._update_usage(
+            response.usage.input_tokens,
+            response.usage.output_tokens,
+            response.usage.cache_read_input_tokens,
+            response.usage.cache_creation_input_tokens,
+        )
+        self.tracer["temperature"] = temperature
+
+        return AgentActResult(
+            actions=actions,
+            action_results=action_results,
+            rendered_response=rendered_response,
+        )
+
+    def add_action_results(self, env_steps: list[EnvStepResult], agent_action: AgentActResult):
+        if not agent_action.action_results:
+            return
+
+        # Update action results with results of applying suggested actions on the environment
+        for idx, env_step in enumerate(env_steps):
+            action_result = agent_action.action_results[idx]
+            result_content = env_step.error or env_step.output or "[Action completed]"
+            if env_step.type == "image" and isinstance(result_content, dict):
+                # Add screenshot data in anthropic message format
+                action_result["content"] = [
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/webp",
+                            "data": result_content["image"],
+                        },
+                    }
+                ]
+            else:
+                # Add text data
+                action_result["content"] = result_content
+                if env_step.error:
+                    action_result["is_error"] = True
+
+        # Append tool results to the message history
+        self.messages += [AgentMessage(role="environment", content=agent_action.action_results)]
+
+        # Mark the final tool result as a cache break point
+        agent_action.action_results[-1]["cache_control"] = {"type": "ephemeral"}
+        # Remove previous cache controls
+        for msg in self.messages:
+            if msg.role == "environment" and isinstance(msg.content, list):
+                for block in msg.content:
+                    if isinstance(block, dict) and "cache_control" in block:
+                        del block["cache_control"]
+
+    def _format_message_for_api(self, messages: list[AgentMessage]) -> list[dict]:
+        """Format Anthropic response into a single string."""
+        formatted_messages = []
+        for message in messages:
+            role = "user" if message.role == "environment" else message.role
+            content = (
+                [{"type": "text", "text": message.content}]
+                if not isinstance(message.content, list)
+                else message.content
+            )
+            formatted_messages.append(
+                {
+                    "role": role,
+                    "content": content,
+                }
+            )
+        return formatted_messages
+
+    def compile_response(self, response_content: list[BetaContentBlock | dict] | str) -> str:
+        """Compile Anthropic response into a single string."""
+        if isinstance(response_content, str):
+            return response_content
+        elif is_none_or_empty(response_content):
+            return ""
+        # action results are a list dictionaries,
+        # beta content blocks are objects with a type attribute
+        elif isinstance(response_content[0], dict):
+            return json.dumps(response_content)
+
+        compiled_response = [""]
+        for block in deepcopy(response_content):
+            block = cast(BetaContentBlock, block)  # Ensure block is of type BetaContentBlock
+            if block.type == "text":
+                compiled_response.append(block.text)
+            elif block.type == "tool_use":
+                block_input = {"action": block.name}
+                if block.name == "computer":
+                    block_input = block.input  # Computer action details are in input dict
+                elif block.name == "goto":
+                    block_input["url"] = block.input.get("url", "[Missing URL]")
+
+                # Avoid showing large image data in compiled text log
+                if isinstance(block_input, dict) and block_input.get("action") == "screenshot":
+                    block_input["image"] = "[placeholder for screenshot]"
+                    compiled_response.append(f"**Action**: {json.dumps(block_input)}")
+                else:
+                    compiled_response.append(f"**Action**: {json.dumps(block_input)}")
+            elif block.type == "thinking":
+                # Check if thinking content exists before appending
+                thinking_content = getattr(block, "thinking", None)
+                if thinking_content:
+                    compiled_response.append(f"**Thought**: {thinking_content}")
+
+        return "\n- ".join(filter(None, compiled_response))  # Filter out empty strings
+
+    def _render_response(self, response_content: list[BetaContentBlock], screenshot: str | None) -> dict:
+        """Render Anthropic response, potentially including actual screenshots."""
+        render_texts = []
+        for block in deepcopy(response_content):  # Use deepcopy to avoid modifying original
+            if block.type == "thinking":
+                thinking_content = getattr(block, "thinking", None)
+                if thinking_content:
+                    render_texts += [f"**Thought**: {thinking_content}"]
+            elif block.type == "text":
+                render_texts += [block.text]
+            elif block.type == "tool_use":
+                if block.name == "goto":
+                    render_texts += [f"Open URL: {block.input.get('url', '[Missing URL]')}"]
+                elif block.name == "back":
+                    render_texts += ["Go back to the previous page."]
+                elif block.name == "computer":
+                    block_input = block.input
+                    if not isinstance(block_input, dict):
+                        render_texts += [json.dumps(block_input)]
+                    # Handle computer action details
+                    elif "action" in block_input:
+                        action = block_input["action"]
+                        if action == "type":
+                            text: str = block_input.get("text")
+                            if text:
+                                render_texts += [f'Type "{text}"']
+                        elif action == "key":
+                            text = block_input.get("text")
+                            if text:
+                                render_texts += [f"Press {text}"]
+                        elif action == "hold_key":
+                            text = block_input.get("text")
+                            duration = block_input.get("duration", 1.0)
+                            if text:
+                                render_texts += [f"Hold {text} for {duration} seconds"]
+                        else:
+                            # Handle other actions
+                            render_texts += [f"{action.capitalize()}"]
+
+                # If screenshot is not available when screenshot action was requested
+                if isinstance(block.input, dict) and block.input.get("action") == "screenshot" and not screenshot:
+                    render_texts += ["Failed to get screenshot"]
+
+        # Do not show screenshot if no actions requested
+        if all([block.type != "tool_use" for block in response_content]):
+            # If all blocks are not tool_use, return None
+            screenshot = None
+
+        # Create render payload
+        render_payload = {
+            # Combine text into a single string and filter out empty strings
+            "text": "\n- ".join(filter(None, render_texts)),
+            # Add screenshot data if available
+            "image": f"data:image/webp;base64,{screenshot}" if screenshot else None,
+        }
+
+        return render_payload
+
+    def get_coordinates(self, tool_input: dict, key: str = "coordinate") -> Optional[list | tuple]:
+        """Get coordinates from tool input."""
+        raw_coord = tool_input.get(key)
+        if not raw_coord:
+            return None
+        try:
+            coord = ast.literal_eval(raw_coord) if isinstance(raw_coord, str) else raw_coord
+        except (ValueError, SyntaxError):
+            logger.warning(f"Could not parse coordinate from value: {raw_coord}")
+            return None
+
+        if not isinstance(coord, (list, tuple)) or not len(coord) == 2:
+            logger.warning(f"Parsed coordinate string '{raw_coord}' is not a 2-element list/tuple: {coord}")
+            return None
+
+        return coord
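To make the coordinate handling above concrete, here is a small standalone sketch of the same parsing approach. It is a re-implementation for illustration (the hypothetical `parse_coordinate` helper is not part of the package), showing how both string-encoded and native coordinate values from a tool call resolve to an (x, y) pair.

```python
import ast
from typing import Optional


def parse_coordinate(tool_input: dict, key: str = "coordinate") -> Optional[tuple]:
    """Illustrative mirror of get_coordinates: accept "[x, y]" strings or [x, y] lists."""
    raw = tool_input.get(key)
    if not raw:
        return None
    try:
        # Strings like "[100, 200]" are safely evaluated; lists/tuples pass through.
        coord = ast.literal_eval(raw) if isinstance(raw, str) else raw
    except (ValueError, SyntaxError):
        return None
    if not isinstance(coord, (list, tuple)) or len(coord) != 2:
        return None
    return tuple(coord)


assert parse_coordinate({"coordinate": "[100, 200]"}) == (100, 200)
assert parse_coordinate({"coordinate": [32, 64]}) == (32, 64)
assert parse_coordinate({"coordinate": "not-a-pair"}) is None
```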