notte-agent 0.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notte_agent/README.md +58 -0
- notte_agent/__init__.py +7 -0
- notte_agent/common/__init__.py +0 -0
- notte_agent/common/base.py +14 -0
- notte_agent/common/captcha_detector.py +87 -0
- notte_agent/common/config.py +219 -0
- notte_agent/common/conversation.py +246 -0
- notte_agent/common/notifier.py +55 -0
- notte_agent/common/parser.py +78 -0
- notte_agent/common/perception.py +21 -0
- notte_agent/common/prompt.py +15 -0
- notte_agent/common/safe_executor.py +100 -0
- notte_agent/common/trajectory_history.py +100 -0
- notte_agent/common/types.py +41 -0
- notte_agent/common/validator.py +90 -0
- notte_agent/falco/__init__.py +0 -0
- notte_agent/falco/agent.py +343 -0
- notte_agent/falco/perception.py +83 -0
- notte_agent/falco/prompt.py +132 -0
- notte_agent/falco/prompts/system_prompt_multi_actions.md +107 -0
- notte_agent/falco/prompts/system_prompt_single_action.md +107 -0
- notte_agent/falco/trajectory_history.py +42 -0
- notte_agent/falco/types.py +132 -0
- notte_agent/gufo/__init__.py +0 -0
- notte_agent/gufo/agent.py +180 -0
- notte_agent/gufo/parser.py +79 -0
- notte_agent/gufo/perception.py +53 -0
- notte_agent/gufo/prompt.py +61 -0
- notte_agent/gufo/system.md +8 -0
- notte_agent/main.py +77 -0
- notte_agent/py.typed +0 -0
- notte_agent-0.0.dev0.dist-info/METADATA +8 -0
- notte_agent-0.0.dev0.dist-info/RECORD +34 -0
- notte_agent-0.0.dev0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import traceback
|
|
3
|
+
import typing
|
|
4
|
+
from collections.abc import Callable
|
|
5
|
+
from enum import StrEnum
|
|
6
|
+
|
|
7
|
+
import notte_core
|
|
8
|
+
from litellm import AllMessageValues, override
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from notte_browser.dom.locate import locate_element
|
|
11
|
+
from notte_browser.resolution import NodeResolutionPipe
|
|
12
|
+
from notte_browser.session import NotteSession, NotteSessionConfig
|
|
13
|
+
from notte_browser.vault import VaultScreetsScreenshotMask
|
|
14
|
+
from notte_browser.window import BrowserWindow
|
|
15
|
+
from notte_core.browser.observation import Observation
|
|
16
|
+
from notte_core.common.tracer import LlmUsageDictTracer
|
|
17
|
+
from notte_core.controller.actions import (
|
|
18
|
+
BaseAction,
|
|
19
|
+
CompletionAction,
|
|
20
|
+
FallbackObserveAction,
|
|
21
|
+
InteractionAction,
|
|
22
|
+
)
|
|
23
|
+
from notte_core.credentials.base import BaseVault, LocatorAttributes
|
|
24
|
+
from notte_core.llms.engine import LLMEngine
|
|
25
|
+
from patchright.async_api import Locator
|
|
26
|
+
|
|
27
|
+
from notte_agent.common.base import BaseAgent
|
|
28
|
+
from notte_agent.common.captcha_detector import CaptchaDetector
|
|
29
|
+
from notte_agent.common.config import AgentConfig, RaiseCondition
|
|
30
|
+
from notte_agent.common.conversation import Conversation
|
|
31
|
+
from notte_agent.common.safe_executor import ExecutionStatus, SafeActionExecutor
|
|
32
|
+
from notte_agent.common.types import AgentResponse
|
|
33
|
+
from notte_agent.common.validator import CompletionValidator
|
|
34
|
+
from notte_agent.falco.perception import FalcoPerception
|
|
35
|
+
from notte_agent.falco.prompt import FalcoPrompt
|
|
36
|
+
from notte_agent.falco.trajectory_history import FalcoTrajectoryHistory
|
|
37
|
+
from notte_agent.falco.types import StepAgentOutput
|
|
38
|
+
|
|
39
|
+
# TODO: list
|
|
40
|
+
# handle tooling calling methods for different providers (if not supported by litellm)
|
|
41
|
+
# Handle control flags
|
|
42
|
+
# Done callback
|
|
43
|
+
# Setup telemetry
|
|
44
|
+
# Setup memory
|
|
45
|
+
# Handle custom functions, e.g. `Upload file to element`
|
|
46
|
+
# Remove base 64 images from current state
|
|
47
|
+
# TODO: add fault tolerance LLM parsing
|
|
48
|
+
# TODO: only display modal actions when modal is open (same as before)
|
|
49
|
+
# TODO: handle prevent default click JS events
|
|
50
|
+
# TODO: add some tree structure for menu elements (like we had in notte before. Ex. Menu in Arxiv)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class HistoryType(StrEnum):
|
|
54
|
+
FULL_CONVERSATION = "full_conversation"
|
|
55
|
+
SHORT_OBSERVATIONS = "short_observations"
|
|
56
|
+
SHORT_OBSERVATIONS_WITH_RAW_DATA = "short_observations_with_raw_data"
|
|
57
|
+
SHORT_OBSERVATIONS_WITH_SHORT_DATA = "short_observations_with_short_data"
|
|
58
|
+
COMPRESSED = "compressed"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class FalcoAgentConfig(AgentConfig):
    """Configuration for ``FalcoAgent``, extending the shared ``AgentConfig``."""

    # Upper bound passed to ``StepAgentOutput.get_actions`` for one LLM response.
    max_actions_per_step: int = 1
    # Strategy used to replay earlier steps into the conversation.
    history_type: HistoryType = HistoryType.SHORT_OBSERVATIONS_WITH_SHORT_DATA

    @classmethod
    @override
    def default_session(cls) -> NotteSessionConfig:
        """Return a fresh session config with ``disable_perception()`` applied."""
        session_config = NotteSessionConfig()
        return session_config.disable_perception()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class FalcoAgent(BaseAgent):
|
|
72
|
+
def __init__(
|
|
73
|
+
self,
|
|
74
|
+
config: FalcoAgentConfig,
|
|
75
|
+
window: BrowserWindow | None = None,
|
|
76
|
+
vault: BaseVault | None = None,
|
|
77
|
+
step_callback: Callable[[str, StepAgentOutput], None] | None = None,
|
|
78
|
+
):
|
|
79
|
+
super().__init__(session=NotteSession(config=config.session, window=window))
|
|
80
|
+
self.config: FalcoAgentConfig = config
|
|
81
|
+
self.vault: BaseVault | None = vault
|
|
82
|
+
|
|
83
|
+
self.tracer: LlmUsageDictTracer = LlmUsageDictTracer()
|
|
84
|
+
self.llm: LLMEngine = LLMEngine(
|
|
85
|
+
model=config.reasoning_model,
|
|
86
|
+
tracer=self.tracer,
|
|
87
|
+
structured_output_retries=config.session.structured_output_retries,
|
|
88
|
+
verbose=self.config.verbose,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
self.step_callback: Callable[[str, StepAgentOutput], None] | None = step_callback
|
|
92
|
+
# Users should implement their own parser to customize how observations
|
|
93
|
+
# and actions are formatted for their specific LLM and use case
|
|
94
|
+
|
|
95
|
+
if self.vault is not None:
|
|
96
|
+
# hide vault leaked credentials within llm inputs
|
|
97
|
+
self.llm.structured_completion = self.vault.patch_structured_completion(0, self.vault.get_replacement_map)(
|
|
98
|
+
self.llm.structured_completion
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
self.perception: FalcoPerception = FalcoPerception()
|
|
102
|
+
self.validator: CompletionValidator = CompletionValidator(llm=self.llm, perception=self.perception)
|
|
103
|
+
self.captcha_detector: CaptchaDetector = CaptchaDetector(llm=self.llm, perception=self.perception)
|
|
104
|
+
self.prompt: FalcoPrompt = FalcoPrompt(max_actions_per_step=config.max_actions_per_step)
|
|
105
|
+
self.conv: Conversation = Conversation(
|
|
106
|
+
max_tokens=config.max_history_tokens,
|
|
107
|
+
convert_tools_to_assistant=True,
|
|
108
|
+
autosize=True,
|
|
109
|
+
model=config.reasoning_model,
|
|
110
|
+
)
|
|
111
|
+
self.history_type: HistoryType = config.history_type
|
|
112
|
+
self.trajectory: FalcoTrajectoryHistory = FalcoTrajectoryHistory(max_error_length=config.max_error_length)
|
|
113
|
+
|
|
114
|
+
async def execute_action(action: BaseAction) -> Observation:
|
|
115
|
+
if self.vault is not None and self.vault.contains_credentials(action):
|
|
116
|
+
action_with_selector = await NodeResolutionPipe.forward(action, self.session.snapshot)
|
|
117
|
+
if isinstance(action_with_selector, InteractionAction) and action_with_selector.selector is not None:
|
|
118
|
+
locator: Locator = await locate_element(self.session.window.page, action_with_selector.selector)
|
|
119
|
+
assert (
|
|
120
|
+
isinstance(action_with_selector, InteractionAction)
|
|
121
|
+
and action_with_selector.selector is not None
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
attrs = await FalcoAgent.compute_locator_attributes(locator)
|
|
125
|
+
action = self.vault.replace_credentials(
|
|
126
|
+
action,
|
|
127
|
+
attrs,
|
|
128
|
+
self.session.snapshot,
|
|
129
|
+
)
|
|
130
|
+
return await self.session.act(action)
|
|
131
|
+
|
|
132
|
+
self.step_executor: SafeActionExecutor[BaseAction, Observation] = SafeActionExecutor(
|
|
133
|
+
func=execute_action,
|
|
134
|
+
raise_on_failure=(self.config.raise_condition is RaiseCondition.IMMEDIATELY),
|
|
135
|
+
max_consecutive_failures=config.max_consecutive_failures,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
@staticmethod
|
|
139
|
+
async def compute_locator_attributes(locator: Locator) -> LocatorAttributes:
|
|
140
|
+
attr_type = await locator.get_attribute("type")
|
|
141
|
+
autocomplete = await locator.get_attribute("autocomplete")
|
|
142
|
+
outer_html = await locator.evaluate("el => el.outerHTML")
|
|
143
|
+
return LocatorAttributes(type=attr_type, autocomplete=autocomplete, outerHTML=outer_html)
|
|
144
|
+
|
|
145
|
+
async def reset(self) -> None:
|
|
146
|
+
self.conv.reset()
|
|
147
|
+
self.trajectory.reset()
|
|
148
|
+
self.step_executor.reset()
|
|
149
|
+
await self.session.reset()
|
|
150
|
+
|
|
151
|
+
def output(self, answer: str, success: bool) -> AgentResponse:
|
|
152
|
+
return AgentResponse(
|
|
153
|
+
answer=answer,
|
|
154
|
+
success=success,
|
|
155
|
+
session_trajectory=self.session.trajectory,
|
|
156
|
+
agent_trajectory=self.trajectory.steps, # type: ignore[reportArgumentType]
|
|
157
|
+
messages=self.conv.messages(),
|
|
158
|
+
duration_in_s=time.time() - self.start_time,
|
|
159
|
+
llm_usage=self.tracer.usage,
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
async def get_messages(self, task: str) -> list[AllMessageValues]:
|
|
163
|
+
self.conv.reset()
|
|
164
|
+
system_msg, task_msg = self.prompt.system(), self.prompt.task(task)
|
|
165
|
+
if self.vault is not None:
|
|
166
|
+
system_msg += "\n" + self.vault.instructions()
|
|
167
|
+
self.conv.add_system_message(content=system_msg)
|
|
168
|
+
self.conv.add_user_message(content=task_msg)
|
|
169
|
+
# just for logging
|
|
170
|
+
traj_msg = self.trajectory.perceive()
|
|
171
|
+
if self.config.verbose:
|
|
172
|
+
logger.info(f"🔍 Trajectory history:\n{traj_msg}")
|
|
173
|
+
# add trajectory to the conversation
|
|
174
|
+
match self.history_type:
|
|
175
|
+
case HistoryType.COMPRESSED:
|
|
176
|
+
self.conv.add_user_message(content=traj_msg)
|
|
177
|
+
case _:
|
|
178
|
+
if len(self.trajectory.steps) == 0:
|
|
179
|
+
self.conv.add_user_message(content=self.trajectory.start_rules())
|
|
180
|
+
for step in self.trajectory.steps:
|
|
181
|
+
# TODO: choose if we want this to be an assistant message or a tool message
|
|
182
|
+
# self.conv.add_tool_message(step.agent_response, tool_id="step")
|
|
183
|
+
self.conv.add_assistant_message(step.agent_response.model_dump_json(exclude_none=True))
|
|
184
|
+
for result in step.results:
|
|
185
|
+
short_step_msg = self.trajectory.perceive_step_result(result, include_ids=True)
|
|
186
|
+
self.conv.add_user_message(content=short_step_msg)
|
|
187
|
+
if not result.success:
|
|
188
|
+
continue
|
|
189
|
+
# add observation data to the conversation
|
|
190
|
+
obs = result.get()
|
|
191
|
+
match (self.history_type, obs.has_data()):
|
|
192
|
+
case (HistoryType.FULL_CONVERSATION, _):
|
|
193
|
+
self.conv.add_user_message(
|
|
194
|
+
content=self.perception.perceive(obs),
|
|
195
|
+
image=(obs.screenshot if self.config.include_screenshot else None),
|
|
196
|
+
)
|
|
197
|
+
case (HistoryType.SHORT_OBSERVATIONS_WITH_RAW_DATA, True):
|
|
198
|
+
# add data if data was scraped
|
|
199
|
+
self.conv.add_user_message(content=self.perception.perceive_data(obs, raw=True))
|
|
200
|
+
|
|
201
|
+
case (HistoryType.SHORT_OBSERVATIONS_WITH_SHORT_DATA, True):
|
|
202
|
+
self.conv.add_user_message(content=self.perception.perceive_data(obs, raw=False))
|
|
203
|
+
case _:
|
|
204
|
+
pass
|
|
205
|
+
|
|
206
|
+
last_valid_obs = self.trajectory.last_obs()
|
|
207
|
+
if last_valid_obs is not None and self.history_type is not HistoryType.FULL_CONVERSATION:
|
|
208
|
+
self.conv.add_user_message(
|
|
209
|
+
content=self.perception.perceive(last_valid_obs),
|
|
210
|
+
image=(last_valid_obs.screenshot if self.config.include_screenshot else None),
|
|
211
|
+
)
|
|
212
|
+
|
|
213
|
+
if len(self.trajectory.steps) > 0:
|
|
214
|
+
self.conv.add_user_message(self.prompt.action_message())
|
|
215
|
+
|
|
216
|
+
return self.conv.messages()
|
|
217
|
+
|
|
218
|
+
async def step(self, task: str) -> CompletionAction | None:
|
|
219
|
+
"""Execute a single step of the agent"""
|
|
220
|
+
messages = await self.get_messages(task)
|
|
221
|
+
response: StepAgentOutput = self.llm.structured_completion(messages, response_format=StepAgentOutput)
|
|
222
|
+
if self.step_callback is not None:
|
|
223
|
+
self.step_callback(task, response)
|
|
224
|
+
|
|
225
|
+
if self.config.verbose:
|
|
226
|
+
logger.info(f"🔍 LLM response:\n{response}")
|
|
227
|
+
|
|
228
|
+
for line in response.pretty_string().split("\n"):
|
|
229
|
+
logger.opt(colors=True).info(line)
|
|
230
|
+
|
|
231
|
+
self.trajectory.add_output(response)
|
|
232
|
+
# check for completion
|
|
233
|
+
if response.output is not None:
|
|
234
|
+
return response.output
|
|
235
|
+
# Execute the actions
|
|
236
|
+
for action in response.get_actions(self.config.max_actions_per_step):
|
|
237
|
+
result = await self.step_executor.execute(action)
|
|
238
|
+
|
|
239
|
+
self.trajectory.add_step(result)
|
|
240
|
+
step_msg = self.trajectory.perceive_step_result(result, include_ids=True)
|
|
241
|
+
logger.info(f"{step_msg}\n\n")
|
|
242
|
+
if not result.success:
|
|
243
|
+
# observe again
|
|
244
|
+
obs = await self.session.observe()
|
|
245
|
+
|
|
246
|
+
# cast is necessary because we cant have covariance
|
|
247
|
+
# in ExecutionStatus
|
|
248
|
+
ex_status = ExecutionStatus(
|
|
249
|
+
input=typing.cast(BaseAction, FallbackObserveAction()),
|
|
250
|
+
output=obs,
|
|
251
|
+
success=True,
|
|
252
|
+
message="Observed",
|
|
253
|
+
)
|
|
254
|
+
self.trajectory.add_output(response)
|
|
255
|
+
self.trajectory.add_step(ex_status)
|
|
256
|
+
|
|
257
|
+
# stop the loop
|
|
258
|
+
break
|
|
259
|
+
# Successfully executed the action
|
|
260
|
+
return None
|
|
261
|
+
|
|
262
|
+
@override
|
|
263
|
+
async def run(self, task: str, url: str | None = None) -> AgentResponse:
|
|
264
|
+
logger.info(f"Running task: {task}")
|
|
265
|
+
self.start_time: float = time.time()
|
|
266
|
+
try:
|
|
267
|
+
return await self._run(task, url=url)
|
|
268
|
+
|
|
269
|
+
except Exception as e:
|
|
270
|
+
if self.config.raise_condition is RaiseCondition.NEVER:
|
|
271
|
+
return self.output(f"Failed due to {e}: {traceback.format_exc()}", False)
|
|
272
|
+
raise e
|
|
273
|
+
|
|
274
|
+
async def _human_in_the_loop(self) -> None:
|
|
275
|
+
# Check for captcha if human-in-the-loop is enabled
|
|
276
|
+
captcha_result = self.captcha_detector.detect(self.session.trajectory[-1])
|
|
277
|
+
if captcha_result.has_captcha:
|
|
278
|
+
logger.warning(f"⚠️ Captcha detected: {captcha_result.description}")
|
|
279
|
+
logger.info("🔄 Waiting for human intervention...")
|
|
280
|
+
_ = input("Press Enter to continue after solving the captcha...")
|
|
281
|
+
# Observe again after human intervention
|
|
282
|
+
obs = await self.session.observe()
|
|
283
|
+
self.trajectory.add_step(
|
|
284
|
+
ExecutionStatus(
|
|
285
|
+
input=typing.cast(BaseAction, FallbackObserveAction()),
|
|
286
|
+
output=obs,
|
|
287
|
+
success=True,
|
|
288
|
+
message="Observed after human intervention",
|
|
289
|
+
)
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
async def _run(self, task: str, url: str | None = None) -> AgentResponse:
|
|
293
|
+
"""Execute the task with maximum number of steps"""
|
|
294
|
+
# change this to DEV if you want more explicit error messages
|
|
295
|
+
# when you are developing your own agent
|
|
296
|
+
notte_core.set_error_mode("agent")
|
|
297
|
+
if url is not None:
|
|
298
|
+
task = f"Start on '{url}' and {task}"
|
|
299
|
+
|
|
300
|
+
# Loop through the steps
|
|
301
|
+
async with self.session:
|
|
302
|
+
# hide vault leaked credentials within screenshots
|
|
303
|
+
if self.vault is not None:
|
|
304
|
+
self.session.window.screenshot_mask = VaultScreetsScreenshotMask(vault=self.vault)
|
|
305
|
+
|
|
306
|
+
for step in range(self.session.config.max_steps):
|
|
307
|
+
logger.info(f"💡 Step {step}")
|
|
308
|
+
output: CompletionAction | None = await self.step(task)
|
|
309
|
+
# Check for captcha if human-in-the-loop is enabled
|
|
310
|
+
if self.config.human_in_the_loop:
|
|
311
|
+
await self._human_in_the_loop()
|
|
312
|
+
if output is None:
|
|
313
|
+
continue
|
|
314
|
+
# validate the output
|
|
315
|
+
if not output.success:
|
|
316
|
+
logger.error(f"🚨 Agent terminated early with failure: {output.answer}")
|
|
317
|
+
return self.output(output.answer, False)
|
|
318
|
+
# Sucessful execution and LLM output is not None
|
|
319
|
+
# Need to validate the output
|
|
320
|
+
logger.info(f"🔥 Validating agent output:\n{output.model_dump_json()}")
|
|
321
|
+
val = self.validator.validate(task, output, self.session.trajectory[-1])
|
|
322
|
+
if val.is_valid:
|
|
323
|
+
# Successfully validated the output
|
|
324
|
+
logger.info("✅ Task completed successfully")
|
|
325
|
+
return self.output(output.answer, output.success)
|
|
326
|
+
else:
|
|
327
|
+
# TODO handle that differently
|
|
328
|
+
failed_val_msg = f"Final validation failed: {val.reason}. Continuing..."
|
|
329
|
+
logger.error(failed_val_msg)
|
|
330
|
+
# add the validation result to the trajectory and continue
|
|
331
|
+
self.trajectory.add_step(
|
|
332
|
+
ExecutionStatus(
|
|
333
|
+
input=output,
|
|
334
|
+
output=None,
|
|
335
|
+
success=False,
|
|
336
|
+
message=failed_val_msg,
|
|
337
|
+
)
|
|
338
|
+
)
|
|
339
|
+
|
|
340
|
+
error_msg = f"Failed to solve task in {self.session.config.max_steps} steps"
|
|
341
|
+
logger.info(f"🚨 {error_msg}")
|
|
342
|
+
notte_core.set_error_mode("developer")
|
|
343
|
+
return self.output(error_msg, False)
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
from typing import final
|
|
2
|
+
|
|
3
|
+
from notte_core.browser.observation import Observation
|
|
4
|
+
from typing_extensions import override
|
|
5
|
+
|
|
6
|
+
from notte_agent.common.perception import BasePerception
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@final
class FalcoPerception(BasePerception):
    """Renders an ``Observation`` into the text blocks sent to the LLM."""

    def __init__(
        self,
        include_step_info: bool = True,
        include_attributes: list[str] | None = None,
    ):
        """Store the rendering options.

        Args:
            include_step_info: Stored on the instance; not read by the methods of this class.
            include_attributes: Stored on the instance; not read by the methods of this class.
        """
        self.include_attributes = include_attributes
        self.include_step_info = include_step_info

    @override
    def perceive_metadata(self, obs: Observation) -> str:
        """Render url, title, timestamp, tabs and step progress of ``obs``.

        Raises:
            ValueError: If ``obs.progress`` is ``None``.
        """
        if obs.progress is None:
            raise ValueError("Observation has no progress")
        # The original text ended the "Current step" line with a stray apostrophe
        # that leaked into the prompt; it is dropped here.
        return f"""
You will see the following only once. If you need to remember it and you dont know it yet, write it down in the memory.

* Current url: {obs.metadata.url}
* Current page title: {obs.metadata.title}
* Current date and time: {obs.metadata.timestamp.strftime("%Y-%m-%d %H:%M:%S")}
* Available tabs:
{obs.metadata.tabs}
* Current step: {obs.progress.current_step + 1}/{obs.progress.max_steps}
"""

    @override
    def perceive(self, obs: Observation) -> str:
        """Render the full observation: metadata, interaction elements and scraped data."""
        return f"""
[Relevant metadata]
{self.perceive_metadata(obs)}

[Interaction elements and context]
{self.perceive_actions(obs)}

[Data found in the page]
{self.perceive_data(obs)}
"""

    @override
    def perceive_actions(self, obs: Observation) -> str:
        """Render the action-space description, with hints about content outside the viewport."""
        px_above = obs.metadata.viewport.pixels_above
        px_below = obs.metadata.viewport.pixels_below

        more_above = f"... {px_above} pixels above - scroll or scrape content to see more ..."
        more_below = f"... {px_below} pixels below - scroll or scrape content to see more ..."

        space_description = obs.space.description

        # The scroll hints are only shown when there actually are pixels in that direction.
        return f"""
[Start of page]
{more_above if px_above > 0 else ""}
{space_description or "No content to display"}
{more_below if px_below > 0 else ""}
[End of page]

"""

    @override
    def perceive_data(self, obs: Observation, raw: bool = True) -> str:
        """Render the data scraped in ``obs``.

        Args:
            obs: Observation to render.
            raw: When true use ``obs.data.markdown``; otherwise use the JSON dump
                of ``obs.data.structured.data``.

        Returns:
            An empty string when the observation has no data, a "Scraping failed"
            message when structured data is requested but missing or unsuccessful,
            otherwise the rendered data block.
        """
        if not obs.has_data() or obs.data is None:
            return ""
        if raw:
            perceived_data = obs.data.markdown
        else:
            structured_data = obs.data.structured
            if structured_data is None or not structured_data.success or structured_data.data is None:
                error_msg = f" with error: {structured_data.error}" if structured_data is not None else ""
                return f"Scraping failed{error_msg}. Please try again with different instructions."
            perceived_data = structured_data.data.model_dump_json()

        return f"""
Data scraped from current page view:

{perceived_data or "No valid data to display"}
"""
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
from enum import StrEnum
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import chevron
|
|
6
|
+
from notte_core.controller.actions import (
|
|
7
|
+
BaseAction,
|
|
8
|
+
ClickAction,
|
|
9
|
+
CompletionAction,
|
|
10
|
+
FallbackObserveAction,
|
|
11
|
+
FillAction,
|
|
12
|
+
GotoAction,
|
|
13
|
+
ScrapeAction,
|
|
14
|
+
)
|
|
15
|
+
from notte_core.controller.space import ActionSpace
|
|
16
|
+
|
|
17
|
+
system_prompt_dir = Path(__file__).parent / "prompts"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PromptType(StrEnum):
|
|
21
|
+
SINGLE_ACTION = "single_action"
|
|
22
|
+
MULTI_ACTION = "multi_action"
|
|
23
|
+
|
|
24
|
+
def prompt_file(self) -> Path:
|
|
25
|
+
match self:
|
|
26
|
+
case PromptType.SINGLE_ACTION:
|
|
27
|
+
return system_prompt_dir / "system_prompt_single_action.md"
|
|
28
|
+
case PromptType.MULTI_ACTION:
|
|
29
|
+
return system_prompt_dir / "system_prompt_multi_action.md"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class FalcoPrompt:
    """Builds the system, task and follow-up prompt strings used by the Falco agent."""

    def __init__(
        self,
        max_actions_per_step: int,
    ) -> None:
        """Load the template matching the action budget and create the action space.

        Args:
            max_actions_per_step: Values above 1 select the multi-action template,
                anything else the single-action template.
        """
        if max_actions_per_step > 1:
            selected_type = PromptType.MULTI_ACTION
        else:
            selected_type = PromptType.SINGLE_ACTION
        template_path = selected_type.prompt_file()
        self.system_prompt: str = template_path.read_text()
        self.max_actions_per_step: int = max_actions_per_step
        self.space: ActionSpace = ActionSpace(description="", exclude_actions={FallbackObserveAction})

    @staticmethod
    def _json_dump(steps: list[BaseAction]) -> str:
        """Render the actions as a bracketed list, one ``dump_str()`` per line."""
        dumped = [action.dump_str() for action in steps]
        body = ",\n ".join(dumped)
        return "[\n " + body + "\n]"

    def example_form_filling(self) -> str:
        """Example sequence: fill two inputs, then click a button."""
        sequence = [
            FillAction(id="I99", value="username"),
            FillAction(id="I101", value="password"),
            ClickAction(id="B1"),
        ]
        return self._json_dump(sequence)

    def example_invalid_sequence(self) -> str:
        """Example of a sequence the prompt presents as invalid (link clicks mixed with other clicks)."""
        sequence = [ClickAction(id="L1"), ClickAction(id="B4"), ClickAction(id="L2")]
        return self._json_dump(sequence)

    def example_navigation_and_extraction(self) -> str:
        """Example sequence: go to a URL, then scrape."""
        sequence = [GotoAction(url="https://www.google.com"), ScrapeAction()]
        return self._json_dump(sequence)

    def completion_example(self) -> str:
        """Example of a successful completion action."""
        sequence = [CompletionAction(success=True, answer="<answer to the task>")]
        return self._json_dump(sequence)

    def example_step(self) -> str:
        """Render the example response object shown to the LLM as the expected output format."""
        goal_eval = "".join(
            [
                "Analyze the current elements and the image to check if the previous goals/actions",
                " are successful like intended by the task. Ignore the action result. The website is the ground truth. ",
                "Also mention if something unexpected happened like new suggestions in an input field. ",
                "Shortly state why/why not",
            ]
        )
        # Mustache template; `{{goal_eval}}` is the only placeholder.
        template = """
{
"state": {
"page_summary": "On the page are company a,b,c wtih their revenue 1,2,3.",
"relevant_interactions": [{"id": "B2", "reason": "The button with id B2 represents search and I'm looking to search"}],
"previous_goal_status": "success|failure|unknown",
"previous_goal_eval": "{{goal_eval}}",
"memory": "Description of what has been done and what you need to remember until the end of the task",
"next_goal": "What needs to be done with the next actions"
},
"actions": [
{ "one_action_name": {
// action-specific parameter
...
}
}, // ... more actions in sequence ...
]
}
"""
        return chevron.render(template, {"goal_eval": goal_eval})

    def system(self) -> str:
        """Render the loaded system-prompt template with the current values and examples."""
        # Keys must match the placeholders of the template file, including the
        # "timstamp" spelling.
        context = {
            "timstamp": dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "max_actions_per_step": self.max_actions_per_step,
            "action_description": self.space.markdown(),
            "example_form_filling": self.example_form_filling(),
            "example_step": self.example_step(),
            "completion_example": self.completion_example(),
            "completion_action_name": CompletionAction.name(),
            "goto_action_name": GotoAction.name(),
            "example_navigation_and_extraction": self.example_navigation_and_extraction(),
            "example_invalid_sequence": self.example_invalid_sequence(),
        }
        return chevron.render(self.system_prompt, context)

    def task(self, task: str):
        """Return the user message that states the ultimate task."""
        message = f"""
Your ultimate task is: "{task}".
If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task.
If not, continue as usual.
"""
        return message

    def new_task(self, task: str) -> str:
        """Return the user message that replaces the ultimate task with a new one."""
        message = f"""
Your new ultimate task is: {task}.
Take the previous context into account and finish your new ultimate task.
"""
        return message

    def action_message(self) -> str:
        """Return the closing instruction that asks the LLM to reflect and pick its next action."""
        message = """Given the previous information, start by reflecting on your last action. Then, summarize the current page and list relevant available interactions.
Absolutely do not under any circumstance list or pay attention to any id that is not explicitly found in the page.
From there, select the your next goal, and in turn, your next action.
"""
        return message
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
You are a precise browser automation agent that interacts with websites through structured commands.
|
|
2
|
+
Your role is to:
|
|
3
|
+
1. Analyze the provided webpage elements and structure
|
|
4
|
+
2. Plan a sequence of actions to accomplish the given task
|
|
5
|
+
3. Respond with valid JSON containing your action sequence and state assessment
|
|
6
|
+
|
|
7
|
+
Current date and time: {{timstamp}}
|
|
8
|
+
|
|
9
|
+
INPUT STRUCTURE:
|
|
10
|
+
1. Current URL: The webpage you're currently on
|
|
11
|
+
2. Available Tabs: List of open browser tabs
|
|
12
|
+
3. Interactive Elements: List in the format:
|
|
13
|
+
id[:]<element_type>element_text</element_type>
|
|
14
|
+
- `id`: identifier for interaction. `ids` can be decomposed into `<role_first_letter><index>[:]` where `<index>` is the index of the element in the list of elements with the same role and `<role_first_letter>` are:
|
|
15
|
+
- `I` for input fields (textbox, select, checkbox, etc.)
|
|
16
|
+
- `B` for buttons
|
|
17
|
+
- `L` for links
|
|
18
|
+
- `F` for figures and images
|
|
19
|
+
- `O` for options in select elements
|
|
20
|
+
- `M` for miscellaneous elements (e.g. modals, dialogs, etc.) that are only clickable for the most part.
|
|
21
|
+
- `element_type`: HTML element type (button, input, etc.)
|
|
22
|
+
- `element_text`: Visible text or element description
|
|
23
|
+
|
|
24
|
+
Example:
|
|
25
|
+
B1[:]<button>Submit Form</button>
|
|
26
|
+
_[:] Non-interactive text
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
Notes:
|
|
30
|
+
- Only elements with `ids` are interactive
|
|
31
|
+
- `_[:]` elements provide context but cannot be interacted with
|
|
32
|
+
|
|
33
|
+
1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
|
|
34
|
+
```json
|
|
35
|
+
{{& example_step}}
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item.
|
|
39
|
+
|
|
40
|
+
Common action sequences:
|
|
41
|
+
- Form filling:
|
|
42
|
+
```json
|
|
43
|
+
{{& example_form_filling}}
|
|
44
|
+
```
|
|
45
|
+
- Navigation and extraction:
|
|
46
|
+
```json
|
|
47
|
+
{{& example_navigation_and_extraction}}
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
CRITICAL: some action sequences are invalid because they cannot be executed in the same step without triggering a page change:
|
|
51
|
+
- `link clicks` always trigger a page change and hence cannot be part of multiple actions, e.g. this sequence is invalid:
|
|
52
|
+
```json
|
|
53
|
+
{{& example_invalid_sequence}}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
3. ELEMENT INTERACTION:
|
|
59
|
+
- Only use `ids` that exist in the provided element list
|
|
60
|
+
- Each element has a unique `id` (e.g., `B2[:]<button>`)
|
|
61
|
+
- Elements marked with `_[:]` are non-interactive (for context only)
|
|
62
|
+
|
|
63
|
+
4. NAVIGATION & ERROR HANDLING:
|
|
64
|
+
- If no suitable elements exist, use other functions to complete the task
|
|
65
|
+
- If stuck, try alternative approaches
|
|
66
|
+
- Handle popups/cookies by accepting or closing them
|
|
67
|
+
- Use scroll to find elements you are looking for
|
|
68
|
+
|
|
69
|
+
5. TASK COMPLETION:
|
|
70
|
+
- Use the `{{completion_action_name}}` action as the last action as soon as the task is complete
|
|
71
|
+
- Don't hallucinate actions
|
|
72
|
+
- If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
|
|
73
|
+
- If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
|
|
74
|
+
|
|
75
|
+
- Example of a successful `{{completion_action_name}}` action:
|
|
76
|
+
```json
|
|
77
|
+
{{& completion_example}}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
6. VISUAL CONTEXT:
|
|
81
|
+
- When an image is provided, use it to understand the page layout
|
|
82
|
+
- Bounding boxes with labels correspond to element indexes
|
|
83
|
+
- Each bounding box and its label have the same color
|
|
84
|
+
- Most often the label is inside the bounding box, on the top right
|
|
85
|
+
- Visual context helps verify element locations and relationships
|
|
86
|
+
- sometimes labels overlap, so use the context to verify the correct element
|
|
87
|
+
|
|
88
|
+
7. Form filling:
|
|
89
|
+
- If you fill an input field and your action sequence is interrupted, most often a list with suggestions popped up under the field and you need to first select the right element from the suggestion list.
|
|
90
|
+
|
|
91
|
+
8. ACTION SEQUENCING:
|
|
92
|
+
- Actions are executed in the order they appear in the list
|
|
93
|
+
- Each action should logically follow from the previous one
|
|
94
|
+
- If the page changes after an action, the sequence is interrupted and you get the new state.
|
|
95
|
+
- If content only disappears the sequence continues.
|
|
96
|
+
- Only provide the action sequence until you think the page will change.
|
|
97
|
+
- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
|
|
98
|
+
- only use multiple actions if it makes sense.
|
|
99
|
+
- use maximum {{max_actions_per_step}} actions per sequence
|
|
100
|
+
|
|
101
|
+
9. Long tasks:
|
|
102
|
+
- If the task is long keep track of the status in the memory. If the ultimate task requires multiple subinformation, keep track of the status in the memory
|
|
103
|
+
|
|
104
|
+
Functions:
|
|
105
|
+
{{& action_description}}
|
|
106
|
+
|
|
107
|
+
Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid.
|