notte-agent 0.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,343 @@
1
+ import time
2
+ import traceback
3
+ import typing
4
+ from collections.abc import Callable
5
+ from enum import StrEnum
6
+
7
+ import notte_core
8
+ from litellm import AllMessageValues, override
9
+ from loguru import logger
10
+ from notte_browser.dom.locate import locate_element
11
+ from notte_browser.resolution import NodeResolutionPipe
12
+ from notte_browser.session import NotteSession, NotteSessionConfig
13
+ from notte_browser.vault import VaultScreetsScreenshotMask
14
+ from notte_browser.window import BrowserWindow
15
+ from notte_core.browser.observation import Observation
16
+ from notte_core.common.tracer import LlmUsageDictTracer
17
+ from notte_core.controller.actions import (
18
+ BaseAction,
19
+ CompletionAction,
20
+ FallbackObserveAction,
21
+ InteractionAction,
22
+ )
23
+ from notte_core.credentials.base import BaseVault, LocatorAttributes
24
+ from notte_core.llms.engine import LLMEngine
25
+ from patchright.async_api import Locator
26
+
27
+ from notte_agent.common.base import BaseAgent
28
+ from notte_agent.common.captcha_detector import CaptchaDetector
29
+ from notte_agent.common.config import AgentConfig, RaiseCondition
30
+ from notte_agent.common.conversation import Conversation
31
+ from notte_agent.common.safe_executor import ExecutionStatus, SafeActionExecutor
32
+ from notte_agent.common.types import AgentResponse
33
+ from notte_agent.common.validator import CompletionValidator
34
+ from notte_agent.falco.perception import FalcoPerception
35
+ from notte_agent.falco.prompt import FalcoPrompt
36
+ from notte_agent.falco.trajectory_history import FalcoTrajectoryHistory
37
+ from notte_agent.falco.types import StepAgentOutput
38
+
39
+ # TODO: list
40
+ # handle tooling calling methods for different providers (if not supported by litellm)
41
+ # Handle control flags
42
+ # Done callback
43
+ # Setup telemetry
44
+ # Setup memory
45
+ # Handle custom functions, e.g. `Upload file to element`
46
+ # Remove base 64 images from current state
47
+ # TODO: add fault tolerance LLM parsing
48
+ # TODO: only display modal actions when modal is open (same as before)
49
+ # TODO: handle prevent default click JS events
50
+ # TODO: add some tree structure for menu elements (like we had in notte before. Ex. Menu in Arxiv)
51
+
52
+
53
+ class HistoryType(StrEnum):
54
+ FULL_CONVERSATION = "full_conversation"
55
+ SHORT_OBSERVATIONS = "short_observations"
56
+ SHORT_OBSERVATIONS_WITH_RAW_DATA = "short_observations_with_raw_data"
57
+ SHORT_OBSERVATIONS_WITH_SHORT_DATA = "short_observations_with_short_data"
58
+ COMPRESSED = "compressed"
59
+
60
+
61
class FalcoAgentConfig(AgentConfig):
    """Configuration of the Falco agent, extending the shared ``AgentConfig``."""

    # Upper bound on the number of actions executed out of one LLM response.
    max_actions_per_step: int = 1
    # Strategy used to replay the trajectory into the conversation.
    history_type: HistoryType = HistoryType.SHORT_OBSERVATIONS_WITH_SHORT_DATA

    @classmethod
    @override
    def default_session(cls) -> NotteSessionConfig:
        """Return a fresh session config with ``disable_perception()`` applied."""
        session_config = NotteSessionConfig()
        return session_config.disable_perception()
69
+
70
+
71
+ class FalcoAgent(BaseAgent):
72
+ def __init__(
73
+ self,
74
+ config: FalcoAgentConfig,
75
+ window: BrowserWindow | None = None,
76
+ vault: BaseVault | None = None,
77
+ step_callback: Callable[[str, StepAgentOutput], None] | None = None,
78
+ ):
79
+ super().__init__(session=NotteSession(config=config.session, window=window))
80
+ self.config: FalcoAgentConfig = config
81
+ self.vault: BaseVault | None = vault
82
+
83
+ self.tracer: LlmUsageDictTracer = LlmUsageDictTracer()
84
+ self.llm: LLMEngine = LLMEngine(
85
+ model=config.reasoning_model,
86
+ tracer=self.tracer,
87
+ structured_output_retries=config.session.structured_output_retries,
88
+ verbose=self.config.verbose,
89
+ )
90
+
91
+ self.step_callback: Callable[[str, StepAgentOutput], None] | None = step_callback
92
+ # Users should implement their own parser to customize how observations
93
+ # and actions are formatted for their specific LLM and use case
94
+
95
+ if self.vault is not None:
96
+ # hide vault leaked credentials within llm inputs
97
+ self.llm.structured_completion = self.vault.patch_structured_completion(0, self.vault.get_replacement_map)(
98
+ self.llm.structured_completion
99
+ )
100
+
101
+ self.perception: FalcoPerception = FalcoPerception()
102
+ self.validator: CompletionValidator = CompletionValidator(llm=self.llm, perception=self.perception)
103
+ self.captcha_detector: CaptchaDetector = CaptchaDetector(llm=self.llm, perception=self.perception)
104
+ self.prompt: FalcoPrompt = FalcoPrompt(max_actions_per_step=config.max_actions_per_step)
105
+ self.conv: Conversation = Conversation(
106
+ max_tokens=config.max_history_tokens,
107
+ convert_tools_to_assistant=True,
108
+ autosize=True,
109
+ model=config.reasoning_model,
110
+ )
111
+ self.history_type: HistoryType = config.history_type
112
+ self.trajectory: FalcoTrajectoryHistory = FalcoTrajectoryHistory(max_error_length=config.max_error_length)
113
+
114
+ async def execute_action(action: BaseAction) -> Observation:
115
+ if self.vault is not None and self.vault.contains_credentials(action):
116
+ action_with_selector = await NodeResolutionPipe.forward(action, self.session.snapshot)
117
+ if isinstance(action_with_selector, InteractionAction) and action_with_selector.selector is not None:
118
+ locator: Locator = await locate_element(self.session.window.page, action_with_selector.selector)
119
+ assert (
120
+ isinstance(action_with_selector, InteractionAction)
121
+ and action_with_selector.selector is not None
122
+ )
123
+
124
+ attrs = await FalcoAgent.compute_locator_attributes(locator)
125
+ action = self.vault.replace_credentials(
126
+ action,
127
+ attrs,
128
+ self.session.snapshot,
129
+ )
130
+ return await self.session.act(action)
131
+
132
+ self.step_executor: SafeActionExecutor[BaseAction, Observation] = SafeActionExecutor(
133
+ func=execute_action,
134
+ raise_on_failure=(self.config.raise_condition is RaiseCondition.IMMEDIATELY),
135
+ max_consecutive_failures=config.max_consecutive_failures,
136
+ )
137
+
138
+ @staticmethod
139
+ async def compute_locator_attributes(locator: Locator) -> LocatorAttributes:
140
+ attr_type = await locator.get_attribute("type")
141
+ autocomplete = await locator.get_attribute("autocomplete")
142
+ outer_html = await locator.evaluate("el => el.outerHTML")
143
+ return LocatorAttributes(type=attr_type, autocomplete=autocomplete, outerHTML=outer_html)
144
+
145
+ async def reset(self) -> None:
146
+ self.conv.reset()
147
+ self.trajectory.reset()
148
+ self.step_executor.reset()
149
+ await self.session.reset()
150
+
151
+ def output(self, answer: str, success: bool) -> AgentResponse:
152
+ return AgentResponse(
153
+ answer=answer,
154
+ success=success,
155
+ session_trajectory=self.session.trajectory,
156
+ agent_trajectory=self.trajectory.steps, # type: ignore[reportArgumentType]
157
+ messages=self.conv.messages(),
158
+ duration_in_s=time.time() - self.start_time,
159
+ llm_usage=self.tracer.usage,
160
+ )
161
+
162
+ async def get_messages(self, task: str) -> list[AllMessageValues]:
163
+ self.conv.reset()
164
+ system_msg, task_msg = self.prompt.system(), self.prompt.task(task)
165
+ if self.vault is not None:
166
+ system_msg += "\n" + self.vault.instructions()
167
+ self.conv.add_system_message(content=system_msg)
168
+ self.conv.add_user_message(content=task_msg)
169
+ # just for logging
170
+ traj_msg = self.trajectory.perceive()
171
+ if self.config.verbose:
172
+ logger.info(f"🔍 Trajectory history:\n{traj_msg}")
173
+ # add trajectory to the conversation
174
+ match self.history_type:
175
+ case HistoryType.COMPRESSED:
176
+ self.conv.add_user_message(content=traj_msg)
177
+ case _:
178
+ if len(self.trajectory.steps) == 0:
179
+ self.conv.add_user_message(content=self.trajectory.start_rules())
180
+ for step in self.trajectory.steps:
181
+ # TODO: choose if we want this to be an assistant message or a tool message
182
+ # self.conv.add_tool_message(step.agent_response, tool_id="step")
183
+ self.conv.add_assistant_message(step.agent_response.model_dump_json(exclude_none=True))
184
+ for result in step.results:
185
+ short_step_msg = self.trajectory.perceive_step_result(result, include_ids=True)
186
+ self.conv.add_user_message(content=short_step_msg)
187
+ if not result.success:
188
+ continue
189
+ # add observation data to the conversation
190
+ obs = result.get()
191
+ match (self.history_type, obs.has_data()):
192
+ case (HistoryType.FULL_CONVERSATION, _):
193
+ self.conv.add_user_message(
194
+ content=self.perception.perceive(obs),
195
+ image=(obs.screenshot if self.config.include_screenshot else None),
196
+ )
197
+ case (HistoryType.SHORT_OBSERVATIONS_WITH_RAW_DATA, True):
198
+ # add data if data was scraped
199
+ self.conv.add_user_message(content=self.perception.perceive_data(obs, raw=True))
200
+
201
+ case (HistoryType.SHORT_OBSERVATIONS_WITH_SHORT_DATA, True):
202
+ self.conv.add_user_message(content=self.perception.perceive_data(obs, raw=False))
203
+ case _:
204
+ pass
205
+
206
+ last_valid_obs = self.trajectory.last_obs()
207
+ if last_valid_obs is not None and self.history_type is not HistoryType.FULL_CONVERSATION:
208
+ self.conv.add_user_message(
209
+ content=self.perception.perceive(last_valid_obs),
210
+ image=(last_valid_obs.screenshot if self.config.include_screenshot else None),
211
+ )
212
+
213
+ if len(self.trajectory.steps) > 0:
214
+ self.conv.add_user_message(self.prompt.action_message())
215
+
216
+ return self.conv.messages()
217
+
218
+ async def step(self, task: str) -> CompletionAction | None:
219
+ """Execute a single step of the agent"""
220
+ messages = await self.get_messages(task)
221
+ response: StepAgentOutput = self.llm.structured_completion(messages, response_format=StepAgentOutput)
222
+ if self.step_callback is not None:
223
+ self.step_callback(task, response)
224
+
225
+ if self.config.verbose:
226
+ logger.info(f"🔍 LLM response:\n{response}")
227
+
228
+ for line in response.pretty_string().split("\n"):
229
+ logger.opt(colors=True).info(line)
230
+
231
+ self.trajectory.add_output(response)
232
+ # check for completion
233
+ if response.output is not None:
234
+ return response.output
235
+ # Execute the actions
236
+ for action in response.get_actions(self.config.max_actions_per_step):
237
+ result = await self.step_executor.execute(action)
238
+
239
+ self.trajectory.add_step(result)
240
+ step_msg = self.trajectory.perceive_step_result(result, include_ids=True)
241
+ logger.info(f"{step_msg}\n\n")
242
+ if not result.success:
243
+ # observe again
244
+ obs = await self.session.observe()
245
+
246
+ # cast is necessary because we cant have covariance
247
+ # in ExecutionStatus
248
+ ex_status = ExecutionStatus(
249
+ input=typing.cast(BaseAction, FallbackObserveAction()),
250
+ output=obs,
251
+ success=True,
252
+ message="Observed",
253
+ )
254
+ self.trajectory.add_output(response)
255
+ self.trajectory.add_step(ex_status)
256
+
257
+ # stop the loop
258
+ break
259
+ # Successfully executed the action
260
+ return None
261
+
262
+ @override
263
+ async def run(self, task: str, url: str | None = None) -> AgentResponse:
264
+ logger.info(f"Running task: {task}")
265
+ self.start_time: float = time.time()
266
+ try:
267
+ return await self._run(task, url=url)
268
+
269
+ except Exception as e:
270
+ if self.config.raise_condition is RaiseCondition.NEVER:
271
+ return self.output(f"Failed due to {e}: {traceback.format_exc()}", False)
272
+ raise e
273
+
274
+ async def _human_in_the_loop(self) -> None:
275
+ # Check for captcha if human-in-the-loop is enabled
276
+ captcha_result = self.captcha_detector.detect(self.session.trajectory[-1])
277
+ if captcha_result.has_captcha:
278
+ logger.warning(f"⚠️ Captcha detected: {captcha_result.description}")
279
+ logger.info("🔄 Waiting for human intervention...")
280
+ _ = input("Press Enter to continue after solving the captcha...")
281
+ # Observe again after human intervention
282
+ obs = await self.session.observe()
283
+ self.trajectory.add_step(
284
+ ExecutionStatus(
285
+ input=typing.cast(BaseAction, FallbackObserveAction()),
286
+ output=obs,
287
+ success=True,
288
+ message="Observed after human intervention",
289
+ )
290
+ )
291
+
292
+ async def _run(self, task: str, url: str | None = None) -> AgentResponse:
293
+ """Execute the task with maximum number of steps"""
294
+ # change this to DEV if you want more explicit error messages
295
+ # when you are developing your own agent
296
+ notte_core.set_error_mode("agent")
297
+ if url is not None:
298
+ task = f"Start on '{url}' and {task}"
299
+
300
+ # Loop through the steps
301
+ async with self.session:
302
+ # hide vault leaked credentials within screenshots
303
+ if self.vault is not None:
304
+ self.session.window.screenshot_mask = VaultScreetsScreenshotMask(vault=self.vault)
305
+
306
+ for step in range(self.session.config.max_steps):
307
+ logger.info(f"💡 Step {step}")
308
+ output: CompletionAction | None = await self.step(task)
309
+ # Check for captcha if human-in-the-loop is enabled
310
+ if self.config.human_in_the_loop:
311
+ await self._human_in_the_loop()
312
+ if output is None:
313
+ continue
314
+ # validate the output
315
+ if not output.success:
316
+ logger.error(f"🚨 Agent terminated early with failure: {output.answer}")
317
+ return self.output(output.answer, False)
318
+ # Sucessful execution and LLM output is not None
319
+ # Need to validate the output
320
+ logger.info(f"🔥 Validating agent output:\n{output.model_dump_json()}")
321
+ val = self.validator.validate(task, output, self.session.trajectory[-1])
322
+ if val.is_valid:
323
+ # Successfully validated the output
324
+ logger.info("✅ Task completed successfully")
325
+ return self.output(output.answer, output.success)
326
+ else:
327
+ # TODO handle that differently
328
+ failed_val_msg = f"Final validation failed: {val.reason}. Continuing..."
329
+ logger.error(failed_val_msg)
330
+ # add the validation result to the trajectory and continue
331
+ self.trajectory.add_step(
332
+ ExecutionStatus(
333
+ input=output,
334
+ output=None,
335
+ success=False,
336
+ message=failed_val_msg,
337
+ )
338
+ )
339
+
340
+ error_msg = f"Failed to solve task in {self.session.config.max_steps} steps"
341
+ logger.info(f"🚨 {error_msg}")
342
+ notte_core.set_error_mode("developer")
343
+ return self.output(error_msg, False)
@@ -0,0 +1,83 @@
1
+ from typing import final
2
+
3
+ from notte_core.browser.observation import Observation
4
+ from typing_extensions import override
5
+
6
+ from notte_agent.common.perception import BasePerception
7
+
8
+
9
@final
class FalcoPerception(BasePerception):
    """Turns an ``Observation`` into the text blocks shown to the Falco agent's LLM."""

    def __init__(
        self,
        include_step_info: bool = True,
        include_attributes: list[str] | None = None,
    ):
        """Store the rendering options.

        Args:
            include_step_info: Stored on the instance; not read by the methods
                of this class.
            include_attributes: Stored on the instance; not read by the methods
                of this class.
        """
        self.include_attributes = include_attributes
        self.include_step_info = include_step_info

    @override
    def perceive_metadata(self, obs: Observation) -> str:
        """Render url, title, timestamp, tabs and step progress of ``obs``.

        Raises:
            ValueError: If the observation carries no progress information.
        """
        if obs.progress is None:
            raise ValueError("Observation has no progress")
        return f"""
You will see the following only once. If you need to remember it and you dont know it yet, write it down in the memory.

* Current url: {obs.metadata.url}
* Current page title: {obs.metadata.title}
* Current date and time: {obs.metadata.timestamp.strftime("%Y-%m-%d %H:%M:%S")}
* Available tabs:
{obs.metadata.tabs}
* Current step: {obs.progress.current_step + 1}/{obs.progress.max_steps}
"""

    @override
    def perceive(self, obs: Observation) -> str:
        """Render the full observation: metadata, interaction elements and data."""
        return f"""
[Relevant metadata]
{self.perceive_metadata(obs)}

[Interaction elements and context]
{self.perceive_actions(obs)}

[Data found in the page]
{self.perceive_data(obs)}
"""

    @override
    def perceive_actions(self, obs: Observation) -> str:
        """Render the action space description, framed by scroll hints.

        A hint line is only emitted when there are pixels above, respectively
        below, the current viewport.
        """
        px_above = obs.metadata.viewport.pixels_above
        px_below = obs.metadata.viewport.pixels_below

        more_above = f"... {px_above} pixels above - scroll or scrape content to see more ..."
        more_below = f"... {px_below} pixels below - scroll or scrape content to see more ..."

        space_description = obs.space.description

        return f"""
[Start of page]
{more_above if px_above > 0 else ""}
{space_description or "No content to display"}
{more_below if px_below > 0 else ""}
[End of page]

"""

    @override
    def perceive_data(self, obs: Observation, raw: bool = True) -> str:
        """Render the data scraped from the page.

        Args:
            obs: Observation whose data should be rendered.
            raw: When true the markdown of the data is used, otherwise the
                structured data dumped as JSON.

        Returns:
            An empty string when the observation has no data, a failure
            message when structured data was requested but is unavailable,
            otherwise the rendered data block.
        """
        if not obs.has_data() or obs.data is None:
            return ""
        if raw:
            perceived_data = obs.data.markdown
        else:
            structured_data = obs.data.structured
            if structured_data is None or not structured_data.success or structured_data.data is None:
                error_msg = f" with error: {structured_data.error}" if structured_data is not None else ""
                return f"Scraping failed{error_msg}. Please try again with different instructions."
            perceived_data = structured_data.data.model_dump_json()

        return f"""
Data scraped from current page view:

{perceived_data or "No valid data to display"}
"""
@@ -0,0 +1,132 @@
1
+ import datetime as dt
2
+ from enum import StrEnum
3
+ from pathlib import Path
4
+
5
+ import chevron
6
+ from notte_core.controller.actions import (
7
+ BaseAction,
8
+ ClickAction,
9
+ CompletionAction,
10
+ FallbackObserveAction,
11
+ FillAction,
12
+ GotoAction,
13
+ ScrapeAction,
14
+ )
15
+ from notte_core.controller.space import ActionSpace
16
+
17
+ system_prompt_dir = Path(__file__).parent / "prompts"
18
+
19
+
20
+ class PromptType(StrEnum):
21
+ SINGLE_ACTION = "single_action"
22
+ MULTI_ACTION = "multi_action"
23
+
24
+ def prompt_file(self) -> Path:
25
+ match self:
26
+ case PromptType.SINGLE_ACTION:
27
+ return system_prompt_dir / "system_prompt_single_action.md"
28
+ case PromptType.MULTI_ACTION:
29
+ return system_prompt_dir / "system_prompt_multi_action.md"
30
+
31
+
32
class FalcoPrompt:
    """Builds the system, task and follow-up prompts of the Falco agent."""

    def __init__(
        self,
        max_actions_per_step: int,
    ) -> None:
        """Load the system-prompt template matching the action budget.

        Args:
            max_actions_per_step: Maximum number of actions per step. Values
                above 1 select the multi-action template, otherwise the
                single-action one.
        """
        multi_act = max_actions_per_step > 1
        prompt_type = PromptType.MULTI_ACTION if multi_act else PromptType.SINGLE_ACTION
        # Explicit encoding so the template is decoded the same way on every
        # platform instead of relying on the locale default.
        self.system_prompt: str = prompt_type.prompt_file().read_text(encoding="utf-8")
        self.max_actions_per_step: int = max_actions_per_step
        self.space: ActionSpace = ActionSpace(description="", exclude_actions={FallbackObserveAction})

    @staticmethod
    def _json_dump(steps: list[BaseAction]) -> str:
        """Render the actions as a bracketed list, one ``dump_str()`` per line."""
        lines = ",\n ".join([action.dump_str() for action in steps])
        return "[\n " + lines + "\n]"

    def example_form_filling(self) -> str:
        """Example sequence: fill two inputs, then click a button."""
        return self._json_dump(
            [FillAction(id="I99", value="username"), FillAction(id="I101", value="password"), ClickAction(id="B1")]
        )

    def example_invalid_sequence(self) -> str:
        """Example sequence the prompt presents as invalid (several clicks including links)."""
        return self._json_dump(
            [
                ClickAction(id="L1"),
                ClickAction(id="B4"),
                ClickAction(id="L2"),
            ]
        )

    def example_navigation_and_extraction(self) -> str:
        """Example sequence: go to a url, then scrape."""
        return self._json_dump([GotoAction(url="https://www.google.com"), ScrapeAction()])

    def completion_example(self) -> str:
        """Example of a successful completion action."""
        return self._json_dump([CompletionAction(success=True, answer="<answer to the task>")])

    def example_step(self) -> str:
        """Render the example of the JSON response format expected from the LLM."""
        goal_eval = (
            "Analyze the current elements and the image to check if the previous goals/actions"
            " are successful like intended by the task. Ignore the action result. The website is the ground truth. "
            "Also mention if something unexpected happened like new suggestions in an input field. "
            "Shortly state why/why not"
        )
        return chevron.render(
            """
{
  "state": {
    "page_summary": "On the page are company a,b,c wtih their revenue 1,2,3.",
    "relevant_interactions": [{"id": "B2", "reason": "The button with id B2 represents search and I'm looking to search"}],
    "previous_goal_status": "success|failure|unknown",
    "previous_goal_eval": "{{goal_eval}}",
    "memory": "Description of what has been done and what you need to remember until the end of the task",
    "next_goal": "What needs to be done with the next actions"
  },
  "actions": [
   { "one_action_name": {
      // action-specific parameter
      ...
   }
   }, // ... more actions in sequence ...
  ]
}
""",
            {"goal_eval": goal_eval},
        )

    def system(self) -> str:
        """Render the system prompt template with the current values and examples."""
        return chevron.render(
            self.system_prompt,
            {
                # NOTE: the key spelling must match the `{{timstamp}}`
                # placeholder used by the prompt template.
                "timstamp": dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "max_actions_per_step": self.max_actions_per_step,
                "action_description": self.space.markdown(),
                "example_form_filling": self.example_form_filling(),
                "example_step": self.example_step(),
                "completion_example": self.completion_example(),
                "completion_action_name": CompletionAction.name(),
                "goto_action_name": GotoAction.name(),
                "example_navigation_and_extraction": self.example_navigation_and_extraction(),
                "example_invalid_sequence": self.example_invalid_sequence(),
            },
        )

    def task(self, task: str) -> str:
        """Return the user message that states the ultimate task."""
        return f"""
Your ultimate task is: "{task}".
If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task.
If not, continue as usual.
"""

    def new_task(self, task: str) -> str:
        """Return the user message that introduces a follow-up task."""
        return f"""
Your new ultimate task is: {task}.
Take the previous context into account and finish your new ultimate task.
"""

    def action_message(self) -> str:
        """Return the reminder appended before the LLM picks its next action."""
        return """Given the previous information, start by reflecting on your last action. Then, summarize the current page and list relevant available interactions.
Absolutely do not under any circumstance list or pay attention to any id that is not explicitly found in the page.
From there, select the your next goal, and in turn, your next action.
"""
@@ -0,0 +1,107 @@
1
+ You are a precise browser automation agent that interacts with websites through structured commands.
2
+ Your role is to:
3
+ 1. Analyze the provided webpage elements and structure
4
+ 2. Plan a sequence of actions to accomplish the given task
5
+ 3. Respond with valid JSON containing your action sequence and state assessment
6
+
7
+ Current date and time: {{timstamp}}
8
+
9
+ INPUT STRUCTURE:
10
+ 1. Current URL: The webpage you're currently on
11
+ 2. Available Tabs: List of open browser tabs
12
+ 3. Interactive Elements: List in the format:
13
+ id[:]<element_type>element_text</element_type>
14
+ - `id`: identifier for interaction. `ids` can be decomposed into `<role_first_letter><index>[:]` where `<index>` is the index of the element in the list of elements with the same role and `<role_first_letter>` are:
15
+ - `I` for input fields (textbox, select, checkbox, etc.)
16
+ - `B` for buttons
17
+ - `L` for links
18
+ - `F` for figures and images
19
+ - `O` for options in select elements
20
+ - `M` for miscellaneous elements (e.g. modals, dialogs, etc.) that are only clickable for the most part.
21
+ - `element_type`: HTML element type (button, input, etc.)
22
+ - `element_text`: Visible text or element description
23
+
24
+ Example:
25
+ B1[:]<button>Submit Form</button>
26
+ _[:] Non-interactive text
27
+
28
+
29
+ Notes:
30
+ - Only elements with `ids` are interactive
31
+ - `_[:]` elements provide context but cannot be interacted with
32
+
33
+ 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
34
+ ```json
35
+ {{& example_step}}
36
+ ```
37
+
38
+ 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item.
39
+
40
+ Common action sequences:
41
+ - Form filling:
42
+ ```json
43
+ {{& example_form_filling}}
44
+ ```
45
+ - Navigation and extraction:
46
+ ```json
47
+ {{& example_navigation_and_extraction}}
48
+ ```
49
+
50
+ CRITICAL: some action sequences are invalid because they cannot be executed in the same step without triggering a page change:
51
+ - `link clicks` always trigger a page change and hence cannot be part of multiple actions, e.g. this sequence is invalid:
52
+ ```json
53
+ {{& example_invalid_sequence}}
54
+ ```
55
+
56
+
57
+
58
+ 3. ELEMENT INTERACTION:
59
+ - Only use `ids` that exist in the provided element list
60
+ - Each element has a unique `id` (e.g., `B2[:]<button>`)
61
+ - Elements marked with `_[:]` are non-interactive (for context only)
62
+
63
+ 4. NAVIGATION & ERROR HANDLING:
64
+ - If no suitable elements exist, use other functions to complete the task
65
+ - If stuck, try alternative approaches
66
+ - Handle popups/cookies by accepting or closing them
67
+ - Use scroll to find elements you are looking for
68
+
69
+ 5. TASK COMPLETION:
70
+ - Use the `{{completion_action_name}}` action as the last action as soon as the task is complete
71
+ - Don't hallucinate actions
72
+ - If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
73
+ - If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
74
+
75
+ - Example of a successful `{{completion_action_name}}` action:
76
+ ```json
77
+ {{& completion_example}}
78
+ ```
79
+
80
+ 6. VISUAL CONTEXT:
81
+ - When an image is provided, use it to understand the page layout
82
+ - Bounding boxes with labels correspond to element indexes
83
+ - Each bounding box and its label have the same color
84
+ - Most often the label is inside the bounding box, on the top right
85
+ - Visual context helps verify element locations and relationships
86
+ - sometimes labels overlap, so use the context to verify the correct element
87
+
88
+ 7. Form filling:
89
+ - If you fill an input field and your action sequence is interrupted, most often a list with suggestions popped up under the field and you need to first select the right element from the suggestion list.
90
+
91
+ 8. ACTION SEQUENCING:
92
+ - Actions are executed in the order they appear in the list
93
+ - Each action should logically follow from the previous one
94
+ - If the page changes after an action, the sequence is interrupted and you get the new state.
95
+ - If content only disappears the sequence continues.
96
+ - Only provide the action sequence until you think the page will change.
97
+ - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page like saving, extracting, checkboxes...
98
+ - only use multiple actions if it makes sense.
99
+ - use maximum {{max_actions_per_step}} actions per sequence
100
+
101
+ 9. Long tasks:
102
+ - If the task is long keep track of the status in the memory. If the ultimate task requires multiple subinformation, keep track of the status in the memory
103
+
104
+ Functions:
105
+ {{& action_description}}
106
+
107
+ Remember: Your responses must be valid JSON matching the specified format. Each action in the sequence must be valid.