notte-agent 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
notte_agent/README.md ADDED
@@ -0,0 +1,58 @@
1
+ # How to build an LLM agent with *Notte*
2
+
3
+ This guide explains how to build a custom LLM agent using *Notte*. The example in `agent.py` demonstrates a basic implementation that you can customize for your specific needs.
4
+
5
+ ## Overview
6
+
7
+ *Notte* provides a flexible environment for web automation that can be controlled through an API. To build an agent with *Notte*, you need:
8
+
9
+ 1. An agent implementation that coordinates between your LLM and the *Notte* environment
10
+ 2. A parser that formats *Notte*'s outputs into prompts suitable for your LLM
11
+ 3. A way to interpret the LLM's responses back into *Notte* commands
12
+
13
+ ## Key Components
14
+
15
+ ### Agent
16
+
17
+ The `Agent` class in `agent.py` shows how to:
18
+ - Initialize a connection to your LLM service
19
+ - Manage the conversation flow between the LLM and *Notte*
20
+ - Track the state of task completion
21
+
22
+ ### Parser
23
+
24
+ The parser is crucial for translating between *Notte* and your LLM. You'll need to:
25
+
26
+ 1. Create a custom parser (by extending `BaseNotteParser` or implementing the `BaseParser` interface)
27
+ 2. Define how to format:
28
+ - Observations from web pages
29
+ - Available actions
30
+ - Data extraction results
31
+ - Task completion status
32
+
33
+ The provided `BaseNotteParser` is a simple example that you should modify based on your needs. Consider:
34
+ - The prompt format your LLM works best with
35
+ - How to structure web observations for your specific tasks
36
+ - What action format makes sense for your use case
37
+ - How to handle task completion and data extraction
38
+
39
+ ## Example Implementation
40
+
41
+ See `agent.py` for a basic implementation. Key points to customize:
42
+ - The parser implementation
43
+ - The prompt engineering in the conversation flow
44
+ - How task completion is determined
45
+ - Error handling and retry logic
46
+
47
+ ## Best Practices
48
+
49
+ 1. **Custom Parser**: Don't just use the `BaseNotteParser` as-is. Create your own parser that:
50
+ - Formats observations in a way that makes sense for your LLM
51
+ - Structures action possibilities clearly
52
+ - Handles task-specific data extraction
53
+
54
+ 2. **Prompt Engineering**: Carefully design your system prompt and conversation flow
55
+
56
+ 3. **Error Handling**: Add robust error handling for both LLM and *Notte* interactions
57
+
58
+ 4. **Testing**: Test your parser and agent with different scenarios
@@ -0,0 +1,7 @@
1
+ from notte_core import check_notte_version
2
+
3
+ from notte_agent.main import Agent
4
+
5
+ __version__ = check_notte_version("notte_agent")
6
+
7
+ __all__ = ["Agent"]
File without changes
@@ -0,0 +1,9 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from notte_agent.common.types import AgentResponse
4
+
5
+
6
class BaseAgent(ABC):
    """Abstract interface that every agent exposes."""

    @abstractmethod
    async def run(self, task: str, url: str | None = None) -> AgentResponse:
        """Run the agent on `task`, optionally given a `url`, and return its response."""
@@ -0,0 +1,213 @@
1
+ from abc import ABC, abstractmethod
2
+ from argparse import ArgumentParser, Namespace
3
+ from collections.abc import Callable
4
+ from enum import StrEnum
5
+ from typing import Any, ClassVar, Self, get_origin, get_type_hints
6
+
7
+ from notte_browser.env import NotteEnvConfig
8
+ from notte_core.common.config import FrozenConfig
9
+ from notte_core.llms.engine import LlmModel
10
+ from notte_sdk.types import DEFAULT_MAX_NB_STEPS
11
+ from pydantic import Field, model_validator
12
+
13
+
14
+ class RaiseCondition(StrEnum):
15
+ """How to raise an error when the agent fails to complete a step.
16
+
17
+ Either immediately upon failure, after retry, or never.
18
+ """
19
+
20
+ IMMEDIATELY = "immediately"
21
+ RETRY = "retry"
22
+ NEVER = "never"
23
+
24
+
25
+ class DefaultAgentArgs(StrEnum):
26
+ ENV_DISABLE_WEB_SECURITY = "disable_web_security"
27
+ ENV_HEADLESS = "headless"
28
+ ENV_PERCEPTION_MODEL = "perception_model"
29
+ ENV_MAX_STEPS = "max_steps"
30
+
31
+ def with_prefix(self: Self, prefix: str = "env") -> str:
32
+ return f"{prefix}.{self.value}"
33
+
34
+
35
class AgentConfig(FrozenConfig, ABC):
    """Base configuration for agents.

    Subclasses must implement `default_env`; `set_env` uses it to populate the
    `env` field whenever the caller did not explicitly force an environment.
    The fluent helpers (`model`, `use_vision`, `dev_mode`, ...) all go through
    `_copy_and_validate` and return its result.
    """

    # `env` is not meant to be supplied by users (this avoids exposing the
    # NotteEnvConfig class); `set_env` fills it in.
    env: NotteEnvConfig = Field(init=False)
    reasoning_model: str = Field(
        default=LlmModel.default(), description="The model to use for reasoning (i.e taking actions)."
    )
    include_screenshot: bool = Field(default=False, description="Whether to include a screenshot in the response.")
    max_history_tokens: int | None = Field(
        default=None,
        description="The maximum number of tokens in the history. When the history exceeds this limit, the oldest messages are discarded.",
    )
    max_error_length: int = Field(
        default=500, description="The maximum length of an error message to be forwarded to the reasoning model."
    )
    raise_condition: RaiseCondition = Field(
        default=RaiseCondition.RETRY, description="How to raise an error when the agent fails to complete a step."
    )
    max_consecutive_failures: int = Field(
        default=3, description="The maximum number of consecutive failures before the agent gives up."
    )
    force_env: bool | None = Field(
        default=None,
        description="Whether to allow the user to set the environment.",
    )

    @classmethod
    @abstractmethod
    def default_env(cls) -> NotteEnvConfig:
        """Return the environment config used when no env is forced."""
        raise NotImplementedError("Subclasses must implement this method")

    @model_validator(mode="before")
    @classmethod
    def set_env(cls, values: dict[str, Any]) -> dict[str, Any]:
        """Fill in `env` from `default_env` unless an env was explicitly forced.

        Raises:
            ValueError: If `env` is provided without a truthy `force_env`.
        """
        if "env" in values:
            if "force_env" in values and values["force_env"]:
                # The flag only authorises this one construction; drop it so
                # the resulting config does not carry it around.
                del values["force_env"]
                return values
            raise ValueError("Env should not be set by the user. Set `default_env` instead.")
        values["env"] = cls.default_env()  # Set the env field using the subclass's method
        return values

    def groq(self: Self, deep: bool = True) -> Self:
        """Shortcut for `model(LlmModel.groq, deep=deep)`."""
        return self.model(LlmModel.groq, deep=deep)

    def openai(self: Self, deep: bool = True) -> Self:
        """Shortcut for `model(LlmModel.openai, deep=deep)`."""
        return self.model(LlmModel.openai, deep=deep)

    def gemini(self: Self, deep: bool = True) -> Self:
        """Shortcut for `model(LlmModel.gemini, deep=deep)`."""
        return self.model(LlmModel.gemini, deep=deep)

    def cerebras(self: Self, deep: bool = True) -> Self:
        """Shortcut for `model(LlmModel.cerebras, deep=deep)`."""
        return self.model(LlmModel.cerebras, deep=deep)

    def model(self: Self, model: LlmModel, deep: bool = True) -> Self:
        """Use `model` for reasoning and set the history budget to its context length.

        Args:
            model: The model to reason with.
            deep: When true, `env.model(model)` is applied to the environment
                config as well.
        """
        config = self._copy_and_validate(reasoning_model=model, max_history_tokens=LlmModel.context_length(model))
        if deep:
            config = config.map_env(lambda env: env.model(model))
        return config

    def use_vision(self: Self, value: bool = True) -> Self:
        """Set `include_screenshot` to `value`."""
        return self._copy_and_validate(include_screenshot=value)

    def dev_mode(self: Self) -> Self:
        """Raise immediately on failure, allow longer error messages and put the env in dev mode."""
        return self._copy_and_validate(
            raise_condition=RaiseCondition.IMMEDIATELY,
            max_error_length=1000,
            env=self.env.dev_mode(),
            force_env=True,
        )

    def set_raise_condition(self: Self, value: RaiseCondition) -> Self:
        """Set `raise_condition` to `value`."""
        return self._copy_and_validate(raise_condition=value)

    def map_env(self: Self, env: Callable[[NotteEnvConfig], NotteEnvConfig]) -> Self:
        """Replace the environment config with `env(self.env)`.

        `force_env=True` is passed so that `set_env` accepts the explicit env.
        """
        return self._copy_and_validate(env=env(self.env), force_env=True)

    @staticmethod
    def _parse_bool(value: str) -> bool:
        """Convert a command-line string into a boolean.

        Raises:
            ArgumentTypeError: If `value` is not a recognised boolean spelling.
        """
        from argparse import ArgumentTypeError

        normalized = value.strip().lower()
        if normalized in {"1", "true", "t", "yes", "y", "on"}:
            return True
        if normalized in {"0", "false", "f", "no", "n", "off"}:
            return False
        raise ArgumentTypeError(f"invalid boolean value: {value!r}")

    @staticmethod
    def _get_arg_type(python_type: Any) -> Any:
        """Maps Python types to argparse types.

        `X | None` is unwrapped to `X`. Booleans get a dedicated string parser
        because `type=bool` would turn every non-empty string, including
        "False", into `True`. Anything unrecognised falls back to `str`.
        """
        from types import NoneType, UnionType
        from typing import Union, get_args

        if get_origin(python_type) in (Union, UnionType):
            members = [arg for arg in get_args(python_type) if arg is not NoneType]
            # Only plain optionals are unwrapped; wider unions fall back to str below.
            if len(members) == 1:
                python_type = members[0]
        if python_type is bool:
            return AgentConfig._parse_bool
        type_map = {
            str: str,
            int: int,
            float: float,
        }
        return type_map.get(python_type, str)

    @staticmethod
    def create_base_parser() -> ArgumentParser:
        """Creates a base ArgumentParser with the default `env.*` options."""
        parser = ArgumentParser()
        _ = parser.add_argument(
            f"--{DefaultAgentArgs.ENV_HEADLESS.with_prefix()}",
            action="store_true",
            help="Whether to run the browser in headless mode.",
        )
        _ = parser.add_argument(
            f"--{DefaultAgentArgs.ENV_DISABLE_WEB_SECURITY.with_prefix()}",
            action="store_true",
            help="Whether disable web security.",
        )
        _ = parser.add_argument(
            f"--{DefaultAgentArgs.ENV_PERCEPTION_MODEL.with_prefix()}",
            type=str,
            default=None,
            help="The model to use for perception.",
        )
        _ = parser.add_argument(
            f"--{DefaultAgentArgs.ENV_MAX_STEPS.with_prefix()}",
            type=int,
            default=DEFAULT_MAX_NB_STEPS,
            help="The maximum number of steps the agent can take.",
        )
        return parser

    @classmethod
    def create_parser(cls) -> ArgumentParser:
        """Creates an ArgumentParser with all the fields from the config.

        Every model field except `env` becomes a `--dashed-name` option whose
        default and help text come from the field definition.
        """
        parser = cls.create_base_parser()
        hints = get_type_hints(cls)

        for field_name, field_info in cls.model_fields.items():
            if field_name == "env":
                continue
            field_type = hints.get(field_name)
            if get_origin(field_type) is ClassVar:
                continue

            default = field_info.default
            help_text = field_info.description or "no description available"
            arg_type = cls._get_arg_type(field_type)

            _ = parser.add_argument(
                f"--{field_name.replace('_', '-')}",
                type=arg_type,
                default=default,
                help=f"{help_text} (default: {default})",
            )

        return parser

    @classmethod
    def from_args(cls: type[Self], args: Namespace) -> Self:
        """Creates an AgentConfig from a Namespace of arguments.

        Arguments whose name starts with ``env.`` are applied to the
        environment config; all others are passed to the config constructor.
        The return type will match the class that called this method.
        """
        disallowed_args = ["task", "env.window.headless"]

        env_args = {
            # Strip only the leading prefix, never an "env." occurring later in the name.
            k.removeprefix("env.").replace("-", "_"): v
            for k, v in vars(args).items()
            if k.startswith("env.") and k not in disallowed_args
        }
        agent_args = {
            k.replace("-", "_"): v
            for k, v in vars(args).items()
            if not k.startswith("env.") and k not in disallowed_args
        }

        def update_env(env: NotteEnvConfig) -> NotteEnvConfig:
            # Options with a dedicated builder method are applied after the
            # generic keyword update, so they are taken out of `env_args` first.
            operations: list[Callable[[NotteEnvConfig], NotteEnvConfig]] = []
            if DefaultAgentArgs.ENV_HEADLESS in env_args:
                headless = env_args.pop(DefaultAgentArgs.ENV_HEADLESS)
                operations.append(lambda env: env.headless(headless))
            if DefaultAgentArgs.ENV_DISABLE_WEB_SECURITY in env_args:
                disable_web_security = env_args.pop(DefaultAgentArgs.ENV_DISABLE_WEB_SECURITY)
                operations.append(
                    lambda env: env.disable_web_security() if disable_web_security else env.enable_web_security()
                )

            env = env._copy_and_validate(**env_args)
            for operation in operations:
                env = operation(env)
            return env

        return cls(**agent_args).map_env(update_env)
@@ -0,0 +1,246 @@
1
+ import base64
2
+ import json
3
+ from dataclasses import dataclass, field
4
+ from typing import TypeVar
5
+
6
+ from litellm import (
7
+ AllMessageValues,
8
+ ChatCompletionAssistantMessage,
9
+ ChatCompletionAssistantToolCall,
10
+ ChatCompletionImageObject,
11
+ ChatCompletionSystemMessage,
12
+ ChatCompletionTextObject,
13
+ ChatCompletionToolMessage,
14
+ ChatCompletionUserMessage,
15
+ ModelResponse, # type: ignore[reportPrivateImportUsage]
16
+ OpenAIMessageContent,
17
+ )
18
+ from litellm.utils import token_counter # type: ignore[reportUnknownVariableType]
19
+ from loguru import logger
20
+ from notte_core.errors.llm import LLMParsingError
21
+ from notte_core.llms.engine import LlmModel, StructuredContent
22
+ from pydantic import BaseModel
23
+
24
+ # Define valid message roles
25
+
26
+
27
@dataclass
class CachedMessage:
    """A chat message stored alongside its token count."""

    # The message itself.
    message: AllMessageValues
    # Number of tokens counted for `message`.
    token_count: int
33
+
34
+
35
+ T = TypeVar("T", bound=BaseModel)
36
+
37
+
38
+ @dataclass
39
+ class Conversation:
40
+ """Manages conversation history and message extraction"""
41
+
42
+ history: list[CachedMessage] = field(default_factory=list)
43
+ json_extractor: StructuredContent = field(default_factory=lambda: StructuredContent(inner_tag="json"))
44
+ autosize: bool = False
45
+ model: str = LlmModel.default()
46
+ max_tokens: int | None = None
47
+ conservative_factor: float = 0.8
48
+
49
+ _total_tokens: int = field(default=0, init=False)
50
+ convert_tools_to_assistant: bool = False
51
+
52
+ def __post_init__(self) -> None:
53
+ if self.max_tokens is None:
54
+ self.max_tokens = LlmModel.context_length(self.model)
55
+
56
+ @property
57
+ def default_max_tokens(self) -> int:
58
+ if self.max_tokens is None:
59
+ raise ValueError("max_tokens is not set")
60
+ return self.max_tokens
61
+
62
+ @property
63
+ def conservative_max_tokens(self) -> int:
64
+ """Since token count isn't 100% accurate, allow to be
65
+ slightly conservative, to make sure we trim under the total context length"""
66
+ return int(self.default_max_tokens * self.conservative_factor)
67
+
68
+ def count_tokens(self, content: AllMessageValues) -> int:
69
+ """Count the number of tokens in a list of messages"""
70
+ return token_counter(model=self.model, messages=[content])
71
+
72
+ def total_tokens(self) -> int:
73
+ """Get total tokens in conversation history"""
74
+ return self._total_tokens
75
+
76
+ def trim_history_to_fit(self, new_content: AllMessageValues) -> None:
77
+ """Trim history to make room for new content while preserving system messages"""
78
+ if not self.autosize:
79
+ return
80
+
81
+ # Always keep system messages
82
+ init_messages: list[CachedMessage] = []
83
+ other_messages: list[CachedMessage] = []
84
+ is_init_msg = True
85
+ for msg in self.history:
86
+ match is_init_msg, msg.message["role"]:
87
+ case True, "system":
88
+ init_messages.append(msg)
89
+ case True, "user":
90
+ # keep first user message as init message (need task description)
91
+ is_init_msg = False
92
+ init_messages.append(msg)
93
+ case _, _:
94
+ other_messages.append(msg)
95
+
96
+ new_content_tokens = self.count_tokens(new_content)
97
+ init_tokens = sum(msg.token_count for msg in init_messages)
98
+ available_tokens = self.conservative_max_tokens - init_tokens - new_content_tokens
99
+
100
+ # Remove oldest non-system messages until we have room
101
+ current_tokens = sum(msg.token_count for msg in other_messages)
102
+ has_trimmed = 0
103
+ while other_messages and current_tokens > available_tokens:
104
+ removed = other_messages.pop(0)
105
+ current_tokens -= removed.token_count
106
+ has_trimmed += 1
107
+
108
+ if has_trimmed > 0:
109
+ logger.info(
110
+ f"Trimmed {has_trimmed} message(s) to stay under max token limit (i.e {self.default_max_tokens // 1000}k)"
111
+ )
112
+
113
+ self.history = init_messages + other_messages
114
+ self._total_tokens = sum(msg.token_count for msg in self.history)
115
+
116
+ def _add_message(self, msg: AllMessageValues) -> None:
117
+ """Internal helper to add a message with token counting"""
118
+ token_count = self.count_tokens(msg)
119
+ if self.autosize:
120
+ self.trim_history_to_fit(msg)
121
+ cached_msg = CachedMessage(message=msg, token_count=token_count)
122
+ self.history.append(cached_msg)
123
+ self._total_tokens += token_count
124
+
125
+ def add_system_message(self, content: str) -> None:
126
+ """Add a system message to the conversation"""
127
+ self._add_message(ChatCompletionSystemMessage(role="system", content=content))
128
+
129
+ def format_image_content(self, image: bytes) -> ChatCompletionImageObject:
130
+ image_str = base64.b64encode(image).decode("utf-8")
131
+ return ChatCompletionImageObject(
132
+ type="image_url",
133
+ image_url={"url": f"data:image/png;base64,{image_str}"},
134
+ )
135
+
136
+ def format_user_contents(self, contents: list[str | bytes]) -> OpenAIMessageContent:
137
+ return [
138
+ (
139
+ ChatCompletionTextObject(type="text", text=content)
140
+ if isinstance(content, str)
141
+ else self.format_image_content(content)
142
+ )
143
+ for content in contents
144
+ ]
145
+
146
+ def add_user_message(self, content: OpenAIMessageContent, image: bytes | None = None) -> None:
147
+ """Add a user message to the conversation"""
148
+ _content: OpenAIMessageContent = content
149
+ if image is not None and isinstance(content, str):
150
+ _content = self.format_user_contents([content, image])
151
+ self._add_message(ChatCompletionUserMessage(role="user", content=_content))
152
+
153
+ def add_user_messages(self, contents: list[str | bytes]) -> None:
154
+ """Add a user message to the conversation"""
155
+ _content: OpenAIMessageContent = self.format_user_contents(contents)
156
+ self._add_message(ChatCompletionUserMessage(role="user", content=_content))
157
+
158
+ def add_assistant_message(self, content: str) -> None:
159
+ """Add an assistant message to the conversation"""
160
+ self._add_message(ChatCompletionAssistantMessage(role="assistant", content=content))
161
+
162
+ def add_tool_message(self, parsed_content: BaseModel, tool_id: str) -> None:
163
+ """Add a tool message to the conversation"""
164
+ content: str = str(parsed_content.model_dump(mode="json", exclude_unset=True))
165
+ if not self.convert_tools_to_assistant:
166
+ self._add_message(
167
+ ChatCompletionToolMessage(
168
+ role="tool",
169
+ content=content,
170
+ tool_call_id=tool_id,
171
+ )
172
+ )
173
+ else:
174
+ # Optional, convert tools to assistant role
175
+ self._add_message(
176
+ ChatCompletionAssistantMessage(
177
+ role="assistant",
178
+ content="",
179
+ tool_calls=[
180
+ ChatCompletionAssistantToolCall(
181
+ id=tool_id,
182
+ type="function",
183
+ function={
184
+ "arguments": content,
185
+ "name": parsed_content.__class__.__name__,
186
+ },
187
+ )
188
+ ],
189
+ )
190
+ )
191
+
192
+ def parse_structured_response(self, response: ModelResponse | str, model: type[T]) -> T:
193
+ """Parse a structured response from the LLM into a Pydantic model
194
+
195
+ Args:
196
+ response: The LLM model response
197
+ model: The Pydantic model class to parse into
198
+
199
+ Returns:
200
+ Instance of the specified Pydantic model
201
+
202
+ Raises:
203
+ LLMParsingError: If response cannot be parsed into the model
204
+ """
205
+ if isinstance(response, str):
206
+ return model.model_validate(response)
207
+ if not response.choices:
208
+ raise LLMParsingError("No choices in LLM response")
209
+
210
+ choice = response.choices[0]
211
+ # Extract content from either streaming or non-streaming response
212
+ content: str | None = None
213
+ if isinstance(choice, dict):
214
+ message = choice.get("message", {}) # type: ignore[reportUnknownMemberType]
215
+ if isinstance(message, dict):
216
+ content = message.get("content") # type: ignore[reportUnknownMemberType]
217
+ else:
218
+ content = getattr(choice, "text")
219
+
220
+ if not content:
221
+ raise LLMParsingError("No content in LLM response message")
222
+
223
+ try:
224
+ if content is None or not isinstance(content, str):
225
+ raise LLMParsingError("No content in LLM response message")
226
+ extracted = self.json_extractor.extract(content)
227
+ return model.model_validate_json(extracted)
228
+ except (json.JSONDecodeError, ValueError) as e:
229
+ raise LLMParsingError(f"Failed to parse response into {model.__name__}: {str(e)}")
230
+
231
+ def messages(self) -> list[AllMessageValues]:
232
+ """Get messages in LiteLLM format
233
+
234
+ Returns:
235
+ List of messages formatted for LiteLLM
236
+
237
+ Note:
238
+ This converts our internal message format to litellm's format.
239
+ litellm only supports 'assistant' role, so we map all roles to that.
240
+ """
241
+ return [msg.message for msg in self.history]
242
+
243
+ def reset(self) -> None:
244
+ """Clear all messages from the conversation"""
245
+ self.history.clear()
246
+ self._total_tokens = 0
@@ -0,0 +1,54 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from typing_extensions import override
4
+
5
+ from notte_agent.common.base import BaseAgent
6
+ from notte_agent.common.types import AgentResponse
7
+
8
+
9
class BaseNotifier(ABC):
    """Common base for notification back-ends."""

    @abstractmethod
    async def send_message(self, text: str) -> None:
        """Deliver `text` through the concrete notification service."""

    async def notify(self, task: str, result: AgentResponse) -> None:
        """Build a report for `task` from `result` and send it.

        Args:
            task: The task description
            result: The agent response; its duration, success flag and answer
                are included in the report
        """
        report = f"""
Notte Agent Report 🌙

Task Details:
-------------
Task: {task}
Execution Time: {round(result.duration_in_s, 2)} seconds
Status: {"✅ Success" if result.success else "❌ Failed"}


Agent Response:
--------------
{result.answer}

Powered by Notte 🌒"""
        await self.send_message(text=report)
40
+
41
+
42
class NotifierAgent(BaseAgent):
    """Wraps another agent and notifies once its run has returned."""

    def __init__(self, agent: BaseAgent, notifier: BaseNotifier):
        self.agent: BaseAgent = agent
        self.notifier: BaseNotifier = notifier

    @override
    async def run(self, task: str, url: str | None = None) -> AgentResponse:
        """Delegate to the wrapped agent, notify with its response, then return it."""
        response = await self.agent.run(task, url)
        await self.notifier.notify(task, response)
        return response
@@ -0,0 +1,78 @@
1
+ import json
2
+ import re
3
+ from abc import ABC, abstractmethod
4
+ from typing import Literal
5
+
6
+ from notte_core.actions.base import ExecutableAction
7
+ from notte_core.controller.actions import (
8
+ BaseAction,
9
+ CompletionAction,
10
+ GotoAction,
11
+ ScrapeAction,
12
+ )
13
+ from pydantic import BaseModel
14
+
15
+
16
class NotteStepAgentOutput(BaseModel):
    """One parsed agent step: at most one of the slots below is consulted, in declaration order."""

    observe: GotoAction | None = None
    step: ExecutableAction | None = None
    scrape: ScrapeAction | None = None
    completion: CompletionAction | None = None

    @property
    def endpoint(self) -> Literal["observe", "step", "scrape", "done"] | None:
        """Name of the first populated slot (`completion` maps to "done"), or None."""
        if self.observe is not None:
            return "observe"
        if self.step is not None:
            return "step"
        if self.scrape is not None:
            return "scrape"
        if self.completion is not None:
            return "done"
        return None

    @property
    def action(self) -> BaseAction | None:
        """First populated action among observe / step / scrape, or None.

        `completion` is not considered here.
        """
        for candidate in (self.observe, self.step, self.scrape):
            if candidate is not None:
                return candidate
        return None
45
+
46
+
47
class ParameterizedAction(BaseModel):
    """An action identifier together with optional string parameters."""

    action_id: str
    params: dict[str, str] | None = None
50
+
51
+
52
class BaseParser(ABC):
    """Abstract parser turning LLM text into a `NotteStepAgentOutput`.

    Also provides helpers for pulling tagged sections and JSON payloads out
    of free-form text.
    """

    @abstractmethod
    def parse(self, text: str) -> NotteStepAgentOutput | None:
        """Parse `text` into a step output, or return None."""
        raise NotImplementedError

    @abstractmethod
    def example_format(self, endpoint: Literal["observe", "step", "scrape"]) -> str | None:
        """Return an example of the expected format for `endpoint`, if any."""
        raise NotImplementedError

    @staticmethod
    def search_pattern(text: str, tag: str) -> str | None:
        """Return the stripped content of the first ``<tag>...</tag>`` section.

        Matching is case-insensitive and spans newlines.

        Args:
            text: The text to search.
            tag: The tag name, matched literally.

        Returns:
            The stripped inner text, or None when the tag pair is absent.
        """
        # Escape the tag so regex metacharacters in it cannot alter the pattern.
        escaped = re.escape(tag)
        pattern = re.compile(rf"<{escaped}>(.*?)</{escaped}>", re.IGNORECASE | re.DOTALL)
        match = pattern.search(text)
        return match.group(1).strip() if match else None

    @staticmethod
    def parse_json(text: str, tag: str | None = None) -> dict[str, str]:
        """Decode a JSON object from `text`, optionally from inside ``<tag>`` tags.

        Args:
            text: Raw text, or text containing a ``<tag>...</tag>`` section.
            tag: When given, only the content of that section is decoded.

        Raises:
            ValueError: If the tag is missing, the payload is not valid JSON,
                or the decoded value is not a JSON object.
        """
        if tag is not None:
            _text = BaseParser.search_pattern(text, tag)
            if _text is None:
                raise ValueError(f"No text found within <{tag}> tags")
            text = _text
        try:
            data = json.loads(text)
        except json.JSONDecodeError as e:
            raise ValueError("Invalid JSON in action") from e
        if not isinstance(data, dict):
            # Valid JSON that is not an object would break the declared return type.
            raise ValueError("Invalid JSON in action: expected a JSON object")
        return data
@@ -0,0 +1,21 @@
1
+ from abc import ABC, abstractmethod
2
+
3
+ from notte_core.browser.observation import Observation
4
+
5
+
6
class BasePerception(ABC):
    """Abstract interface for rendering an `Observation` as text.

    Each method receives an observation and returns a string; what the string
    contains is up to the implementing subclass.
    """

    @abstractmethod
    def perceive_metadata(self, obs: Observation) -> str:
        """Return the metadata view of `obs` as a string."""

    @abstractmethod
    def perceive_actions(self, obs: Observation) -> str:
        """Return the actions view of `obs` as a string."""

    @abstractmethod
    def perceive_data(self, obs: Observation) -> str:
        """Return the data view of `obs` as a string."""

    @abstractmethod
    def perceive(self, obs: Observation) -> str:
        """Return the overall textual rendering of `obs`."""