notte-eval 0.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- notte_eval-0.0.dev0/.gitignore +179 -0
- notte_eval-0.0.dev0/PKG-INFO +33 -0
- notte_eval-0.0.dev0/README.md +0 -0
- notte_eval-0.0.dev0/pyproject.toml +118 -0
- notte_eval-0.0.dev0/src/notte_eval/__init__.py +3 -0
- notte_eval-0.0.dev0/src/notte_eval/agent_handlers/__init__.py +58 -0
- notte_eval-0.0.dev0/src/notte_eval/agent_handlers/browseruse.py +201 -0
- notte_eval-0.0.dev0/src/notte_eval/agent_handlers/browseruse_api.py +147 -0
- notte_eval-0.0.dev0/src/notte_eval/agent_handlers/convergence.py +185 -0
- notte_eval-0.0.dev0/src/notte_eval/agent_handlers/falco.py +238 -0
- notte_eval-0.0.dev0/src/notte_eval/agent_handlers/mock.py +35 -0
- notte_eval-0.0.dev0/src/notte_eval/data/__init__.py +0 -0
- notte_eval-0.0.dev0/src/notte_eval/data/eval.py +204 -0
- notte_eval-0.0.dev0/src/notte_eval/data/gaia/GAIA_webvoyager.jsonl +90 -0
- notte_eval-0.0.dev0/src/notte_eval/data/load_data.py +97 -0
- notte_eval-0.0.dev0/src/notte_eval/data/scratch/proxy.jsonl +9 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data.jsonl +643 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data_short.jsonl +96 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data_simple.jsonl +30 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data_single.jsonl +1 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/webvoyager_excluded.jsonl +55 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/convert.py +23 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager.jsonl +643 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_convergence.jsonl +601 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_excluded.jsonl +54 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_simple.jsonl +30 -0
- notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_single.jsonl +1 -0
- notte_eval-0.0.dev0/src/notte_eval/evaluators/__init__.py +26 -0
- notte_eval-0.0.dev0/src/notte_eval/evaluators/evaluator.py +31 -0
- notte_eval-0.0.dev0/src/notte_eval/evaluators/webvoyager.py +109 -0
- notte_eval-0.0.dev0/src/notte_eval/patcher.py +189 -0
- notte_eval-0.0.dev0/src/notte_eval/py.typed +0 -0
- notte_eval-0.0.dev0/src/notte_eval/run.py +398 -0
- notte_eval-0.0.dev0/src/notte_eval/task_types.py +98 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
|
|
110
|
+
# pdm
|
|
111
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
112
|
+
#pdm.lock
|
|
113
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
114
|
+
# in version control.
|
|
115
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
|
116
|
+
.pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
121
|
+
__pypackages__/
|
|
122
|
+
|
|
123
|
+
# Celery stuff
|
|
124
|
+
celerybeat-schedule
|
|
125
|
+
celerybeat.pid
|
|
126
|
+
|
|
127
|
+
# SageMath parsed files
|
|
128
|
+
*.sage.py
|
|
129
|
+
|
|
130
|
+
# Environments
|
|
131
|
+
.env
|
|
132
|
+
.venv
|
|
133
|
+
env/
|
|
134
|
+
venv/
|
|
135
|
+
ENV/
|
|
136
|
+
env.bak/
|
|
137
|
+
venv.bak/
|
|
138
|
+
|
|
139
|
+
# Spyder project settings
|
|
140
|
+
.spyderproject
|
|
141
|
+
.spyproject
|
|
142
|
+
|
|
143
|
+
# Rope project settings
|
|
144
|
+
.ropeproject
|
|
145
|
+
|
|
146
|
+
# mkdocs documentation
|
|
147
|
+
/site
|
|
148
|
+
|
|
149
|
+
# mypy
|
|
150
|
+
.mypy_cache/
|
|
151
|
+
.dmypy.json
|
|
152
|
+
dmypy.json
|
|
153
|
+
|
|
154
|
+
# Pyre type checker
|
|
155
|
+
.pyre/
|
|
156
|
+
|
|
157
|
+
# pytype static type analyzer
|
|
158
|
+
.pytype/
|
|
159
|
+
|
|
160
|
+
# Cython debug symbols
|
|
161
|
+
cython_debug/
|
|
162
|
+
|
|
163
|
+
# PyCharm
|
|
164
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
165
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
166
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
167
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
168
|
+
#.idea/
|
|
169
|
+
|
|
170
|
+
ignore.*
|
|
171
|
+
llm_usage.jsonl
|
|
172
|
+
llm_parsing_error.jsonl
|
|
173
|
+
traces/
|
|
174
|
+
|
|
175
|
+
**/__pycache__/**
|
|
176
|
+
.DS_Store
|
|
177
|
+
**/.DS_Store
|
|
178
|
+
old
|
|
179
|
+
notebook
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: notte-eval
|
|
3
|
+
Version: 0.0.dev0
|
|
4
|
+
Summary: The evaluation for Notte
|
|
5
|
+
Author-email: Notte Team <hello@notte.cc>
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: notte-agent==0.0.dev
|
|
8
|
+
Requires-Dist: notte-browser==0.0.dev
|
|
9
|
+
Requires-Dist: notte-core==0.0.dev
|
|
10
|
+
Requires-Dist: notte-integrations==0.0.dev
|
|
11
|
+
Requires-Dist: notte-sdk==0.0.dev
|
|
12
|
+
Provides-Extra: api
|
|
13
|
+
Requires-Dist: fastapi>=0.115.8; extra == 'api'
|
|
14
|
+
Requires-Dist: uvicorn>=0.29.0; extra == 'api'
|
|
15
|
+
Provides-Extra: browser-use
|
|
16
|
+
Requires-Dist: browser-use>=0.1.40; extra == 'browser-use'
|
|
17
|
+
Requires-Dist: langchain-google-genai>=2.1.1; extra == 'browser-use'
|
|
18
|
+
Provides-Extra: browserbase
|
|
19
|
+
Requires-Dist: browserbase>=1.2.0; extra == 'browserbase'
|
|
20
|
+
Provides-Extra: camoufox
|
|
21
|
+
Requires-Dist: camoufox[geoip]>=0.4.11; extra == 'camoufox'
|
|
22
|
+
Provides-Extra: convergence
|
|
23
|
+
Requires-Dist: proxy-lite; extra == 'convergence'
|
|
24
|
+
Provides-Extra: discord
|
|
25
|
+
Requires-Dist: discord-py<2.5.0,>=2.3.0; extra == 'discord'
|
|
26
|
+
Provides-Extra: embedding
|
|
27
|
+
Requires-Dist: sentence-transformers>=3.4.1; extra == 'embedding'
|
|
28
|
+
Provides-Extra: server
|
|
29
|
+
Requires-Dist: litellm[proxy]>=1.61.16; extra == 'server'
|
|
30
|
+
Provides-Extra: slack
|
|
31
|
+
Requires-Dist: slack-sdk>=3.34.0; extra == 'slack'
|
|
32
|
+
Provides-Extra: vault
|
|
33
|
+
Requires-Dist: hvac>=2.3.0; extra == 'vault'
|
|
File without changes
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "notte-eval"
|
|
3
|
+
version = "0.0.dev"
|
|
4
|
+
description = "The evaluation for Notte"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "Notte Team ", email = "hello@notte.cc" }
|
|
8
|
+
]
|
|
9
|
+
packages = [
|
|
10
|
+
{ include = "notte_eval", from = "src" },
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
requires-python = ">=3.11"
|
|
15
|
+
dependencies = [
|
|
16
|
+
"notte-agent==0.0.dev",
|
|
17
|
+
"notte-browser==0.0.dev",
|
|
18
|
+
"notte-core==0.0.dev",
|
|
19
|
+
"notte-sdk==0.0.dev",
|
|
20
|
+
"notte-integrations==0.0.dev",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.optional-dependencies]
|
|
24
|
+
vault = [
|
|
25
|
+
"hvac>=2.3.0",
|
|
26
|
+
]
|
|
27
|
+
server = [
|
|
28
|
+
"litellm[proxy]>=1.61.16",
|
|
29
|
+
]
|
|
30
|
+
embedding = [
|
|
31
|
+
"sentence-transformers>=3.4.1",
|
|
32
|
+
]
|
|
33
|
+
api = [
|
|
34
|
+
"fastapi>=0.115.8",
|
|
35
|
+
"uvicorn>=0.29.0",
|
|
36
|
+
]
|
|
37
|
+
discord = [
|
|
38
|
+
"discord-py>=2.3.0,<2.5.0",
|
|
39
|
+
]
|
|
40
|
+
slack = [
|
|
41
|
+
"slack-sdk>=3.34.0",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
browserbase = [
|
|
45
|
+
"browserbase>=1.2.0",
|
|
46
|
+
]
|
|
47
|
+
camoufox = [
|
|
48
|
+
"camoufox[geoip]>=0.4.11",
|
|
49
|
+
]
|
|
50
|
+
browser-use = [
|
|
51
|
+
"browser-use>=0.1.40",
|
|
52
|
+
"langchain-google-genai>=2.1.1",
|
|
53
|
+
]
|
|
54
|
+
convergence = [
|
|
55
|
+
"proxy-lite",
|
|
56
|
+
]
|
|
57
|
+
|
|
58
|
+
[build-system]
|
|
59
|
+
requires = ["hatchling"]
|
|
60
|
+
build-backend = "hatchling.build"
|
|
61
|
+
|
|
62
|
+
[dependency-groups]
|
|
63
|
+
|
|
64
|
+
dev = [
|
|
65
|
+
"aiomultiprocess>=0.9.1",
|
|
66
|
+
"basedpyright>=1.27.1",
|
|
67
|
+
"cloudpickle>=3.1.1",
|
|
68
|
+
"joblib>=1.4.2",
|
|
69
|
+
"jupyter>=1.1.1",
|
|
70
|
+
"pandas",
|
|
71
|
+
"pebble>=5.1.1",
|
|
72
|
+
"pre-commit>=4.1.0",
|
|
73
|
+
"pytest>=8.3.4",
|
|
74
|
+
"pytest-asyncio>=0.25.3",
|
|
75
|
+
"pytest-mock>=3.14.0",
|
|
76
|
+
]
|
|
77
|
+
lint = [
|
|
78
|
+
"ruff>=0.9.7",
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
[tool.pytest.ini_options]
|
|
82
|
+
asyncio_default_fixture_loop_scope = "function"
|
|
83
|
+
testpaths = ["tests"]
|
|
84
|
+
timeout = 60
|
|
85
|
+
asyncio_mode = "strict"
|
|
86
|
+
log_cli = true
|
|
87
|
+
log_cli_level = "INFO"
|
|
88
|
+
filterwarnings = [
|
|
89
|
+
"ignore::DeprecationWarning:sklearn.utils.fixes:",
|
|
90
|
+
"ignore::DeprecationWarning:pandas.core.common:",
|
|
91
|
+
"ignore::pydantic.warnings.PydanticDeprecatedSince20:",
|
|
92
|
+
"ignore::DeprecationWarning:importlib.resources._legacy:",
|
|
93
|
+
"ignore::DeprecationWarning:litellm.utils:",
|
|
94
|
+
"ignore:open_text is deprecated*:DeprecationWarning",
|
|
95
|
+
"ignore:distutils Version classes are deprecated. Use packaging.version instead.*:DeprecationWarning",
|
|
96
|
+
'ignore:configuration option "asyncio_default_fixture_loop_scope" is unset',
|
|
97
|
+
"ignore:Valid config keys have changed in V2*:UserWarning"
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
[tool.ruff]
|
|
101
|
+
line-length = 120
|
|
102
|
+
indent-width = 4
|
|
103
|
+
exclude = [".venv"]
|
|
104
|
+
|
|
105
|
+
[tool.ruff.lint]
|
|
106
|
+
extend-select = ["I"]
|
|
107
|
+
|
|
108
|
+
[tool.basedpyright]
|
|
109
|
+
exclude = [".venv", "uv-cache", "**/site-packages/**", "**/dist-packages/**", "tests", "old", "notebook", "dist"]
|
|
110
|
+
enableTypeIgnoreComments = true
|
|
111
|
+
reportIgnoreCommentWithoutRule = false
|
|
112
|
+
failOnWarnings = true
|
|
113
|
+
reportAny = false
|
|
114
|
+
reportExplicitAny = false
|
|
115
|
+
|
|
116
|
+
[tool.uv.sources]
|
|
117
|
+
maincontentextractor = { git = "https://github.com/HawkClaws/main_content_extractor", rev = "7c3ed7f6ed7f6c10223a3357d43ab741663bc812" }
|
|
118
|
+
proxy-lite = { git = "https://github.com/leo-notte/proxy-lite" }
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import importlib
|
|
2
|
+
from enum import StrEnum
|
|
3
|
+
from typing import Any, NamedTuple
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PoolEnum(StrEnum):
    """String-valued enumeration of the pool names known to the evaluation code.

    Because this is a ``StrEnum``, every member compares equal to (and
    serialises as) its string value.
    """

    # NOTE(review): the members look like browser session pool providers,
    # with NONE meaning "no pool" -- confirm against the code that reads them.
    NONE = "None"
    ANCHOR = "Anchor"
    STEEL = "Steel"
    BROWSERBASE = "BrowserBase"
    CAMOUFOX = "Camoufox"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Proxy(BaseModel):
    """Proxy settings made of a server string plus a username/password pair."""

    # Proxy server to connect through.
    # NOTE(review): the expected format (URL vs host:port) is not visible here.
    server: str
    # Credentials used to authenticate against the proxy server.
    username: str
    password: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def fetch_handler(key: str) -> tuple[type, type]:
    """Resolve a handler key to its ``(input type, handler type)`` pair.

    The registry entry for ``key`` names a submodule of this package together
    with two attributes of that submodule. The submodule is imported only when
    this function is called, and the two attributes are returned.

    Args:
        key: Name of a registered handler (a key of ``HANDLERS_DICT``).

    Returns:
        A ``(input_type, handler_type)`` tuple taken from the imported module.

    Raises:
        ValueError: If ``key`` is not a registered handler.
    """
    spec = HANDLERS_DICT.get(key)
    if spec is None:
        raise ValueError(f"Unknown handler key: {key}")

    # Import relative to this package so only the requested handler is loaded.
    handler_module = importlib.import_module(f"{__package__}.{spec.module_name}")

    return getattr(handler_module, spec.input_name), getattr(handler_module, spec.handler_name)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class HandlerTuple(NamedTuple):
    """Registry record describing where a handler and its input model live."""

    # Submodule of this package to import (e.g. "falco").
    module_name: str
    # Attribute name of the handler's input type inside that submodule.
    input_name: str
    # Attribute name of the handler (benchmark) type inside that submodule.
    handler_name: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# Registry of the available handlers, keyed by the public handler name.
# Each entry is looked up by ``fetch_handler``, which imports the named
# submodule on demand and pulls the two named attributes out of it.
HANDLERS_DICT: dict[str, HandlerTuple] = {
    "Falco": HandlerTuple("falco", "FalcoInput", "FalcoBench"),
    "BrowserUse": HandlerTuple("browseruse", "BrowserUseInput", "BrowserUseBench"),
    "BrowserUseAPI": HandlerTuple("browseruse_api", "BrowserUseAPIInput", "BrowserUseAPIBench"),
    "Convergence": HandlerTuple("convergence", "ConvergenceInput", "ConvergenceBench"),
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def trim_image_messages(input_content: list[dict[Any, Any]]) -> None:
    """Replace image payloads in a message history with a short placeholder.

    The messages are modified in place. For every message whose ``"content"``
    is a list, each dict part with ``"type" == "image_url"`` and an
    ``"image_url"`` key has that key's value replaced by the string
    ``"benchmark: removed"``.

    Anything that is not shaped like that is left untouched: non-dict
    messages, string ``content``, and non-dict content parts. Plain-string
    parts matter in particular: ``"type" in part`` is a substring test on a
    string, so without the ``isinstance`` guard a string containing "type"
    would be indexed with a string key and raise ``TypeError``.

    Args:
        input_content: Message history to trim, mutated in place.
    """
    for msg in input_content:
        # Defensive: tolerate entries that are not dicts despite the annotation.
        if not isinstance(msg, dict):  # type: ignore
            continue

        parts = msg.get("content")
        if not isinstance(parts, list):
            continue

        for part in parts:  # type: ignore
            if isinstance(part, dict) and part.get("type") == "image_url" and "image_url" in part:
                part["image_url"] = "benchmark: removed"
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
import typing
|
|
5
|
+
|
|
6
|
+
# posthog seems to deadlock tasks otherwise
|
|
7
|
+
os.environ["ANONYMIZED_TELEMETRY"] = "false"
|
|
8
|
+
|
|
9
|
+
from notte_core.utils.webp_replay import ScreenshotReplay
|
|
10
|
+
from pydantic import BaseModel, SecretStr, ValidationError
|
|
11
|
+
from typing_extensions import override
|
|
12
|
+
|
|
13
|
+
from notte_eval.agent_handlers import Proxy, trim_image_messages
|
|
14
|
+
from notte_eval.data.load_data import BenchmarkTask
|
|
15
|
+
from notte_eval.patcher import AgentPatcher, FunctionLog
|
|
16
|
+
from notte_eval.task_types import AgentBenchmark, LLMCall, Step, TaskResult
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
from browser_use import Agent as BrowserUseAgent # type: ignore
|
|
20
|
+
from browser_use import AgentHistoryList, Browser, BrowserConfig # type: ignore
|
|
21
|
+
from browser_use.controller.views import DoneAction # type: ignore
|
|
22
|
+
from langchain_openai import ChatOpenAI
|
|
23
|
+
except ImportError:
|
|
24
|
+
raise ImportError("Install notte[browseruse] to fix")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# solely for simplicity of parsing response
|
|
28
|
+
class BUAgentCurrentState(BaseModel):
    """The ``current_state`` section of a browser-use tool-call argument payload.

    Used only to parse the LLM response so that it can be pretty-printed.
    """

    # The agent's evaluation of its previous goal.
    evaluation_previous_goal: str
    # The agent's running memory text.
    memory: str
    # The goal the agent states for its next step.
    next_goal: str
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class BUAgentArguments(BaseModel):
    """Parsed form of the JSON ``arguments`` string of a browser-use tool call."""

    # State summary reported by the agent alongside its actions.
    current_state: BUAgentCurrentState
    # Actions requested in this call; kept as raw dicts and only printed.
    action: list[dict[str, typing.Any]]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# used for the io to the benchmark (toml)
|
|
40
|
+
class BrowserUseInput(BaseModel):
    """Parameters for a ``BrowserUseBench`` run."""

    # Forwarded to the browser-use agent's ``use_vision`` argument.
    use_vision: bool
    # Model name passed to ``ChatOpenAI``.
    model: str
    # Forwarded to ``BrowserConfig(headless=...)``.
    headless: bool
    # Step budget passed to ``agent.run(max_steps=...)``.
    max_steps: int
    # When true, a CDP session is created through ``AnchorSessionsManager``
    # and the browser connects to its ``cdp_url``.
    use_anchor: bool
    # Optional proxy; dumped to a dict and passed to ``BrowserConfig(proxy=...)``.
    proxy: Proxy | None = None
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class BrowserUseOutput(BaseModel):
    """Raw result of a browser-use run, before conversion to a ``TaskResult``."""

    # Every call recorded by the ``AgentPatcher``, keyed by function name
    # (e.g. "Agent.run").
    logged_data: dict[str, list[FunctionLog]]
    # One entry per recorded "Agent.step" call: the step's own log, plus the
    # logged calls that the patcher reports as encompassed by that step.
    per_step_calls: list[tuple[FunctionLog, dict[str, list[FunctionLog]]]]
    # History object returned by ``agent.run``.
    history: AgentHistoryList
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class BrowserUseBench(AgentBenchmark[BrowserUseInput, BrowserUseOutput]):
    """Benchmark handler that runs a browser-use agent and reports its trace.

    ``run_agent`` drives the agent on a single task while an ``AgentPatcher``
    records its LLM and step calls; ``process_output`` turns that recording
    into a ``TaskResult``.
    """

    def __init__(self, params: BrowserUseInput):
        super().__init__(params)

    @override
    async def run_agent(self, task: BenchmarkTask) -> BrowserUseOutput:
        """Run the browser-use agent on ``task`` and return the recorded trace.

        The browser context, the browser and (when ``use_anchor`` is set) the
        Anchor session pool are all released when the run ends, whether it
        succeeds or raises.

        Args:
            task: Benchmark task providing the question and an optional URL.

        Returns:
            The patcher logs, the per-step call grouping and the agent history.
        """
        prompt = f"""You are a helpful web agent.
Now you are given the task: {task.question}.
Please interact with : {task.url or "the web"} to get the answer.
"""

        proxy = self.params.proxy.model_dump() if self.params.proxy is not None else None

        llm = ChatOpenAI(model=self.params.model, api_key=SecretStr(os.getenv("OPENAI_API_KEY", "")))

        pool = None
        if self.params.use_anchor:
            from notte_integrations.sessions.anchor import AnchorSessionsManager

            pool = AnchorSessionsManager()
            await pool.start()

        browser = None
        context = None
        try:
            # Created inside the try so a failure here still stops the pool.
            wss_url = None
            if pool is not None:
                session = pool.create_session_cdp()
                wss_url = session.cdp_url

            browser = Browser(config=BrowserConfig(headless=self.params.headless, cdp_url=wss_url, proxy=proxy))  # type: ignore
            context = await browser.new_context()
            agent = BrowserUseAgent(  # type: ignore
                browser=browser,
                browser_context=context,
                task=prompt,
                llm=llm,
                use_vision=self.params.use_vision,
            )

            patcher = AgentPatcher()
            _ = patcher.log(agent.llm, ["invoke", "ainvoke"])
            _ = patcher.log(agent, ["step", "run"])  # type: ignore

            result = await agent.run(max_steps=self.params.max_steps)
        finally:
            # The browser and context are injected into the agent, so they are
            # closed here. The nested finally blocks make sure a failing
            # close() does not prevent the remaining resources from being
            # released.
            try:
                if context is not None:
                    await context.close()
            finally:
                try:
                    if browser is not None:
                        await browser.close()
                finally:
                    if pool is not None:
                        await pool.stop()

        return BrowserUseOutput(
            logged_data=patcher.logged_data,
            per_step_calls=patcher.find_encompassed_events("Agent.step"),
            history=result,
        )

    @staticmethod
    def _render_tool_calls(response: dict[str, typing.Any]) -> str:
        """Build the human-readable summary of the tool calls in an LLM response.

        Tool calls without a ``function.arguments`` entry are skipped. If the
        arguments of a call do not validate, rendering stops there and whatever
        was rendered so far is returned.
        """
        parts: list[str] = []
        try:
            # A response may carry no tool calls at all.
            for tool_call in response.get("tool_calls") or []:
                if "function" not in tool_call or "arguments" not in tool_call["function"]:
                    continue

                args = BUAgentArguments.model_validate_json(tool_call["function"]["arguments"])

                parts.append(f"🔎 {args.current_state.evaluation_previous_goal}\n")
                parts.append(f"🧠 {args.current_state.memory}\n")
                parts.append(f"🎯 {args.current_state.next_goal}\n")
                parts.append("🛠️ Actions: \n")
                parts.extend(f" - {action}\n" for action in args.action)
        except ValidationError as err:
            # Pretty-printing is best effort; keep the partial text but leave a trace.
            logging.warning("Could not parse browser-use tool call arguments: %s", err)

        return "".join(parts)

    @override
    async def process_output(self, task: BenchmarkTask, out: BrowserUseOutput) -> TaskResult:
        """Convert the raw run output into a ``TaskResult``.

        Each recorded step is paired with the history entry at the same
        position; a length mismatch is logged and the extra entries are
        dropped by ``zip``.

        Args:
            task: The task that was run.
            out: Output of ``run_agent`` for that task.

        Returns:
            The task result with per-step LLM calls, the agent's answer and
            the screenshot replay.
        """
        history = out.history.history
        len_steps = len(out.per_step_calls)
        len_history = len(history)

        if len_steps != len_history:
            logging.error(
                "Number of step calls isn't the same as the length in history:"
                + f"{len_steps=}, {len_history=}.\n"
                + "There will likely be a mismatch."
            )

        steps: list[Step] = []
        screenshots: list[str] = []
        for (step_log, in_step_calls), hist in zip(out.per_step_calls, history):
            screen = hist.state.screenshot
            if screen is not None:
                screenshots.append(screen)

            llm_calls: list[LLMCall] = []
            # A step that made no LLM call has no entry under this key.
            for llm_call_log in in_step_calls.get("BaseChatModel.ainvoke", []):
                input_content = json.loads(llm_call_log.input_data)["input"]

                # trim down images
                trim_image_messages(input_content)

                output_content = json.loads(llm_call_log.output_data)
                response = output_content["additional_kwargs"]
                tokens = output_content["response_metadata"]["token_usage"]

                llm_calls.append(
                    LLMCall(
                        input_tokens=tokens["prompt_tokens"],
                        output_tokens=tokens["completion_tokens"],
                        messages_in=input_content,
                        message_out=response,
                        pretty_out=self._render_tool_calls(response),
                    )
                )

            steps.append(
                Step(
                    url=hist.state.url,
                    duration_in_s=step_log.duration_in_s,
                    llm_calls=llm_calls,
                )
            )

        # An empty history has no last output; fall through to the default answer.
        last_out = history[-1].model_output if history else None

        # default to the full string of the last output, otherwise pick out the answer if we can
        answer = str(last_out)
        try:
            if last_out is not None:
                for action in last_out.action:
                    # The attribute may exist but be unset, so test the value.
                    done = getattr(action, "done", None)
                    if done is not None:
                        answer = typing.cast(DoneAction, done).text
                        break
        except Exception:
            # Deliberately best effort: any surprise in the output shape falls
            # back to the stringified last output.
            answer = str(last_out)

        return TaskResult(
            success=out.history.is_successful() or False,
            duration_in_s=out.logged_data["Agent.run"][0].duration_in_s,
            agent_answer=answer,
            task=task,
            steps=steps,
            screenshots=ScreenshotReplay.from_base64(screenshots),
        )
|