notte-eval 0.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. notte_eval-0.0.dev0/.gitignore +179 -0
  2. notte_eval-0.0.dev0/PKG-INFO +33 -0
  3. notte_eval-0.0.dev0/README.md +0 -0
  4. notte_eval-0.0.dev0/pyproject.toml +118 -0
  5. notte_eval-0.0.dev0/src/notte_eval/__init__.py +3 -0
  6. notte_eval-0.0.dev0/src/notte_eval/agent_handlers/__init__.py +58 -0
  7. notte_eval-0.0.dev0/src/notte_eval/agent_handlers/browseruse.py +201 -0
  8. notte_eval-0.0.dev0/src/notte_eval/agent_handlers/browseruse_api.py +147 -0
  9. notte_eval-0.0.dev0/src/notte_eval/agent_handlers/convergence.py +185 -0
  10. notte_eval-0.0.dev0/src/notte_eval/agent_handlers/falco.py +238 -0
  11. notte_eval-0.0.dev0/src/notte_eval/agent_handlers/mock.py +35 -0
  12. notte_eval-0.0.dev0/src/notte_eval/data/__init__.py +0 -0
  13. notte_eval-0.0.dev0/src/notte_eval/data/eval.py +204 -0
  14. notte_eval-0.0.dev0/src/notte_eval/data/gaia/GAIA_webvoyager.jsonl +90 -0
  15. notte_eval-0.0.dev0/src/notte_eval/data/load_data.py +97 -0
  16. notte_eval-0.0.dev0/src/notte_eval/data/scratch/proxy.jsonl +9 -0
  17. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data.jsonl +643 -0
  18. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data_short.jsonl +96 -0
  19. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data_simple.jsonl +30 -0
  20. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/WebVoyager_data_single.jsonl +1 -0
  21. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/archive/webvoyager_excluded.jsonl +55 -0
  22. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/convert.py +23 -0
  23. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager.jsonl +643 -0
  24. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_convergence.jsonl +601 -0
  25. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_excluded.jsonl +54 -0
  26. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_simple.jsonl +30 -0
  27. notte_eval-0.0.dev0/src/notte_eval/data/webvoyager/webvoyager_single.jsonl +1 -0
  28. notte_eval-0.0.dev0/src/notte_eval/evaluators/__init__.py +26 -0
  29. notte_eval-0.0.dev0/src/notte_eval/evaluators/evaluator.py +31 -0
  30. notte_eval-0.0.dev0/src/notte_eval/evaluators/webvoyager.py +109 -0
  31. notte_eval-0.0.dev0/src/notte_eval/patcher.py +189 -0
  32. notte_eval-0.0.dev0/src/notte_eval/py.typed +0 -0
  33. notte_eval-0.0.dev0/src/notte_eval/run.py +398 -0
  34. notte_eval-0.0.dev0/src/notte_eval/task_types.py +98 -0
@@ -0,0 +1,179 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ ignore.*
171
+ llm_usage.jsonl
172
+ llm_parsing_error.jsonl
173
+ traces/
174
+
175
+ **/__pycache__/**
176
+ .DS_Store
177
+ **/.DS_Store
178
+ old
179
+ notebook
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: notte-eval
3
+ Version: 0.0.dev0
4
+ Summary: The evaluation for Notte
5
+ Author-email: Notte Team <hello@notte.cc>
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: notte-agent==0.0.dev
8
+ Requires-Dist: notte-browser==0.0.dev
9
+ Requires-Dist: notte-core==0.0.dev
10
+ Requires-Dist: notte-integrations==0.0.dev
11
+ Requires-Dist: notte-sdk==0.0.dev
12
+ Provides-Extra: api
13
+ Requires-Dist: fastapi>=0.115.8; extra == 'api'
14
+ Requires-Dist: uvicorn>=0.29.0; extra == 'api'
15
+ Provides-Extra: browser-use
16
+ Requires-Dist: browser-use>=0.1.40; extra == 'browser-use'
17
+ Requires-Dist: langchain-google-genai>=2.1.1; extra == 'browser-use'
18
+ Provides-Extra: browserbase
19
+ Requires-Dist: browserbase>=1.2.0; extra == 'browserbase'
20
+ Provides-Extra: camoufox
21
+ Requires-Dist: camoufox[geoip]>=0.4.11; extra == 'camoufox'
22
+ Provides-Extra: convergence
23
+ Requires-Dist: proxy-lite; extra == 'convergence'
24
+ Provides-Extra: discord
25
+ Requires-Dist: discord-py<2.5.0,>=2.3.0; extra == 'discord'
26
+ Provides-Extra: embedding
27
+ Requires-Dist: sentence-transformers>=3.4.1; extra == 'embedding'
28
+ Provides-Extra: server
29
+ Requires-Dist: litellm[proxy]>=1.61.16; extra == 'server'
30
+ Provides-Extra: slack
31
+ Requires-Dist: slack-sdk>=3.34.0; extra == 'slack'
32
+ Provides-Extra: vault
33
+ Requires-Dist: hvac>=2.3.0; extra == 'vault'
File without changes
@@ -0,0 +1,118 @@
1
+ [project]
2
+ name = "notte-eval"
3
+ version = "0.0.dev"
4
+ description = "The evaluation for Notte"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Notte Team ", email = "hello@notte.cc" }
8
+ ]
9
+ packages = [
10
+ { include = "notte_eval", from = "src" },
11
+ ]
12
+
13
+
14
+ requires-python = ">=3.11"
15
+ dependencies = [
16
+ "notte-agent==0.0.dev",
17
+ "notte-browser==0.0.dev",
18
+ "notte-core==0.0.dev",
19
+ "notte-sdk==0.0.dev",
20
+ "notte-integrations==0.0.dev",
21
+ ]
22
+
23
+ [project.optional-dependencies]
24
+ vault = [
25
+ "hvac>=2.3.0",
26
+ ]
27
+ server = [
28
+ "litellm[proxy]>=1.61.16",
29
+ ]
30
+ embedding = [
31
+ "sentence-transformers>=3.4.1",
32
+ ]
33
+ api = [
34
+ "fastapi>=0.115.8",
35
+ "uvicorn>=0.29.0",
36
+ ]
37
+ discord = [
38
+ "discord-py>=2.3.0,<2.5.0",
39
+ ]
40
+ slack = [
41
+ "slack-sdk>=3.34.0",
42
+ ]
43
+
44
+ browserbase = [
45
+ "browserbase>=1.2.0",
46
+ ]
47
+ camoufox = [
48
+ "camoufox[geoip]>=0.4.11",
49
+ ]
50
+ browser-use = [
51
+ "browser-use>=0.1.40",
52
+ "langchain-google-genai>=2.1.1",
53
+ ]
54
+ convergence = [
55
+ "proxy-lite",
56
+ ]
57
+
58
+ [build-system]
59
+ requires = ["hatchling"]
60
+ build-backend = "hatchling.build"
61
+
62
+ [dependency-groups]
63
+
64
+ dev = [
65
+ "aiomultiprocess>=0.9.1",
66
+ "basedpyright>=1.27.1",
67
+ "cloudpickle>=3.1.1",
68
+ "joblib>=1.4.2",
69
+ "jupyter>=1.1.1",
70
+ "pandas",
71
+ "pebble>=5.1.1",
72
+ "pre-commit>=4.1.0",
73
+ "pytest>=8.3.4",
74
+ "pytest-asyncio>=0.25.3",
75
+ "pytest-mock>=3.14.0",
76
+ ]
77
+ lint = [
78
+ "ruff>=0.9.7",
79
+ ]
80
+
81
+ [tool.pytest.ini_options]
82
+ asyncio_default_fixture_loop_scope = "function"
83
+ testpaths = ["tests"]
84
+ timeout = 60
85
+ asyncio_mode = "strict"
86
+ log_cli = true
87
+ log_cli_level = "INFO"
88
+ filterwarnings = [
89
+ "ignore::DeprecationWarning:sklearn.utils.fixes:",
90
+ "ignore::DeprecationWarning:pandas.core.common:",
91
+ "ignore::pydantic.warnings.PydanticDeprecatedSince20:",
92
+ "ignore::DeprecationWarning:importlib.resources._legacy:",
93
+ "ignore::DeprecationWarning:litellm.utils:",
94
+ "ignore:open_text is deprecated*:DeprecationWarning",
95
+ "ignore:distutils Version classes are deprecated. Use packaging.version instead.*:DeprecationWarning",
96
+ 'ignore:configuration option "asyncio_default_fixture_loop_scope" is unset',
97
+ "ignore:Valid config keys have changed in V2*:UserWarning"
98
+ ]
99
+
100
+ [tool.ruff]
101
+ line-length = 120
102
+ indent-width = 4
103
+ exclude = [".venv"]
104
+
105
+ [tool.ruff.lint]
106
+ extend-select = ["I"]
107
+
108
+ [tool.basedpyright]
109
+ exclude = [".venv", "uv-cache", "**/site-packages/**", "**/dist-packages/**", "tests", "old", "notebook", "dist"]
110
+ enableTypeIgnoreComments = true
111
+ reportIgnoreCommentWithoutRule = false
112
+ failOnWarnings = true
113
+ reportAny = false
114
+ reportExplicitAny = false
115
+
116
+ [tool.uv.sources]
117
+ maincontentextractor = { git = "https://github.com/HawkClaws/main_content_extractor", rev = "7c3ed7f6ed7f6c10223a3357d43ab741663bc812" }
118
+ proxy-lite = { git = "https://github.com/leo-notte/proxy-lite" }
@@ -0,0 +1,3 @@
1
+ from notte_core import check_notte_version
2
+
3
+ __version__ = check_notte_version("notte_eval")  # version value is whatever notte_core's check_notte_version returns for this package name (helper defined outside this file)
@@ -0,0 +1,58 @@
1
+ import importlib
2
+ from enum import StrEnum
3
+ from typing import Any, NamedTuple
4
+
5
+ from pydantic import BaseModel
6
+
7
+
8
+ class PoolEnum(StrEnum):
9
+ NONE = "None"
10
+ ANCHOR = "Anchor"
11
+ STEEL = "Steel"
12
+ BROWSERBASE = "BrowserBase"
13
+ CAMOUFOX = "Camoufox"
14
+
15
+
16
+ class Proxy(BaseModel):  # proxy settings; browseruse.py dumps this model to a dict and passes it as BrowserConfig(proxy=...)
17
+ server: str  # presumably the proxy endpoint address -- TODO confirm the format the browser backend expects
18
+ username: str
19
+ password: str  # NOTE(review): plain str (not SecretStr), so the value is included in model_dump() output
20
+
21
+
22
def fetch_handler(key: str) -> tuple[type, type]:
    """Resolve a handler key to its ``(input type, handler type)`` pair.

    The submodule named by the registry entry is imported lazily, so the
    optional dependencies of a handler are only needed when it is requested.

    Args:
        key: A key of ``HANDLERS_DICT``.

    Returns:
        The input class and the handler class looked up by name on the
        imported submodule.

    Raises:
        ValueError: If ``key`` is not registered in ``HANDLERS_DICT``.
    """
    spec = HANDLERS_DICT.get(key)
    if spec is None:
        raise ValueError(f"Unknown handler key: {key}")

    # Import relative to this package, e.g. "<package>.falco".
    loaded = importlib.import_module(f"{__package__}.{spec.module_name}")
    return getattr(loaded, spec.input_name), getattr(loaded, spec.handler_name)
36
+
37
+
38
class HandlerTuple(NamedTuple):
    """Registry entry telling ``fetch_handler`` where a handler lives."""

    # Submodule name, resolved relative to this package.
    module_name: str
    # Attribute name of the input class inside that submodule.
    input_name: str
    # Attribute name of the handler class inside that submodule.
    handler_name: str
42
+
43
+
44
# Registry of known handlers: key -> where to find its input and handler classes.
# The referenced submodules are imported on demand by fetch_handler.
HANDLERS_DICT: dict[str, HandlerTuple] = {
    "Falco": HandlerTuple(
        module_name="falco",
        input_name="FalcoInput",
        handler_name="FalcoBench",
    ),
    "BrowserUse": HandlerTuple(
        module_name="browseruse",
        input_name="BrowserUseInput",
        handler_name="BrowserUseBench",
    ),
    "BrowserUseAPI": HandlerTuple(
        module_name="browseruse_api",
        input_name="BrowserUseAPIInput",
        handler_name="BrowserUseAPIBench",
    ),
    "Convergence": HandlerTuple(
        module_name="convergence",
        input_name="ConvergenceInput",
        handler_name="ConvergenceBench",
    ),
}
50
+
51
+
52
def trim_image_messages(input_content: list[dict[Any, Any]]) -> None:
    """Replace image payloads in a message history with a placeholder, in place.

    For every message whose ``"content"`` is a list, each dict part with
    ``"type" == "image_url"`` and an ``"image_url"`` key has that key's value
    overwritten with the string ``"benchmark: removed"``. Everything else is
    left untouched.

    Args:
        input_content: Message dicts to trim. The list and its nested dicts
            are mutated; nothing is returned.
    """
    for msg in input_content:
        content = msg.get("content")
        if not isinstance(content, list):
            continue

        for part in content:
            # Content lists may mix plain strings with dict parts. Without the
            # isinstance guard, a string containing the text "type" passes a
            # substring `in` test and then fails on `part["type"]`.
            if isinstance(part, dict) and part.get("type") == "image_url" and "image_url" in part:
                part["image_url"] = "benchmark: removed"
@@ -0,0 +1,201 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import typing
5
+
6
+ # posthog seems to deadlock tasks otherwise
7
+ os.environ["ANONYMIZED_TELEMETRY"] = "false"
8
+
9
+ from notte_core.utils.webp_replay import ScreenshotReplay
10
+ from pydantic import BaseModel, SecretStr, ValidationError
11
+ from typing_extensions import override
12
+
13
+ from notte_eval.agent_handlers import Proxy, trim_image_messages
14
+ from notte_eval.data.load_data import BenchmarkTask
15
+ from notte_eval.patcher import AgentPatcher, FunctionLog
16
+ from notte_eval.task_types import AgentBenchmark, LLMCall, Step, TaskResult
17
+
18
# Optional dependencies: fail early with an actionable message when they are missing.
try:
    from browser_use import Agent as BrowserUseAgent  # type: ignore
    from browser_use import AgentHistoryList, Browser, BrowserConfig  # type: ignore
    from browser_use.controller.views import DoneAction  # type: ignore
    from langchain_openai import ChatOpenAI
except ImportError as e:
    # The extra is declared as "browser-use" on the notte-eval distribution.
    # Chain the original error so the missing module stays visible in the traceback.
    raise ImportError(f"Install notte-eval[browser-use] to fix (missing module: {e.name})") from e
25
+
26
+
27
+ # solely for simplicity of parsing response
28
+ class BUAgentCurrentState(BaseModel):  # "current_state" part of a tool call's JSON arguments; its fields are printed in the pretty output built by process_output
29
+ evaluation_previous_goal: str
30
+ memory: str
31
+ next_goal: str
32
+
33
+
34
+ class BUAgentArguments(BaseModel):  # validated from tool_call["function"]["arguments"] with model_validate_json
35
+ current_state: BUAgentCurrentState
36
+ action: list[dict[str, typing.Any]]  # each entry is rendered as one " - {action}" line in the pretty output
37
+
38
+
39
+ # used for the io to the benchmark (toml)
40
+ class BrowserUseInput(BaseModel):  # run parameters read by BrowserUseBench.run_agent through self.params
41
+ use_vision: bool  # forwarded to the browser-use Agent
42
+ model: str  # model name given to ChatOpenAI
43
+ headless: bool  # forwarded to BrowserConfig
44
+ max_steps: int  # passed to agent.run(max_steps=...)
45
+ use_anchor: bool  # when true, a CDP session is created through AnchorSessionsManager and its cdp_url is used
46
+ proxy: Proxy | None = None  # optional; dumped to a dict and passed as BrowserConfig(proxy=...)
47
+
48
+
49
+ class BrowserUseOutput(BaseModel):  # raw artifacts returned by run_agent and consumed by process_output
50
+ logged_data: dict[str, list[FunctionLog]]  # patcher.logged_data; process_output reads the first "Agent.run" entry for the total duration
51
+ per_step_calls: list[tuple[FunctionLog, dict[str, list[FunctionLog]]]]  # result of patcher.find_encompassed_events("Agent.step")
52
+ history: AgentHistoryList  # value returned by agent.run()
53
+
54
+
55
class BrowserUseBench(AgentBenchmark[BrowserUseInput, BrowserUseOutput]):
    """Benchmark handler that runs a task with a browser-use agent.

    ``run_agent`` drives the agent while an ``AgentPatcher`` records its LLM
    and step calls; ``process_output`` turns those records plus the agent
    history into a ``TaskResult``.
    """

    def __init__(self, params: BrowserUseInput):
        super().__init__(params)

    @override
    async def run_agent(self, task: BenchmarkTask) -> BrowserUseOutput:
        """Run the browser-use agent on ``task`` and return the raw logs.

        Args:
            task: Benchmark task; ``task.question`` and ``task.url`` are
                interpolated into the agent prompt.

        Returns:
            The patcher logs, the per-step grouped calls and the agent history.
        """
        # Function-scope import, matching the lazy Anchor import below.
        from contextlib import AsyncExitStack

        prompt = (
            "You are a helpful web agent.\n"
            f"Now you are given the task: {task.question}.\n"
            f'Please interact with : {task.url or "the web"} to get the answer.\n'
        )

        proxy = self.params.proxy.model_dump() if self.params.proxy is not None else None
        llm = ChatOpenAI(model=self.params.model, api_key=SecretStr(os.getenv("OPENAI_API_KEY", "")))

        # Cleanups are registered as soon as each resource exists and run in
        # reverse order (context, browser, pool) even when a later step or an
        # earlier cleanup raises.
        async with AsyncExitStack() as stack:
            wss_url = None
            if self.params.use_anchor:
                from notte_integrations.sessions.anchor import AnchorSessionsManager

                pool = AnchorSessionsManager()
                await pool.start()
                # Registered only after a successful start, and before the
                # session is created, so a failing create_session_cdp() still
                # stops the pool.
                stack.push_async_callback(pool.stop)

                session = pool.create_session_cdp()
                wss_url = session.cdp_url

            browser = Browser(config=BrowserConfig(headless=self.params.headless, cdp_url=wss_url, proxy=proxy))  # type: ignore
            # The browser is handed to the agent from outside, so it is closed here.
            stack.push_async_callback(browser.close)

            context = await browser.new_context()
            stack.push_async_callback(context.close)

            agent = BrowserUseAgent(  # type: ignore
                browser=browser,
                browser_context=context,
                task=prompt,
                llm=llm,
                use_vision=self.params.use_vision,
            )

            patcher = AgentPatcher()
            _ = patcher.log(agent.llm, ["invoke", "ainvoke"])
            _ = patcher.log(agent, ["step", "run"])  # type: ignore

            result = await agent.run(max_steps=self.params.max_steps)

        return BrowserUseOutput(
            logged_data=patcher.logged_data,
            per_step_calls=patcher.find_encompassed_events("Agent.step"),
            history=result,
        )

    @staticmethod
    def _pretty_tool_calls(response: dict[str, typing.Any]) -> str:
        """Render the tool calls of one LLM response as human-readable text.

        Best effort: if a tool call's arguments fail validation, whatever was
        rendered so far is returned.
        """
        parts: list[str] = []
        try:
            # A response without tool calls yields an empty string.
            for tool_call in response.get("tool_calls") or []:
                if "function" not in tool_call or "arguments" not in tool_call["function"]:
                    continue

                args = BUAgentArguments.model_validate_json(tool_call["function"]["arguments"])

                parts.append(f"🔎 {args.current_state.evaluation_previous_goal}\n")
                parts.append(f"🧠 {args.current_state.memory}\n")
                parts.append(f"🎯 {args.current_state.next_goal}\n")
                parts.append("🛠️ Actions: \n")
                parts.extend(f" - {action}\n" for action in args.action)
        except ValidationError:
            logging.debug("Could not parse tool call arguments; pretty output is partial", exc_info=True)
        return "".join(parts)

    @staticmethod
    def _parse_llm_call(llm_call_log: FunctionLog) -> LLMCall:
        """Convert one logged ``ainvoke`` call into an ``LLMCall`` record."""
        input_content = json.loads(llm_call_log.input_data)["input"]

        # trim down images
        trim_image_messages(input_content)

        output_content = json.loads(llm_call_log.output_data)
        response = output_content["additional_kwargs"]
        tokens = output_content["response_metadata"]["token_usage"]

        return LLMCall(
            input_tokens=tokens["prompt_tokens"],
            output_tokens=tokens["completion_tokens"],
            messages_in=input_content,
            message_out=response,
            pretty_out=BrowserUseBench._pretty_tool_calls(response),
        )

    @override
    async def process_output(self, task: BenchmarkTask, out: BrowserUseOutput) -> TaskResult:
        """Build the ``TaskResult`` for ``task`` from the raw agent output.

        Args:
            task: The task that was run; stored on the result.
            out: Output of ``run_agent``.

        Returns:
            Success flag, total duration, extracted answer, per-step LLM calls
            and a screenshot replay.
        """
        history = out.history.history
        len_steps = len(out.per_step_calls)
        len_history = len(history)

        if len_steps != len_history:
            logging.error(
                "Number of step calls isn't the same as the length in history:"
                + f"{len_steps=}, {len_history=}.\n"
                + "There will likely be a mismatch."
            )

        steps: list[Step] = []
        screenshots: list[str] = []
        # On a length mismatch (logged above) the extra entries are dropped.
        for (step_log, in_step_calls), hist in zip(out.per_step_calls, history):
            screen = hist.state.screenshot
            if screen is not None:
                screenshots.append(screen)

            # A step may have no recorded LLM call.
            llm_calls = [
                self._parse_llm_call(llm_call_log)
                for llm_call_log in in_step_calls.get("BaseChatModel.ainvoke", [])
            ]

            steps.append(
                Step(
                    url=hist.state.url,
                    duration_in_s=step_log.duration_in_s,
                    llm_calls=llm_calls,
                )
            )

        # An empty history has no last output.
        last_out = history[-1].model_output if history else None

        # default to the full string of the last output, otherwise pick out the answer if we can
        answer = str(last_out)
        try:
            if last_out is not None:
                for action in last_out.action:
                    # The attribute may exist but be unset; skip those so a
                    # later "done" action can still be found.
                    done = getattr(action, "done", None)
                    if done is not None:
                        answer = typing.cast(DoneAction, done).text
                        break
        except Exception:
            # Deliberate best effort: fall back to the raw output string.
            answer = str(last_out)

        return TaskResult(
            success=out.history.is_successful() or False,
            duration_in_s=out.logged_data["Agent.run"][0].duration_in_s,
            agent_answer=answer,
            task=task,
            steps=steps,
            screenshots=ScreenshotReplay.from_base64(screenshots),
        )