hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +7 -4
- hud/adapters/common/adapter.py +14 -3
- hud/adapters/common/tests/test_adapter.py +16 -4
- hud/datasets.py +188 -0
- hud/env/docker_client.py +14 -2
- hud/env/local_docker_client.py +28 -6
- hud/gym.py +0 -9
- hud/{mcp_agent → mcp}/__init__.py +2 -0
- hud/mcp/base.py +631 -0
- hud/{mcp_agent → mcp}/claude.py +52 -47
- hud/mcp/client.py +312 -0
- hud/{mcp_agent → mcp}/langchain.py +52 -33
- hud/{mcp_agent → mcp}/openai.py +56 -40
- hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
- hud/mcp/tests/test_claude.py +294 -0
- hud/mcp/tests/test_client.py +324 -0
- hud/mcp/tests/test_openai.py +238 -0
- hud/settings.py +6 -0
- hud/task.py +2 -88
- hud/taskset.py +2 -23
- hud/telemetry/__init__.py +5 -0
- hud/telemetry/_trace.py +180 -17
- hud/telemetry/context.py +79 -0
- hud/telemetry/exporter.py +165 -6
- hud/telemetry/job.py +141 -0
- hud/telemetry/tests/test_trace.py +36 -25
- hud/tools/__init__.py +14 -1
- hud/tools/computer/hud.py +13 -0
- hud/tools/executors/__init__.py +19 -2
- hud/tools/executors/pyautogui.py +84 -50
- hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
- hud/tools/playwright_tool.py +73 -67
- hud/tools/tests/test_edit.py +8 -1
- hud/tools/tests/test_tools.py +3 -0
- hud/trajectory.py +5 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/METADATA +20 -14
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/RECORD +42 -47
- hud/evaluators/__init__.py +0 -9
- hud/evaluators/base.py +0 -32
- hud/evaluators/inspect.py +0 -24
- hud/evaluators/judge.py +0 -189
- hud/evaluators/match.py +0 -156
- hud/evaluators/remote.py +0 -65
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +0 -12
- hud/evaluators/tests/test_judge.py +0 -231
- hud/evaluators/tests/test_match.py +0 -115
- hud/evaluators/tests/test_remote.py +0 -98
- hud/mcp_agent/base.py +0 -723
- hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/WHEEL +0 -0
- {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
|
-
Project-URL: Homepage, https://github.com/hud-evals/hud-
|
|
6
|
-
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-
|
|
5
|
+
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
7
7
|
Project-URL: Documentation, https://docs.hud.so
|
|
8
8
|
Author-email: HUD SDK <founders@hud.so>
|
|
9
9
|
License: MIT License
|
|
@@ -35,28 +35,22 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
35
35
|
Classifier: Programming Language :: Python :: 3.12
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.13
|
|
37
37
|
Requires-Python: <3.14,>=3.11
|
|
38
|
-
Requires-Dist: aiodocker>=0.24.0
|
|
39
38
|
Requires-Dist: anthropic
|
|
39
|
+
Requires-Dist: datasets>=4.0.0
|
|
40
40
|
Requires-Dist: dotenv>=0.9.9
|
|
41
41
|
Requires-Dist: httpx<1,>=0.23.0
|
|
42
|
-
Requires-Dist: inspect-ai>=0.3.80
|
|
43
|
-
Requires-Dist: ipykernel
|
|
44
42
|
Requires-Dist: langchain
|
|
45
43
|
Requires-Dist: langchain-anthropic
|
|
46
44
|
Requires-Dist: langchain-openai
|
|
47
45
|
Requires-Dist: mcp-use>=1.3.7
|
|
48
46
|
Requires-Dist: mcp==1.12.2
|
|
49
|
-
Requires-Dist: numpy
|
|
50
47
|
Requires-Dist: openai
|
|
51
48
|
Requires-Dist: pathspec>=0.12.1
|
|
52
|
-
Requires-Dist: pillow>=11.1.0
|
|
53
|
-
Requires-Dist: pyautogui>=0.9.54
|
|
54
49
|
Requires-Dist: pydantic-settings<3,>=2
|
|
55
50
|
Requires-Dist: pydantic<3,>=2
|
|
56
|
-
Requires-Dist: textdistance<5,>=4.5.0
|
|
57
|
-
Requires-Dist: toml>=0.10.2
|
|
58
51
|
Requires-Dist: wrapt>=1.14.0
|
|
59
52
|
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
|
|
60
54
|
Requires-Dist: anthropic; extra == 'dev'
|
|
61
55
|
Requires-Dist: dotenv; extra == 'dev'
|
|
62
56
|
Requires-Dist: ipykernel; extra == 'dev'
|
|
@@ -64,17 +58,29 @@ Requires-Dist: ipython<9; extra == 'dev'
|
|
|
64
58
|
Requires-Dist: jupyter-client; extra == 'dev'
|
|
65
59
|
Requires-Dist: jupyter-core; extra == 'dev'
|
|
66
60
|
Requires-Dist: openai; extra == 'dev'
|
|
61
|
+
Requires-Dist: pillow>=11.1.0; extra == 'dev'
|
|
67
62
|
Requires-Dist: playwright; extra == 'dev'
|
|
63
|
+
Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
|
|
68
64
|
Requires-Dist: pyright==1.1.401; extra == 'dev'
|
|
69
65
|
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
70
66
|
Requires-Dist: pytest-cov; extra == 'dev'
|
|
71
67
|
Requires-Dist: pytest-mock; extra == 'dev'
|
|
72
68
|
Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
73
69
|
Requires-Dist: ruff==0.11.8; extra == 'dev'
|
|
70
|
+
Requires-Dist: toml>=0.10.2; extra == 'dev'
|
|
71
|
+
Provides-Extra: v2
|
|
72
|
+
Requires-Dist: aiodocker>=0.24.0; extra == 'v2'
|
|
73
|
+
Requires-Dist: inspect-ai>=0.3.80; extra == 'v2'
|
|
74
|
+
Requires-Dist: ipykernel; extra == 'v2'
|
|
75
|
+
Requires-Dist: numpy; extra == 'v2'
|
|
76
|
+
Requires-Dist: pillow>=11.1.0; extra == 'v2'
|
|
77
|
+
Requires-Dist: pyautogui>=0.9.54; extra == 'v2'
|
|
78
|
+
Requires-Dist: textdistance<5,>=4.5.0; extra == 'v2'
|
|
79
|
+
Requires-Dist: toml>=0.10.2; extra == 'v2'
|
|
74
80
|
Description-Content-Type: text/markdown
|
|
75
81
|
|
|
76
82
|
<div align="left">
|
|
77
|
-
<img src="https://raw.githubusercontent.com/hud-evals/hud-
|
|
83
|
+
<img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
|
|
78
84
|
</div>
|
|
79
85
|
|
|
80
86
|
<h3>
|
|
@@ -88,7 +94,7 @@ Evaluate your Computer Use AI agents across web browsers, desktop environments,
|
|
|
88
94
|
We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
89
95
|
|
|
90
96
|
|
|
91
|
-
> **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-
|
|
97
|
+
> **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-python/issues), as the SDK is still evolving!
|
|
92
98
|
|
|
93
99
|
[](https://pypi.org/project/hud-python/)
|
|
94
100
|
|
|
@@ -272,7 +278,7 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
272
278
|
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
273
279
|
title = {{HUD: An Evaluation Platform for Agents}},
|
|
274
280
|
date = {2025-04},
|
|
275
|
-
url = {https://github.com/hud-evals/hud-
|
|
281
|
+
url = {https://github.com/hud-evals/hud-python},
|
|
276
282
|
langid = {en}
|
|
277
283
|
}
|
|
278
284
|
```
|
|
@@ -1,23 +1,24 @@
|
|
|
1
|
-
hud/__init__.py,sha256=
|
|
1
|
+
hud/__init__.py,sha256=j5Zzth7_M-5DU_KJT2ZV9OfikD2aE6lzyiZA4OrLzi8,1578
|
|
2
|
+
hud/datasets.py,sha256=UZCzzXREbPhlw2ZdUFZ8EDz0lErWEeBPOPQxH71p6EA,6196
|
|
2
3
|
hud/exceptions.py,sha256=Xna_pdEK_ESwkcffsRmT5GXq4xSHLV5cu7Qu3MjstSE,5516
|
|
3
|
-
hud/gym.py,sha256
|
|
4
|
+
hud/gym.py,sha256=-hp5HdPBWf6-j0CgSoX_f2CTLssf1Wo5UhfyrnPbvkc,4774
|
|
4
5
|
hud/job.py,sha256=0vWbr3E5bYstVRzXS_6l-57JGUFcrZpmFrNkOSQ8Aa0,26969
|
|
5
|
-
hud/settings.py,sha256=
|
|
6
|
-
hud/task.py,sha256=
|
|
7
|
-
hud/taskset.py,sha256=
|
|
8
|
-
hud/trajectory.py,sha256=
|
|
6
|
+
hud/settings.py,sha256=KPzeF9OUecApYH8YYMW-8vIRhFP_6htzzZvC4RCUARc,2183
|
|
7
|
+
hud/task.py,sha256=l2mQM5Yc45kWjMXJkg1hVJfG0DLzTHAIXEvl4WLG-ho,5451
|
|
8
|
+
hud/taskset.py,sha256=QjHbcxSy7h7fmtzRHW1ewxtOIydtH7ZotttDoiABTEY,6573
|
|
9
|
+
hud/trajectory.py,sha256=LBVkFz6U_rmyooCZHN81tdOx0Z7DuAgzf0KQLejc4Fo,3937
|
|
9
10
|
hud/types.py,sha256=h7fUowbdyGF4Fg8TUnvCFoa2fflRRPi6xx7YgpBwFis,3109
|
|
10
|
-
hud/version.py,sha256=
|
|
11
|
+
hud/version.py,sha256=xXGUzDnO0wgnaDf7cvjChhGymzp3vrfpGjE5wBibi8E,104
|
|
11
12
|
hud/adapters/__init__.py,sha256=zz24KdC_e9TJPgWo6y57_8SzevEE5ak4Cm6tXzMxwRk,266
|
|
12
13
|
hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
|
|
13
14
|
hud/adapters/claude/adapter.py,sha256=vCpotJ5gzQs4PP2iCXVavIcyG8c_4m1P6fuXStwUxSo,6675
|
|
14
15
|
hud/adapters/claude/tests/__init__.py,sha256=9GZj0rz4tTkiPnLfxTmyBPr-s8UZc3gph6WH8fs8T34,39
|
|
15
16
|
hud/adapters/claude/tests/test_adapter.py,sha256=cAdHEoqLngLiV7QwlWJ0KuNgb1vNv9WZTPQMnxhMDKI,18319
|
|
16
17
|
hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
|
|
17
|
-
hud/adapters/common/adapter.py,sha256=
|
|
18
|
+
hud/adapters/common/adapter.py,sha256=fTpw7wA501nxM3ufl6WMWq4Nc3vXlUeBGS7WgvZVFjU,6180
|
|
18
19
|
hud/adapters/common/types.py,sha256=6frue7_gZlSYtOHhF2tFHqzjltzzHsTVs6-H-jQwZ4Y,9955
|
|
19
20
|
hud/adapters/common/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
-
hud/adapters/common/tests/test_adapter.py,sha256=
|
|
21
|
+
hud/adapters/common/tests/test_adapter.py,sha256=7QRpQPGM1PlMi8RcqJAT4ruGvLT9TgGmc9R5tzncN1M,8965
|
|
21
22
|
hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
|
|
22
23
|
hud/adapters/operator/adapter.py,sha256=Uz4Sr73T57B7v4RRP0uaibHI17N2hBx6Z9YYjgJCUXA,3732
|
|
23
24
|
hud/adapters/operator/tests/__init__.py,sha256=yTsDVusVXZBQL6DnXpLgKQCBRuOYUAVQ8Blk_k5GETk,41
|
|
@@ -34,61 +35,55 @@ hud/agent/tests/__init__.py,sha256=HbAW7FvSvzzKPU5LpveZceU8XTcDkRe1Bmte3OGi2f0,2
|
|
|
34
35
|
hud/agent/tests/test_base.py,sha256=MAHx4QWsX4y4jXDoA1sxWw8uFvL7lIzGlXrnHfOTmkw,8511
|
|
35
36
|
hud/env/__init__.py,sha256=wVEesXMXM5hcNXQHt0-PN4-9RnE69DEnQENS7uJSv_Y,266
|
|
36
37
|
hud/env/client.py,sha256=brhfLkWGSuvxl3vqGMCQT-vXfj8rUbJMhE3zJg9WMDA,869
|
|
37
|
-
hud/env/docker_client.py,sha256=
|
|
38
|
+
hud/env/docker_client.py,sha256=55PTFansUDzsRMT_43eSTVO9rb_wzl_s4aBpBqmMeXk,11749
|
|
38
39
|
hud/env/environment.py,sha256=wjMBwGs5qkkXsVlXR_Z2QPZi4cwXE82ckdzRgHiXPjw,17019
|
|
39
|
-
hud/env/local_docker_client.py,sha256=
|
|
40
|
+
hud/env/local_docker_client.py,sha256=IIuPSV_KJsfCONJAIVkgq_2zgUJl-FE4e5tDkkbRp0Y,12442
|
|
40
41
|
hud/env/remote_client.py,sha256=tP5Gn1YtYgsjdXA4vM4FibAAHnR-9OOH4GrTog97cf8,6670
|
|
41
42
|
hud/env/remote_docker_client.py,sha256=sBoOz3cq9HMgVvX8qCYEhRLvdswMZLG9G4Ybc60RzDo,9574
|
|
42
|
-
hud/
|
|
43
|
-
hud/
|
|
44
|
-
hud/
|
|
45
|
-
hud/
|
|
46
|
-
hud/
|
|
47
|
-
hud/
|
|
48
|
-
hud/
|
|
49
|
-
hud/
|
|
50
|
-
hud/
|
|
51
|
-
hud/
|
|
52
|
-
hud/
|
|
53
|
-
hud/mcp_agent/__init__.py,sha256=0R8SGgg2XU25y7B4lnBRv1n33d9TV6vaPXLafoiya2Y,324
|
|
54
|
-
hud/mcp_agent/base.py,sha256=P92Bcj3VH8veWgG6Yrq6cnE2gOnRaVG0NhEXdI-C8CA,29142
|
|
55
|
-
hud/mcp_agent/claude.py,sha256=5ORCs8PecqkRy2h5pVadxCIzJkjXZPPgkfOsGwJcJR4,11691
|
|
56
|
-
hud/mcp_agent/langchain.py,sha256=JOD10jeFuW4ekgEu7fzKWuveBTTOV0CTIld98fNMbz0,8136
|
|
57
|
-
hud/mcp_agent/openai.py,sha256=7SvbuKraLzlN4aGRsSkFtAVr1YldQmZ_9R8pRTWdQU0,12579
|
|
58
|
-
hud/mcp_agent/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
|
|
59
|
-
hud/mcp_agent/tests/test_base.py,sha256=7j_Id__Fd-d0VDRmfqyYM_p8JtF35mTPR90I8LeUXrI,16109
|
|
43
|
+
hud/mcp/__init__.py,sha256=VBAZfpD8Ww59CkWb4CB0rGhNGqJYtc5y1gWZWHDaViQ,371
|
|
44
|
+
hud/mcp/base.py,sha256=H4CRVGG4aEXAk_qRk3iOi-KLf8AVuffmoXPTaSXD4_0,24376
|
|
45
|
+
hud/mcp/claude.py,sha256=XxXHjNnBvrS2Y98m0xTfFjZYgACCoFVTiNd01neffbM,12034
|
|
46
|
+
hud/mcp/client.py,sha256=qrmpk2syjJ56y-09Dg44RVjUCFfmf5bPXaQSY-2ih-k,11494
|
|
47
|
+
hud/mcp/langchain.py,sha256=hbKSCSQBf4W_pPpGEdy_KNoPA-T7Bsn_BLIDxaLzvVU,9251
|
|
48
|
+
hud/mcp/openai.py,sha256=tpYK4ixLWqxAUXatXhoIZUXMlK1oP8TUZjnkSxBQVMc,13244
|
|
49
|
+
hud/mcp/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
|
|
50
|
+
hud/mcp/tests/test_base.py,sha256=lrRZoyDN7T67kOfv1A5WESaSHsYCaodD2CJnFli-4A4,19125
|
|
51
|
+
hud/mcp/tests/test_claude.py,sha256=kGDThen8ij9QWx_YH3P9UvLlra1ueEMgA_clQ1q60II,11312
|
|
52
|
+
hud/mcp/tests/test_client.py,sha256=ffxKzLmY75v-9l3aceUkn7aTdoO3j6deA4KBE3l9gaQ,11975
|
|
53
|
+
hud/mcp/tests/test_openai.py,sha256=AhnBT_y-zMykQyJARDwKWiQWJsBGwNIlH6fGAzhJh88,9091
|
|
60
54
|
hud/server/__init__.py,sha256=IPxPCqtPLguryN-nBq78Sakypw2bRiE2iHv3SXG8YRk,139
|
|
61
55
|
hud/server/requests.py,sha256=AnFW4ELojjvfF6xjS2no6_fg4Rph2aR2hjPzYTede0Q,8841
|
|
62
56
|
hud/server/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
57
|
hud/server/tests/test_requests.py,sha256=63YCbykcib5MxKxm-OgHJPLX3QC7hmgIwnWaYukVM6s,9077
|
|
64
|
-
hud/telemetry/__init__.py,sha256=
|
|
65
|
-
hud/telemetry/_trace.py,sha256=
|
|
66
|
-
hud/telemetry/context.py,sha256=
|
|
67
|
-
hud/telemetry/exporter.py,sha256=
|
|
58
|
+
hud/telemetry/__init__.py,sha256=qSQhbXYy7c_sG7KhVr-5eiCmeREj6GQ2cijhbIR2-Z4,717
|
|
59
|
+
hud/telemetry/_trace.py,sha256=Di2zKByHaljL6H4VkA-Gh_085jRJQw2VTiMOHX_FKp0,11433
|
|
60
|
+
hud/telemetry/context.py,sha256=qwCdUQ3UX_Y_zfIHSAQ1cdJNv-VLh5y8ovXfLpjHKVY,7492
|
|
61
|
+
hud/telemetry/exporter.py,sha256=10NwliO35J0fStvspgzb93N5MTko3pYNJe0fuTs-gPQ,23225
|
|
62
|
+
hud/telemetry/job.py,sha256=eyjr7Ha2ijM0MIF5f0d1xFOScFUdFIqlmO8GzQZoAJc,4905
|
|
68
63
|
hud/telemetry/mcp_models.py,sha256=0FQZoXtKOKeUsc2L61UbANpUDC7VNL842R2YFR61UBQ,8980
|
|
69
64
|
hud/telemetry/instrumentation/__init__.py,sha256=vHmSqaJMMehgRNn6EN2SMoYDD12rSHkLeVmj7Uy1my0,88
|
|
70
65
|
hud/telemetry/instrumentation/mcp.py,sha256=RbEaqmp8QHj1XqpIzwDSE8gH2cN5UjaBTouRxiPWxmc,9339
|
|
71
66
|
hud/telemetry/instrumentation/registry.py,sha256=UVaSsEA693lvKYd5R3n3ve6GcAB1fwqubRwIVeZiNmo,1821
|
|
72
67
|
hud/telemetry/tests/__init__.py,sha256=QMN8OzfrBUDbQESwrwHCqXLdDwCjYWX8BJcpeLUJfqA,33
|
|
73
68
|
hud/telemetry/tests/test_context.py,sha256=RdtjYHsyvlkKoTQxk0VezaAISEoVQReYqQiqK3jgFLQ,6746
|
|
74
|
-
hud/telemetry/tests/test_trace.py,sha256=
|
|
75
|
-
hud/tools/__init__.py,sha256=
|
|
69
|
+
hud/telemetry/tests/test_trace.py,sha256=mCm5AH-NpuDVvRG-CZhMMqHiJ4dahvcy9KHmWmo6o3A,12494
|
|
70
|
+
hud/tools/__init__.py,sha256=T4PnE5nuBCXsTKXUYBHmaF1Ojc6D5vAa6wA2cFWJfTc,986
|
|
76
71
|
hud/tools/base.py,sha256=lmd7N7IccIWrPpA0NZundIglFTTiLFW9VP_PJI2EXug,2069
|
|
77
72
|
hud/tools/bash.py,sha256=o841_HF1NJFfUWLOVUw9s0iB4BoIxhA-8vMasJOhZ70,4319
|
|
78
73
|
hud/tools/edit.py,sha256=9vJ2XSnWOPViujQbZZuDjLahvzxoPHyAeXxgKfpUDHo,11796
|
|
79
|
-
hud/tools/playwright_tool.py,sha256=
|
|
74
|
+
hud/tools/playwright_tool.py,sha256=IQT1hk5U4H8BI988iZq0B2oS_fbgkaX01Z-ZXL4r71o,13724
|
|
80
75
|
hud/tools/utils.py,sha256=bfVyYMcBOJvr1QdptCjVb6jaHVGIL5WUxmY59kzMekQ,1447
|
|
81
76
|
hud/tools/computer/__init__.py,sha256=ehKY7u0_4cZ9h7YQlOQjbKPWfd5LhQq8ZQn2w2-l2mY,302
|
|
82
77
|
hud/tools/computer/anthropic.py,sha256=M-djQmd0vPZm95FDszaMh4wSaLFPhlcCUb-JkSuflnU,16104
|
|
83
|
-
hud/tools/computer/hud.py,sha256=
|
|
78
|
+
hud/tools/computer/hud.py,sha256=13_xjvf5-yO-7lYBoI44Br31CxL5EumSCQxq876h7rs,13840
|
|
84
79
|
hud/tools/computer/openai.py,sha256=pcMGfoT6O8Rh9IrW_H1Mw2cIwk-FzCswrgjW19piRU8,10538
|
|
85
|
-
hud/tools/executors/__init__.py,sha256=
|
|
80
|
+
hud/tools/executors/__init__.py,sha256=jHxfus9SLhkL6YGtebR5RyKYyVAix3yu5EkUp2Q27Kg,732
|
|
86
81
|
hud/tools/executors/base.py,sha256=4h04Byt4ktaNk_aLOOI798pkMCLiqA7pE2PoaEn_hfg,11647
|
|
87
|
-
hud/tools/executors/pyautogui.py,sha256=
|
|
82
|
+
hud/tools/executors/pyautogui.py,sha256=Kc2OcFw-sEuRBRFtO1ZrWeHs1p-p5FtEpESkzpRhOHk,22098
|
|
88
83
|
hud/tools/executors/xdo.py,sha256=C6ecIVPUba7c6vKpgIcNxKcc698hwelQjj4YYUxT2_4,17751
|
|
89
84
|
hud/tools/executors/tests/__init__.py,sha256=opFpGSH6cEqIZgt9izXd3Yt85pC7xkxiYmOZQTHf4AY,32
|
|
90
85
|
hud/tools/executors/tests/test_base_executor.py,sha256=dvpKHCIjrBhT6E2U3hsjAwuivCAYXplvd08EHN6cxTI,12306
|
|
91
|
-
hud/tools/executors/tests/test_pyautogui_executor.py,sha256=
|
|
86
|
+
hud/tools/executors/tests/test_pyautogui_executor.py,sha256=br-wVvXnRx9G6X0yJ_xeKZf2xl8o4LCnYLeaIbkpuzY,6608
|
|
92
87
|
hud/tools/helper/README.md,sha256=GDS-K-wMnDO3-gtWjisgk5153zBmU29XSrs2ZhlOWQY,1727
|
|
93
88
|
hud/tools/helper/__init__.py,sha256=VqgQkY-y9h-WnGXZRK387fSr1BzrOQoAy3975WDAs4c,209
|
|
94
89
|
hud/tools/helper/mcp_server.py,sha256=t8UaGq91hDKef6zO3ApnJydwcKEqgLF6RdDcJ1GmfEA,2248
|
|
@@ -98,10 +93,10 @@ hud/tools/tests/__init__.py,sha256=eEYYkxX5Hz9woXVOBJ2H2_CQoEih0vH6nRt3sH2Z8v8,4
|
|
|
98
93
|
hud/tools/tests/test_bash.py,sha256=LV3LjijwkQqxuxIXFSepD2x3sYoY4uhdw8EBv4JOyLU,4847
|
|
99
94
|
hud/tools/tests/test_computer.py,sha256=HxYHxKJ0eWyZzC3abzviFBU-auc8x6Sh2ciR_uVXMXw,1595
|
|
100
95
|
hud/tools/tests/test_computer_actions.py,sha256=YtUNFL7anhpXrcvg8EoUY1CqIV-TAAyaNFLZO9CiJ40,1194
|
|
101
|
-
hud/tools/tests/test_edit.py,sha256=
|
|
96
|
+
hud/tools/tests/test_edit.py,sha256=_Bfh9Qc_zSYK5vS9kfhm5G9tkVvX1dsEIFqE3jkeSv0,8527
|
|
102
97
|
hud/tools/tests/test_init.py,sha256=PD_SS6X6SPhEjStJqYxdJRtsa7RbL6cTokAGIn5bWhA,702
|
|
103
98
|
hud/tools/tests/test_playwright_tool.py,sha256=1qED_NF2QXUZmBRbWSmcKImMLUQ3m5CbA_9tLUiaxTQ,6696
|
|
104
|
-
hud/tools/tests/test_tools.py,sha256=
|
|
99
|
+
hud/tools/tests/test_tools.py,sha256=KgSPgdqldpifbHeQHBFdYJVf3boWbvK6LRRRORPfTOg,4595
|
|
105
100
|
hud/tools/tests/test_utils.py,sha256=oYxEnLpSA5sEeYFGUTj74QRNv0AHP3AjmYYHXgIW0BY,5496
|
|
106
101
|
hud/utils/__init__.py,sha256=oSl_gGoS272X2VFnBYX8hLxcP2xgGoBYQXAuLhtQgw8,260
|
|
107
102
|
hud/utils/agent.py,sha256=CpNgjKWMaNqo-EATH_vfJHIN53rEkZngm2LXfUFlldQ,1225
|
|
@@ -116,9 +111,9 @@ hud/utils/tests/test_config.py,sha256=dPlXYWuMrxX-NOYbf0vdJ27TJpfacKG8eiKOSGOcfD
|
|
|
116
111
|
hud/utils/tests/test_init.py,sha256=UxlNTwjlSE2q3M0R86EmMYmmXmbRvzZaC-S2av26QXI,529
|
|
117
112
|
hud/utils/tests/test_progress.py,sha256=QunwDgi_heQXhDgmC25zgjr-sFUu5FdJ_1aYigMKeIc,6351
|
|
118
113
|
hud/utils/tests/test_telemetry.py,sha256=t0An1RTBaE0dZVEpF4uwuq5k1R-PXFR5k4u71h60tx8,1224
|
|
119
|
-
hud/utils/tests/test_version.py,sha256=
|
|
114
|
+
hud/utils/tests/test_version.py,sha256=b0JOKjG9oz4PvBZxUS3hbQYWi8yTEz52VHqTlLXgSYM,159
|
|
120
115
|
hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
121
|
-
hud_python-0.3.
|
|
122
|
-
hud_python-0.3.
|
|
123
|
-
hud_python-0.3.
|
|
124
|
-
hud_python-0.3.
|
|
116
|
+
hud_python-0.3.2.dist-info/METADATA,sha256=p5VyDJbUHnHXC1cUh50DJnVF8j8NIFqjEi-bdd8vP8c,10249
|
|
117
|
+
hud_python-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
118
|
+
hud_python-0.3.2.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
|
|
119
|
+
hud_python-0.3.2.dist-info/RECORD,,
|
hud/evaluators/__init__.py
DELETED
hud/evaluators/base.py
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from abc import ABC, abstractmethod
|
|
4
|
-
from typing import TYPE_CHECKING
|
|
5
|
-
|
|
6
|
-
from pydantic import BaseModel, Field
|
|
7
|
-
|
|
8
|
-
if TYPE_CHECKING:
|
|
9
|
-
from hud.task import Task
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class EvaluationResult(BaseModel):
|
|
13
|
-
"""Result of an evaluation.
|
|
14
|
-
|
|
15
|
-
Attributes:
|
|
16
|
-
score: Float score between 0 and 1
|
|
17
|
-
reason: Explanation of the evaluation
|
|
18
|
-
mode: Mode used for matching, if applicable
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
score: float
|
|
22
|
-
reason: str
|
|
23
|
-
mode: str | None = None
|
|
24
|
-
criteria_scores: dict[str, float] | None = Field(default_factory=dict)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class Evaluator(ABC):
|
|
28
|
-
"""Abstract base class for evaluators."""
|
|
29
|
-
|
|
30
|
-
@abstractmethod
|
|
31
|
-
def evaluate(self, task: Task, response: str) -> EvaluationResult:
|
|
32
|
-
"""Evaluate a task and response."""
|
hud/evaluators/inspect.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any
|
|
4
|
-
|
|
5
|
-
from hud.evaluators.base import EvaluationResult
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def inspect_evaluate(
|
|
9
|
-
response: Any,
|
|
10
|
-
answer: Any,
|
|
11
|
-
) -> EvaluationResult:
|
|
12
|
-
"""Evaluate using Inspect-ai's evaluation models.
|
|
13
|
-
|
|
14
|
-
Args:
|
|
15
|
-
response: The response to evaluate
|
|
16
|
-
answer: The reference answer to compare against
|
|
17
|
-
model_name: The Inspect model to use
|
|
18
|
-
prompt: Optional custom prompt for evaluation
|
|
19
|
-
metrics: Optional list of metrics to evaluate against
|
|
20
|
-
|
|
21
|
-
Returns:
|
|
22
|
-
EvaluationResult with the evaluation results
|
|
23
|
-
"""
|
|
24
|
-
return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
|
hud/evaluators/judge.py
DELETED
|
@@ -1,189 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import asyncio
|
|
4
|
-
import base64
|
|
5
|
-
from typing import Any, Protocol, TypedDict
|
|
6
|
-
|
|
7
|
-
from hud.evaluators.base import EvaluationResult
|
|
8
|
-
from hud.server import make_request
|
|
9
|
-
from hud.settings import settings
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class LLM(Protocol):
|
|
13
|
-
"""Protocol for LLM interfaces that can be used for evaluation."""
|
|
14
|
-
|
|
15
|
-
async def ainvoke(self, prompt: str, /) -> str: ...
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class Criterion(TypedDict, total=False):
|
|
19
|
-
"""Criterion for judge-based evaluation."""
|
|
20
|
-
|
|
21
|
-
description: str
|
|
22
|
-
weight: float
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
async def _call_eval_endpoint(
|
|
26
|
-
response: Any, answer: Any, criteria: list[Any], mode: str
|
|
27
|
-
) -> dict[str, Any]:
|
|
28
|
-
"""Call the run_eval endpoint to evaluate the response."""
|
|
29
|
-
try:
|
|
30
|
-
result = await make_request(
|
|
31
|
-
method="POST",
|
|
32
|
-
url=f"{settings.base_url}/evaluations/run_eval",
|
|
33
|
-
json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
|
|
34
|
-
api_key=settings.api_key,
|
|
35
|
-
)
|
|
36
|
-
return result
|
|
37
|
-
except Exception as e:
|
|
38
|
-
# Fallback to local evaluation if remote call fails
|
|
39
|
-
return {
|
|
40
|
-
"score": -1.0,
|
|
41
|
-
"reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
|
|
42
|
-
"criteria_scores": {},
|
|
43
|
-
}
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
def _process_input(data: Any) -> Any:
|
|
47
|
-
"""Process input data, detecting and handling base64 images."""
|
|
48
|
-
if isinstance(data, bytes):
|
|
49
|
-
# Convert bytes to base64 string
|
|
50
|
-
return base64.b64encode(data).decode("utf-8")
|
|
51
|
-
|
|
52
|
-
if isinstance(data, str) and _is_base64_image(data):
|
|
53
|
-
# It's already a base64 string, just return it
|
|
54
|
-
return data
|
|
55
|
-
|
|
56
|
-
if isinstance(data, list) and all(isinstance(item, str) for item in data):
|
|
57
|
-
# Process list of strings
|
|
58
|
-
return data
|
|
59
|
-
|
|
60
|
-
# For other types, convert to string
|
|
61
|
-
return str(data) if not isinstance(data, str | dict) else data
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def _is_base64_image(data: Any) -> bool:
|
|
65
|
-
"""Check if a string is a base64 encoded image."""
|
|
66
|
-
if not isinstance(data, str):
|
|
67
|
-
return False
|
|
68
|
-
|
|
69
|
-
# Check for common image data URI pattern
|
|
70
|
-
if data.startswith(("data:image/", "data:application/octet-stream")):
|
|
71
|
-
return True
|
|
72
|
-
|
|
73
|
-
# Check if it's a base64 encoded string with image header
|
|
74
|
-
try:
|
|
75
|
-
# First, validate it's base64 decodable
|
|
76
|
-
padding_needed = len(data) % 4
|
|
77
|
-
if padding_needed:
|
|
78
|
-
data += "=" * (4 - padding_needed)
|
|
79
|
-
|
|
80
|
-
# Try to decode the first few bytes to check for image signatures
|
|
81
|
-
sample = base64.b64decode(data[:30])
|
|
82
|
-
|
|
83
|
-
# Check for common image format signatures
|
|
84
|
-
return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
|
|
85
|
-
except Exception:
|
|
86
|
-
return False
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def judge(
|
|
90
|
-
response: Any,
|
|
91
|
-
answer: Any,
|
|
92
|
-
llm: LLM | None = None,
|
|
93
|
-
criteria: list[str] | list[dict] | None = None,
|
|
94
|
-
) -> EvaluationResult:
|
|
95
|
-
"""Judge a response against an answer using an LLM.
|
|
96
|
-
|
|
97
|
-
Args:
|
|
98
|
-
response: The response to evaluate
|
|
99
|
-
answer: The reference answer to compare against
|
|
100
|
-
llm: Optional langchain LLM to use for evaluation
|
|
101
|
-
criteria: Evaluation criteria as strings or dictionaries
|
|
102
|
-
|
|
103
|
-
Returns:
|
|
104
|
-
EvaluationResult with evaluation results
|
|
105
|
-
"""
|
|
106
|
-
# Process inputs
|
|
107
|
-
processed_response = _process_input(response)
|
|
108
|
-
processed_answer = _process_input(answer)
|
|
109
|
-
|
|
110
|
-
# If LLM is provided, use it for evaluation
|
|
111
|
-
if llm:
|
|
112
|
-
return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
|
|
113
|
-
|
|
114
|
-
# Otherwise, use the remote evaluation service
|
|
115
|
-
mode = "LLM"
|
|
116
|
-
if isinstance(answer, bytes) or _is_base64_image(answer):
|
|
117
|
-
mode = "VLM"
|
|
118
|
-
|
|
119
|
-
# Call the eval endpoint synchronously
|
|
120
|
-
result = asyncio.run(
|
|
121
|
-
_call_eval_endpoint(
|
|
122
|
-
response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
|
|
123
|
-
)
|
|
124
|
-
)
|
|
125
|
-
|
|
126
|
-
return EvaluationResult(
|
|
127
|
-
score=result.get("score", -1.0),
|
|
128
|
-
reason=result.get("reason", "Response evaluated"),
|
|
129
|
-
mode=mode,
|
|
130
|
-
criteria_scores=result.get("criteria_scores", {}),
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
def _evaluate_with_llm(
|
|
135
|
-
response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
|
|
136
|
-
) -> EvaluationResult:
|
|
137
|
-
"""Evaluate a response against an answer using a provided LLM."""
|
|
138
|
-
criteria_text = ""
|
|
139
|
-
if criteria:
|
|
140
|
-
criteria_text = "Use the following criteria:\n"
|
|
141
|
-
for c in criteria:
|
|
142
|
-
if isinstance(c, dict) and "description" in c:
|
|
143
|
-
criteria_text += f"- {c['description']}\n"
|
|
144
|
-
elif isinstance(c, str):
|
|
145
|
-
criteria_text += f"- {c}\n"
|
|
146
|
-
|
|
147
|
-
prompt = f"""Evaluate the quality of a response given a reference answer.
|
|
148
|
-
|
|
149
|
-
REFERENCE ANSWER:
|
|
150
|
-
{answer}
|
|
151
|
-
|
|
152
|
-
RESPONSE TO EVALUATE:
|
|
153
|
-
{response}
|
|
154
|
-
|
|
155
|
-
{criteria_text}
|
|
156
|
-
Rate the response on a scale from 0.0 to 1.0, where 1.0 is perfect.
|
|
157
|
-
Provide a brief explanation for your rating.
|
|
158
|
-
Format your answer as a JSON object with 'score' (float) and 'reason' (string) fields.
|
|
159
|
-
"""
|
|
160
|
-
|
|
161
|
-
try:
|
|
162
|
-
# Run the evaluation asynchronously
|
|
163
|
-
result_text = asyncio.run(llm.ainvoke(prompt))
|
|
164
|
-
|
|
165
|
-
# Attempt to parse JSON response
|
|
166
|
-
import json
|
|
167
|
-
import re
|
|
168
|
-
|
|
169
|
-
# Try to extract JSON if wrapped in other text
|
|
170
|
-
json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
|
|
171
|
-
if json_match:
|
|
172
|
-
json_str = json_match.group(0)
|
|
173
|
-
result = json.loads(json_str)
|
|
174
|
-
|
|
175
|
-
return EvaluationResult(
|
|
176
|
-
score=float(result.get("score", 0.5)),
|
|
177
|
-
reason=result.get("reason", "Evaluated with custom LLM"),
|
|
178
|
-
mode="custom_llm",
|
|
179
|
-
)
|
|
180
|
-
|
|
181
|
-
# If can't parse as JSON, use default values
|
|
182
|
-
return EvaluationResult(
|
|
183
|
-
score=0.5,
|
|
184
|
-
reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
|
|
185
|
-
mode="custom_llm",
|
|
186
|
-
)
|
|
187
|
-
|
|
188
|
-
except Exception as e:
|
|
189
|
-
return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
|
hud/evaluators/match.py
DELETED
|
@@ -1,156 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import re
|
|
4
|
-
from difflib import SequenceMatcher
|
|
5
|
-
from typing import TYPE_CHECKING, Protocol
|
|
6
|
-
|
|
7
|
-
from textdistance import levenshtein
|
|
8
|
-
|
|
9
|
-
from hud.evaluators.base import EvaluationResult
|
|
10
|
-
|
|
11
|
-
if TYPE_CHECKING:
|
|
12
|
-
from collections.abc import Sequence
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
class _Stringable(Protocol):
|
|
16
|
-
def __str__(self) -> str: ...
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
|
|
20
|
-
"""Check if the answer is present within the response.
|
|
21
|
-
|
|
22
|
-
Args:
|
|
23
|
-
response: The response to evaluate
|
|
24
|
-
answer: The expected answer
|
|
25
|
-
|
|
26
|
-
Returns:
|
|
27
|
-
EvaluationResult with score=1.0 if match, 0.0 otherwise
|
|
28
|
-
"""
|
|
29
|
-
passed = str(answer).lower().strip() in str(response).lower().strip()
|
|
30
|
-
return EvaluationResult(
|
|
31
|
-
score=1.0 if passed else 0.0,
|
|
32
|
-
reason="Exact match" if passed else "No exact match found",
|
|
33
|
-
mode="single",
|
|
34
|
-
)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
|
|
38
|
-
"""Count how many expected answers are in the response.
|
|
39
|
-
|
|
40
|
-
Args:
|
|
41
|
-
response: The response to evaluate
|
|
42
|
-
answers: List of expected answers
|
|
43
|
-
|
|
44
|
-
Returns:
|
|
45
|
-
EvaluationResult with score=proportion of matches (0.0-1.0)
|
|
46
|
-
"""
|
|
47
|
-
response_str = str(response).lower()
|
|
48
|
-
matches = 0
|
|
49
|
-
|
|
50
|
-
for answer in answers:
|
|
51
|
-
if str(answer).lower() in response_str:
|
|
52
|
-
matches += 1
|
|
53
|
-
|
|
54
|
-
score = matches / len(answers) if answers else 0.0
|
|
55
|
-
|
|
56
|
-
if matches == len(answers):
|
|
57
|
-
reason = f"All {matches} expected items found"
|
|
58
|
-
else:
|
|
59
|
-
reason = f"Only {matches} of {len(answers)} expected items found"
|
|
60
|
-
|
|
61
|
-
return EvaluationResult(score=score, reason=reason, mode="all")
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
|
|
65
|
-
"""Calculate similarity using Levenshtein distance.
|
|
66
|
-
|
|
67
|
-
Args:
|
|
68
|
-
response: The response to evaluate
|
|
69
|
-
answer: The expected answer
|
|
70
|
-
|
|
71
|
-
Returns:
|
|
72
|
-
EvaluationResult with score=similarity (0.0-1.0)
|
|
73
|
-
"""
|
|
74
|
-
s1 = str(response).lower()
|
|
75
|
-
s2 = str(answer).lower()
|
|
76
|
-
|
|
77
|
-
if s1 == s2:
|
|
78
|
-
score = 1.0
|
|
79
|
-
elif len(s1) == 0 or len(s2) == 0:
|
|
80
|
-
score = 0.0
|
|
81
|
-
else:
|
|
82
|
-
# Use Levenshtein distance
|
|
83
|
-
distance = levenshtein.distance(s1, s2)
|
|
84
|
-
max_len = max(len(s1), len(s2))
|
|
85
|
-
score = 1.0 - (distance / max_len)
|
|
86
|
-
|
|
87
|
-
return EvaluationResult(
|
|
88
|
-
score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
|
|
89
|
-
)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
|
|
93
|
-
"""Check if response matches regex pattern.
|
|
94
|
-
|
|
95
|
-
Args:
|
|
96
|
-
response: The response to evaluate
|
|
97
|
-
pattern: Regular expression pattern to match
|
|
98
|
-
|
|
99
|
-
Returns:
|
|
100
|
-
EvaluationResult with score=1.0 if match, 0.0 otherwise
|
|
101
|
-
"""
|
|
102
|
-
try:
|
|
103
|
-
regex = re.compile(pattern, re.DOTALL)
|
|
104
|
-
passed = bool(regex.search(str(response)))
|
|
105
|
-
return EvaluationResult(
|
|
106
|
-
score=1.0 if passed else 0.0,
|
|
107
|
-
reason="Regex pattern matched" if passed else "Regex pattern did not match",
|
|
108
|
-
mode="regex",
|
|
109
|
-
)
|
|
110
|
-
except re.error:
|
|
111
|
-
return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
|
|
115
|
-
"""Compare difference between response and answer.
|
|
116
|
-
|
|
117
|
-
Args:
|
|
118
|
-
response: The response to evaluate
|
|
119
|
-
answer: The expected answer
|
|
120
|
-
|
|
121
|
-
Returns:
|
|
122
|
-
EvaluationResult with score=similarity (0.0-1.0)
|
|
123
|
-
"""
|
|
124
|
-
if isinstance(response, int | float) and isinstance(answer, int | float):
|
|
125
|
-
score = _match_numeric_diff(response, answer)
|
|
126
|
-
reason = f"Numeric difference: {abs(response - answer)}"
|
|
127
|
-
else:
|
|
128
|
-
score = _match_string_diff(response, answer)
|
|
129
|
-
reason = f"String difference with {score:.1%} similarity"
|
|
130
|
-
|
|
131
|
-
return EvaluationResult(score=score, reason=reason, mode="diff")
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
|
|
135
|
-
"""Compare difference between response and answer strings."""
|
|
136
|
-
matcher = SequenceMatcher(None, str(response), str(answer))
|
|
137
|
-
return matcher.ratio()
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
def _match_numeric_diff(response: float, answer: float) -> float:
|
|
141
|
-
"""Calculate normalized difference between numeric values.
|
|
142
|
-
|
|
143
|
-
Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
|
|
144
|
-
"""
|
|
145
|
-
if response == answer:
|
|
146
|
-
return 1.0
|
|
147
|
-
|
|
148
|
-
# Simple absolute difference normalized to a 0-1 scale
|
|
149
|
-
diff = abs(response - answer)
|
|
150
|
-
max_val = max(abs(response), abs(answer))
|
|
151
|
-
|
|
152
|
-
if max_val == 0:
|
|
153
|
-
return 1.0 # Both are zero
|
|
154
|
-
|
|
155
|
-
# Normalize and invert so 1.0 means identical
|
|
156
|
-
return max(0.0, 1.0 - min(1.0, diff / max_val))
|