hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (54)
  1. hud/__init__.py +7 -4
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +14 -2
  6. hud/env/local_docker_client.py +28 -6
  7. hud/gym.py +0 -9
  8. hud/{mcp_agent → mcp}/__init__.py +2 -0
  9. hud/mcp/base.py +631 -0
  10. hud/{mcp_agent → mcp}/claude.py +52 -47
  11. hud/mcp/client.py +312 -0
  12. hud/{mcp_agent → mcp}/langchain.py +52 -33
  13. hud/{mcp_agent → mcp}/openai.py +56 -40
  14. hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
  15. hud/mcp/tests/test_claude.py +294 -0
  16. hud/mcp/tests/test_client.py +324 -0
  17. hud/mcp/tests/test_openai.py +238 -0
  18. hud/settings.py +6 -0
  19. hud/task.py +2 -88
  20. hud/taskset.py +2 -23
  21. hud/telemetry/__init__.py +5 -0
  22. hud/telemetry/_trace.py +180 -17
  23. hud/telemetry/context.py +79 -0
  24. hud/telemetry/exporter.py +165 -6
  25. hud/telemetry/job.py +141 -0
  26. hud/telemetry/tests/test_trace.py +36 -25
  27. hud/tools/__init__.py +14 -1
  28. hud/tools/computer/hud.py +13 -0
  29. hud/tools/executors/__init__.py +19 -2
  30. hud/tools/executors/pyautogui.py +84 -50
  31. hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
  32. hud/tools/playwright_tool.py +73 -67
  33. hud/tools/tests/test_edit.py +8 -1
  34. hud/tools/tests/test_tools.py +3 -0
  35. hud/trajectory.py +5 -1
  36. hud/utils/tests/test_version.py +1 -1
  37. hud/version.py +1 -1
  38. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/METADATA +20 -14
  39. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/RECORD +42 -47
  40. hud/evaluators/__init__.py +0 -9
  41. hud/evaluators/base.py +0 -32
  42. hud/evaluators/inspect.py +0 -24
  43. hud/evaluators/judge.py +0 -189
  44. hud/evaluators/match.py +0 -156
  45. hud/evaluators/remote.py +0 -65
  46. hud/evaluators/tests/__init__.py +0 -0
  47. hud/evaluators/tests/test_inspect.py +0 -12
  48. hud/evaluators/tests/test_judge.py +0 -231
  49. hud/evaluators/tests/test_match.py +0 -115
  50. hud/evaluators/tests/test_remote.py +0 -98
  51. hud/mcp_agent/base.py +0 -723
  52. hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
  53. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/WHEEL +0 -0
  54. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: SDK for the HUD platform.
5
- Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
- Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
5
+ Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
+ Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
7
7
  Project-URL: Documentation, https://docs.hud.so
8
8
  Author-email: HUD SDK <founders@hud.so>
9
9
  License: MIT License
@@ -35,28 +35,22 @@ Classifier: Programming Language :: Python :: 3.11
35
35
  Classifier: Programming Language :: Python :: 3.12
36
36
  Classifier: Programming Language :: Python :: 3.13
37
37
  Requires-Python: <3.14,>=3.11
38
- Requires-Dist: aiodocker>=0.24.0
39
38
  Requires-Dist: anthropic
39
+ Requires-Dist: datasets>=4.0.0
40
40
  Requires-Dist: dotenv>=0.9.9
41
41
  Requires-Dist: httpx<1,>=0.23.0
42
- Requires-Dist: inspect-ai>=0.3.80
43
- Requires-Dist: ipykernel
44
42
  Requires-Dist: langchain
45
43
  Requires-Dist: langchain-anthropic
46
44
  Requires-Dist: langchain-openai
47
45
  Requires-Dist: mcp-use>=1.3.7
48
46
  Requires-Dist: mcp==1.12.2
49
- Requires-Dist: numpy
50
47
  Requires-Dist: openai
51
48
  Requires-Dist: pathspec>=0.12.1
52
- Requires-Dist: pillow>=11.1.0
53
- Requires-Dist: pyautogui>=0.9.54
54
49
  Requires-Dist: pydantic-settings<3,>=2
55
50
  Requires-Dist: pydantic<3,>=2
56
- Requires-Dist: textdistance<5,>=4.5.0
57
- Requires-Dist: toml>=0.10.2
58
51
  Requires-Dist: wrapt>=1.14.0
59
52
  Provides-Extra: dev
53
+ Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
60
54
  Requires-Dist: anthropic; extra == 'dev'
61
55
  Requires-Dist: dotenv; extra == 'dev'
62
56
  Requires-Dist: ipykernel; extra == 'dev'
@@ -64,17 +58,29 @@ Requires-Dist: ipython<9; extra == 'dev'
64
58
  Requires-Dist: jupyter-client; extra == 'dev'
65
59
  Requires-Dist: jupyter-core; extra == 'dev'
66
60
  Requires-Dist: openai; extra == 'dev'
61
+ Requires-Dist: pillow>=11.1.0; extra == 'dev'
67
62
  Requires-Dist: playwright; extra == 'dev'
63
+ Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
68
64
  Requires-Dist: pyright==1.1.401; extra == 'dev'
69
65
  Requires-Dist: pytest-asyncio; extra == 'dev'
70
66
  Requires-Dist: pytest-cov; extra == 'dev'
71
67
  Requires-Dist: pytest-mock; extra == 'dev'
72
68
  Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
73
69
  Requires-Dist: ruff==0.11.8; extra == 'dev'
70
+ Requires-Dist: toml>=0.10.2; extra == 'dev'
71
+ Provides-Extra: v2
72
+ Requires-Dist: aiodocker>=0.24.0; extra == 'v2'
73
+ Requires-Dist: inspect-ai>=0.3.80; extra == 'v2'
74
+ Requires-Dist: ipykernel; extra == 'v2'
75
+ Requires-Dist: numpy; extra == 'v2'
76
+ Requires-Dist: pillow>=11.1.0; extra == 'v2'
77
+ Requires-Dist: pyautogui>=0.9.54; extra == 'v2'
78
+ Requires-Dist: textdistance<5,>=4.5.0; extra == 'v2'
79
+ Requires-Dist: toml>=0.10.2; extra == 'v2'
74
80
  Description-Content-Type: text/markdown
75
81
 
76
82
  <div align="left">
77
- <img src="https://raw.githubusercontent.com/hud-evals/hud-sdk/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
83
+ <img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
78
84
  </div>
79
85
 
80
86
  <h3>
@@ -88,7 +94,7 @@ Evaluate your Computer Use AI agents across web browsers, desktop environments,
88
94
  We're here to help with eval strategies, custom environments, or improving your agent architecture!
89
95
 
90
96
 
91
- > **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
97
+ > **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-python/issues), as the SDK is still evolving!
92
98
 
93
99
  [![PyPI version](https://img.shields.io/pypi/v/hud-python)](https://pypi.org/project/hud-python/)
94
100
 
@@ -272,7 +278,7 @@ If you use this SDK in your research, please cite it as follows:
272
278
  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
273
279
  title = {{HUD: An Evaluation Platform for Agents}},
274
280
  date = {2025-04},
275
- url = {https://github.com/hud-evals/hud-sdk},
281
+ url = {https://github.com/hud-evals/hud-python},
276
282
  langid = {en}
277
283
  }
278
284
  ```
@@ -1,23 +1,24 @@
1
- hud/__init__.py,sha256=yg4CC0iQWE67NGb6tUTmlO1kV-tM3njbigTuFYyzgAI,1477
1
+ hud/__init__.py,sha256=j5Zzth7_M-5DU_KJT2ZV9OfikD2aE6lzyiZA4OrLzi8,1578
2
+ hud/datasets.py,sha256=UZCzzXREbPhlw2ZdUFZ8EDz0lErWEeBPOPQxH71p6EA,6196
2
3
  hud/exceptions.py,sha256=Xna_pdEK_ESwkcffsRmT5GXq4xSHLV5cu7Qu3MjstSE,5516
3
- hud/gym.py,sha256=JNWlO2GXev0xIjoTI9HMEbcQgGpzc6fku7-RYoYAxHI,4996
4
+ hud/gym.py,sha256=-hp5HdPBWf6-j0CgSoX_f2CTLssf1Wo5UhfyrnPbvkc,4774
4
5
  hud/job.py,sha256=0vWbr3E5bYstVRzXS_6l-57JGUFcrZpmFrNkOSQ8Aa0,26969
5
- hud/settings.py,sha256=rZFd_fzPUZKOklhMpTTSIJrMD1-eH9h6WrbD27SSXZ8,2014
6
- hud/task.py,sha256=WxftOrmaHqNvEsie1ZVIXJELYpfC9ejJL7b9TPQXEDg,8913
7
- hud/taskset.py,sha256=9IRwHeAdsk_IEibayM-hElE3gTp0mgmi-huN67h9-tc,7019
8
- hud/trajectory.py,sha256=ctAwrGIkdULr4xI6G-1Dp2fhDol4o_PmnPcqTzAEIUc,3797
6
+ hud/settings.py,sha256=KPzeF9OUecApYH8YYMW-8vIRhFP_6htzzZvC4RCUARc,2183
7
+ hud/task.py,sha256=l2mQM5Yc45kWjMXJkg1hVJfG0DLzTHAIXEvl4WLG-ho,5451
8
+ hud/taskset.py,sha256=QjHbcxSy7h7fmtzRHW1ewxtOIydtH7ZotttDoiABTEY,6573
9
+ hud/trajectory.py,sha256=LBVkFz6U_rmyooCZHN81tdOx0Z7DuAgzf0KQLejc4Fo,3937
9
10
  hud/types.py,sha256=h7fUowbdyGF4Fg8TUnvCFoa2fflRRPi6xx7YgpBwFis,3109
10
- hud/version.py,sha256=RPgsYJZ_E7U1ryhfdldTxD4xSiU4nQTEdzeq7iLynA4,104
11
+ hud/version.py,sha256=xXGUzDnO0wgnaDf7cvjChhGymzp3vrfpGjE5wBibi8E,104
11
12
  hud/adapters/__init__.py,sha256=zz24KdC_e9TJPgWo6y57_8SzevEE5ak4Cm6tXzMxwRk,266
12
13
  hud/adapters/claude/__init__.py,sha256=i7QEF-29FLb9qxp1eYtXs-adIk_tG54tL-9g6d3xodk,100
13
14
  hud/adapters/claude/adapter.py,sha256=vCpotJ5gzQs4PP2iCXVavIcyG8c_4m1P6fuXStwUxSo,6675
14
15
  hud/adapters/claude/tests/__init__.py,sha256=9GZj0rz4tTkiPnLfxTmyBPr-s8UZc3gph6WH8fs8T34,39
15
16
  hud/adapters/claude/tests/test_adapter.py,sha256=cAdHEoqLngLiV7QwlWJ0KuNgb1vNv9WZTPQMnxhMDKI,18319
16
17
  hud/adapters/common/__init__.py,sha256=BjdZWJVs_AKtpFrt-tNsdQRjnz7D97DFEQirJ-r0mp8,118
17
- hud/adapters/common/adapter.py,sha256=GETzlsEl-uYkL-U4cQHBnfLAvm1dbXec4fKC2ypR1L0,5821
18
+ hud/adapters/common/adapter.py,sha256=fTpw7wA501nxM3ufl6WMWq4Nc3vXlUeBGS7WgvZVFjU,6180
18
19
  hud/adapters/common/types.py,sha256=6frue7_gZlSYtOHhF2tFHqzjltzzHsTVs6-H-jQwZ4Y,9955
19
20
  hud/adapters/common/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- hud/adapters/common/tests/test_adapter.py,sha256=rTD36LjvytHqMIyOLDyrn0RLIkd20s6f6dwoBEarJaw,8744
21
+ hud/adapters/common/tests/test_adapter.py,sha256=7QRpQPGM1PlMi8RcqJAT4ruGvLT9TgGmc9R5tzncN1M,8965
21
22
  hud/adapters/operator/__init__.py,sha256=31vTRs268_TOLd-TeQRKau5bDYy78wxCNpJFhD5_l8U,104
22
23
  hud/adapters/operator/adapter.py,sha256=Uz4Sr73T57B7v4RRP0uaibHI17N2hBx6Z9YYjgJCUXA,3732
23
24
  hud/adapters/operator/tests/__init__.py,sha256=yTsDVusVXZBQL6DnXpLgKQCBRuOYUAVQ8Blk_k5GETk,41
@@ -34,61 +35,55 @@ hud/agent/tests/__init__.py,sha256=HbAW7FvSvzzKPU5LpveZceU8XTcDkRe1Bmte3OGi2f0,2
34
35
  hud/agent/tests/test_base.py,sha256=MAHx4QWsX4y4jXDoA1sxWw8uFvL7lIzGlXrnHfOTmkw,8511
35
36
  hud/env/__init__.py,sha256=wVEesXMXM5hcNXQHt0-PN4-9RnE69DEnQENS7uJSv_Y,266
36
37
  hud/env/client.py,sha256=brhfLkWGSuvxl3vqGMCQT-vXfj8rUbJMhE3zJg9WMDA,869
37
- hud/env/docker_client.py,sha256=_EfxCbld2lk5BCBegBMMGXrYxOtxoa8N468T1wFbGrE,11234
38
+ hud/env/docker_client.py,sha256=55PTFansUDzsRMT_43eSTVO9rb_wzl_s4aBpBqmMeXk,11749
38
39
  hud/env/environment.py,sha256=wjMBwGs5qkkXsVlXR_Z2QPZi4cwXE82ckdzRgHiXPjw,17019
39
- hud/env/local_docker_client.py,sha256=EbULGazP9KlD1tQrFKSghC0MO2-G60iNVLinEPtQ33M,11573
40
+ hud/env/local_docker_client.py,sha256=IIuPSV_KJsfCONJAIVkgq_2zgUJl-FE4e5tDkkbRp0Y,12442
40
41
  hud/env/remote_client.py,sha256=tP5Gn1YtYgsjdXA4vM4FibAAHnR-9OOH4GrTog97cf8,6670
41
42
  hud/env/remote_docker_client.py,sha256=sBoOz3cq9HMgVvX8qCYEhRLvdswMZLG9G4Ybc60RzDo,9574
42
- hud/evaluators/__init__.py,sha256=V5nktEAw3EDn2Y537pjia5Y1IjdLBIPrDjTs6YTCdX4,153
43
- hud/evaluators/base.py,sha256=ALO9Rj-R_9HtHIHYp84bsQQD12De0XnCTwad78_T5-k,771
44
- hud/evaluators/inspect.py,sha256=ZvrTXLpgibyvQ5aNXAMP4quyXISrRQHg9besDcuCx7U,692
45
- hud/evaluators/judge.py,sha256=N3gEQGwVin9Ir80wWw6VtaL0xrlzitbmItaLm0he5gY,5962
46
- hud/evaluators/match.py,sha256=8YVQD942myX72Jkme2JFIVlmKhFXEa3CgGTjLC8O5n4,4701
47
- hud/evaluators/remote.py,sha256=kmD_XIU20KvX0NKgaEEKTTKHp0KVRa_3jUEgONh2nkY,2054
48
- hud/evaluators/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
49
- hud/evaluators/tests/test_inspect.py,sha256=8dMjgQfXOJGcS8gP6TzoBbQiG_NYuRL6IobMG7euJdU,376
50
- hud/evaluators/tests/test_judge.py,sha256=c1GaAeq_WpBVgBlx-gQncHrOPokzKNxlbgiC8W8hxYI,7829
51
- hud/evaluators/tests/test_match.py,sha256=C04GoluyT9i41YZ65xEjN7tKHQbENbrpNhNtUd4ivmA,3919
52
- hud/evaluators/tests/test_remote.py,sha256=YdJpyyuRLkYP0e3jTUkD3zobS2WHQPePn8yBZtYOIN4,3243
53
- hud/mcp_agent/__init__.py,sha256=0R8SGgg2XU25y7B4lnBRv1n33d9TV6vaPXLafoiya2Y,324
54
- hud/mcp_agent/base.py,sha256=P92Bcj3VH8veWgG6Yrq6cnE2gOnRaVG0NhEXdI-C8CA,29142
55
- hud/mcp_agent/claude.py,sha256=5ORCs8PecqkRy2h5pVadxCIzJkjXZPPgkfOsGwJcJR4,11691
56
- hud/mcp_agent/langchain.py,sha256=JOD10jeFuW4ekgEu7fzKWuveBTTOV0CTIld98fNMbz0,8136
57
- hud/mcp_agent/openai.py,sha256=7SvbuKraLzlN4aGRsSkFtAVr1YldQmZ_9R8pRTWdQU0,12579
58
- hud/mcp_agent/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
59
- hud/mcp_agent/tests/test_base.py,sha256=7j_Id__Fd-d0VDRmfqyYM_p8JtF35mTPR90I8LeUXrI,16109
43
+ hud/mcp/__init__.py,sha256=VBAZfpD8Ww59CkWb4CB0rGhNGqJYtc5y1gWZWHDaViQ,371
44
+ hud/mcp/base.py,sha256=H4CRVGG4aEXAk_qRk3iOi-KLf8AVuffmoXPTaSXD4_0,24376
45
+ hud/mcp/claude.py,sha256=XxXHjNnBvrS2Y98m0xTfFjZYgACCoFVTiNd01neffbM,12034
46
+ hud/mcp/client.py,sha256=qrmpk2syjJ56y-09Dg44RVjUCFfmf5bPXaQSY-2ih-k,11494
47
+ hud/mcp/langchain.py,sha256=hbKSCSQBf4W_pPpGEdy_KNoPA-T7Bsn_BLIDxaLzvVU,9251
48
+ hud/mcp/openai.py,sha256=tpYK4ixLWqxAUXatXhoIZUXMlK1oP8TUZjnkSxBQVMc,13244
49
+ hud/mcp/tests/__init__.py,sha256=W-O-_4i34d9TTyEHV-O_q1Ai1gLhzwDaaPo02_TWQIY,34
50
+ hud/mcp/tests/test_base.py,sha256=lrRZoyDN7T67kOfv1A5WESaSHsYCaodD2CJnFli-4A4,19125
51
+ hud/mcp/tests/test_claude.py,sha256=kGDThen8ij9QWx_YH3P9UvLlra1ueEMgA_clQ1q60II,11312
52
+ hud/mcp/tests/test_client.py,sha256=ffxKzLmY75v-9l3aceUkn7aTdoO3j6deA4KBE3l9gaQ,11975
53
+ hud/mcp/tests/test_openai.py,sha256=AhnBT_y-zMykQyJARDwKWiQWJsBGwNIlH6fGAzhJh88,9091
60
54
  hud/server/__init__.py,sha256=IPxPCqtPLguryN-nBq78Sakypw2bRiE2iHv3SXG8YRk,139
61
55
  hud/server/requests.py,sha256=AnFW4ELojjvfF6xjS2no6_fg4Rph2aR2hjPzYTede0Q,8841
62
56
  hud/server/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
57
  hud/server/tests/test_requests.py,sha256=63YCbykcib5MxKxm-OgHJPLX3QC7hmgIwnWaYukVM6s,9077
64
- hud/telemetry/__init__.py,sha256=tNRbewoND4DvZUNw2aRYpR4kjRuh4o1XoNUuis62Zes,558
65
- hud/telemetry/_trace.py,sha256=zggA2kTqLvM59F-DyRSRGKX-in27jcdrnr_xiTRWJvA,5375
66
- hud/telemetry/context.py,sha256=LyibWGCq4FhGzauxIws36Hzzd0S0ryMYCPqYwoot4DQ,4638
67
- hud/telemetry/exporter.py,sha256=LaD-U8TldKVn-rQkO88l8ZBlFxhVRbOuQmVKOGMMWnU,17875
58
+ hud/telemetry/__init__.py,sha256=qSQhbXYy7c_sG7KhVr-5eiCmeREj6GQ2cijhbIR2-Z4,717
59
+ hud/telemetry/_trace.py,sha256=Di2zKByHaljL6H4VkA-Gh_085jRJQw2VTiMOHX_FKp0,11433
60
+ hud/telemetry/context.py,sha256=qwCdUQ3UX_Y_zfIHSAQ1cdJNv-VLh5y8ovXfLpjHKVY,7492
61
+ hud/telemetry/exporter.py,sha256=10NwliO35J0fStvspgzb93N5MTko3pYNJe0fuTs-gPQ,23225
62
+ hud/telemetry/job.py,sha256=eyjr7Ha2ijM0MIF5f0d1xFOScFUdFIqlmO8GzQZoAJc,4905
68
63
  hud/telemetry/mcp_models.py,sha256=0FQZoXtKOKeUsc2L61UbANpUDC7VNL842R2YFR61UBQ,8980
69
64
  hud/telemetry/instrumentation/__init__.py,sha256=vHmSqaJMMehgRNn6EN2SMoYDD12rSHkLeVmj7Uy1my0,88
70
65
  hud/telemetry/instrumentation/mcp.py,sha256=RbEaqmp8QHj1XqpIzwDSE8gH2cN5UjaBTouRxiPWxmc,9339
71
66
  hud/telemetry/instrumentation/registry.py,sha256=UVaSsEA693lvKYd5R3n3ve6GcAB1fwqubRwIVeZiNmo,1821
72
67
  hud/telemetry/tests/__init__.py,sha256=QMN8OzfrBUDbQESwrwHCqXLdDwCjYWX8BJcpeLUJfqA,33
73
68
  hud/telemetry/tests/test_context.py,sha256=RdtjYHsyvlkKoTQxk0VezaAISEoVQReYqQiqK3jgFLQ,6746
74
- hud/telemetry/tests/test_trace.py,sha256=fZt8WXflZivhBgWHhePWmmNbTYg0qF3oF3bTKE0KXiM,12016
75
- hud/tools/__init__.py,sha256=vDxoIGHaj7FOnqki2Q92gGmEZx3f6Vx9RGkQo-X3cJ4,577
69
+ hud/telemetry/tests/test_trace.py,sha256=mCm5AH-NpuDVvRG-CZhMMqHiJ4dahvcy9KHmWmo6o3A,12494
70
+ hud/tools/__init__.py,sha256=T4PnE5nuBCXsTKXUYBHmaF1Ojc6D5vAa6wA2cFWJfTc,986
76
71
  hud/tools/base.py,sha256=lmd7N7IccIWrPpA0NZundIglFTTiLFW9VP_PJI2EXug,2069
77
72
  hud/tools/bash.py,sha256=o841_HF1NJFfUWLOVUw9s0iB4BoIxhA-8vMasJOhZ70,4319
78
73
  hud/tools/edit.py,sha256=9vJ2XSnWOPViujQbZZuDjLahvzxoPHyAeXxgKfpUDHo,11796
79
- hud/tools/playwright_tool.py,sha256=tq1La66esH2CwGYBkpvBDNPuswsspHtSE1cSqGVJEjU,13295
74
+ hud/tools/playwright_tool.py,sha256=IQT1hk5U4H8BI988iZq0B2oS_fbgkaX01Z-ZXL4r71o,13724
80
75
  hud/tools/utils.py,sha256=bfVyYMcBOJvr1QdptCjVb6jaHVGIL5WUxmY59kzMekQ,1447
81
76
  hud/tools/computer/__init__.py,sha256=ehKY7u0_4cZ9h7YQlOQjbKPWfd5LhQq8ZQn2w2-l2mY,302
82
77
  hud/tools/computer/anthropic.py,sha256=M-djQmd0vPZm95FDszaMh4wSaLFPhlcCUb-JkSuflnU,16104
83
- hud/tools/computer/hud.py,sha256=xyFYLqVoLsps0Dbs9kAfg941kXLnMHx7SL8a2skhjHw,13351
78
+ hud/tools/computer/hud.py,sha256=13_xjvf5-yO-7lYBoI44Br31CxL5EumSCQxq876h7rs,13840
84
79
  hud/tools/computer/openai.py,sha256=pcMGfoT6O8Rh9IrW_H1Mw2cIwk-FzCswrgjW19piRU8,10538
85
- hud/tools/executors/__init__.py,sha256=Ybc8mP48ps3Z2QHjYcc0Yrhmn2ZNqZF1WLvl-0lyQ_w,262
80
+ hud/tools/executors/__init__.py,sha256=jHxfus9SLhkL6YGtebR5RyKYyVAix3yu5EkUp2Q27Kg,732
86
81
  hud/tools/executors/base.py,sha256=4h04Byt4ktaNk_aLOOI798pkMCLiqA7pE2PoaEn_hfg,11647
87
- hud/tools/executors/pyautogui.py,sha256=1MWWXhyaPLeFkWXIr7pR_pii_XjDJxhpXdCCHFgF-1A,20803
82
+ hud/tools/executors/pyautogui.py,sha256=Kc2OcFw-sEuRBRFtO1ZrWeHs1p-p5FtEpESkzpRhOHk,22098
88
83
  hud/tools/executors/xdo.py,sha256=C6ecIVPUba7c6vKpgIcNxKcc698hwelQjj4YYUxT2_4,17751
89
84
  hud/tools/executors/tests/__init__.py,sha256=opFpGSH6cEqIZgt9izXd3Yt85pC7xkxiYmOZQTHf4AY,32
90
85
  hud/tools/executors/tests/test_base_executor.py,sha256=dvpKHCIjrBhT6E2U3hsjAwuivCAYXplvd08EHN6cxTI,12306
91
- hud/tools/executors/tests/test_pyautogui_executor.py,sha256=cBgTAieWVT6C9dRtdqBWtRFmayteVQEp6DNofdCYLqc,6521
86
+ hud/tools/executors/tests/test_pyautogui_executor.py,sha256=br-wVvXnRx9G6X0yJ_xeKZf2xl8o4LCnYLeaIbkpuzY,6608
92
87
  hud/tools/helper/README.md,sha256=GDS-K-wMnDO3-gtWjisgk5153zBmU29XSrs2ZhlOWQY,1727
93
88
  hud/tools/helper/__init__.py,sha256=VqgQkY-y9h-WnGXZRK387fSr1BzrOQoAy3975WDAs4c,209
94
89
  hud/tools/helper/mcp_server.py,sha256=t8UaGq91hDKef6zO3ApnJydwcKEqgLF6RdDcJ1GmfEA,2248
@@ -98,10 +93,10 @@ hud/tools/tests/__init__.py,sha256=eEYYkxX5Hz9woXVOBJ2H2_CQoEih0vH6nRt3sH2Z8v8,4
98
93
  hud/tools/tests/test_bash.py,sha256=LV3LjijwkQqxuxIXFSepD2x3sYoY4uhdw8EBv4JOyLU,4847
99
94
  hud/tools/tests/test_computer.py,sha256=HxYHxKJ0eWyZzC3abzviFBU-auc8x6Sh2ciR_uVXMXw,1595
100
95
  hud/tools/tests/test_computer_actions.py,sha256=YtUNFL7anhpXrcvg8EoUY1CqIV-TAAyaNFLZO9CiJ40,1194
101
- hud/tools/tests/test_edit.py,sha256=UpUkn-fEXyFr9dKPT7pjmZ8ASUePkPnqwVMplUHowR4,8301
96
+ hud/tools/tests/test_edit.py,sha256=_Bfh9Qc_zSYK5vS9kfhm5G9tkVvX1dsEIFqE3jkeSv0,8527
102
97
  hud/tools/tests/test_init.py,sha256=PD_SS6X6SPhEjStJqYxdJRtsa7RbL6cTokAGIn5bWhA,702
103
98
  hud/tools/tests/test_playwright_tool.py,sha256=1qED_NF2QXUZmBRbWSmcKImMLUQ3m5CbA_9tLUiaxTQ,6696
104
- hud/tools/tests/test_tools.py,sha256=fNZMv93VhkFiXRHKt5krXZAASU2IsiA1149erUMa2ek,4418
99
+ hud/tools/tests/test_tools.py,sha256=KgSPgdqldpifbHeQHBFdYJVf3boWbvK6LRRRORPfTOg,4595
105
100
  hud/tools/tests/test_utils.py,sha256=oYxEnLpSA5sEeYFGUTj74QRNv0AHP3AjmYYHXgIW0BY,5496
106
101
  hud/utils/__init__.py,sha256=oSl_gGoS272X2VFnBYX8hLxcP2xgGoBYQXAuLhtQgw8,260
107
102
  hud/utils/agent.py,sha256=CpNgjKWMaNqo-EATH_vfJHIN53rEkZngm2LXfUFlldQ,1225
@@ -116,9 +111,9 @@ hud/utils/tests/test_config.py,sha256=dPlXYWuMrxX-NOYbf0vdJ27TJpfacKG8eiKOSGOcfD
116
111
  hud/utils/tests/test_init.py,sha256=UxlNTwjlSE2q3M0R86EmMYmmXmbRvzZaC-S2av26QXI,529
117
112
  hud/utils/tests/test_progress.py,sha256=QunwDgi_heQXhDgmC25zgjr-sFUu5FdJ_1aYigMKeIc,6351
118
113
  hud/utils/tests/test_telemetry.py,sha256=t0An1RTBaE0dZVEpF4uwuq5k1R-PXFR5k4u71h60tx8,1224
119
- hud/utils/tests/test_version.py,sha256=J-6wPtkqEIDGjTCmUIylXicun-t7JdEHaJ8gcuj2mlY,159
114
+ hud/utils/tests/test_version.py,sha256=b0JOKjG9oz4PvBZxUS3hbQYWi8yTEz52VHqTlLXgSYM,159
120
115
  hud/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
121
- hud_python-0.3.0.dist-info/METADATA,sha256=fgbwnVd2CqZ4TSYMjjj9ygmhG3Yu_UnahOlqPXMcRQ4,9876
122
- hud_python-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
123
- hud_python-0.3.0.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
124
- hud_python-0.3.0.dist-info/RECORD,,
116
+ hud_python-0.3.2.dist-info/METADATA,sha256=p5VyDJbUHnHXC1cUh50DJnVF8j8NIFqjEi-bdd8vP8c,10249
117
+ hud_python-0.3.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
118
+ hud_python-0.3.2.dist-info/licenses/LICENSE,sha256=yIzBheVUf86FC1bztAcr7RYWWNxyd3B-UJQ3uddg1HA,1078
119
+ hud_python-0.3.2.dist-info/RECORD,,
hud/evaluators/__init__.py DELETED
@@ -1,9 +0,0 @@
1
- """
2
- Evaluators for assessing task responses.
3
- """
4
-
5
- from __future__ import annotations
6
-
7
- from hud.evaluators.base import Evaluator
8
-
9
- __all__ = ["Evaluator"]
hud/evaluators/base.py DELETED
@@ -1,32 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from abc import ABC, abstractmethod
4
- from typing import TYPE_CHECKING
5
-
6
- from pydantic import BaseModel, Field
7
-
8
- if TYPE_CHECKING:
9
- from hud.task import Task
10
-
11
-
12
- class EvaluationResult(BaseModel):
13
- """Result of an evaluation.
14
-
15
- Attributes:
16
- score: Float score between 0 and 1
17
- reason: Explanation of the evaluation
18
- mode: Mode used for matching, if applicable
19
- """
20
-
21
- score: float
22
- reason: str
23
- mode: str | None = None
24
- criteria_scores: dict[str, float] | None = Field(default_factory=dict)
25
-
26
-
27
- class Evaluator(ABC):
28
- """Abstract base class for evaluators."""
29
-
30
- @abstractmethod
31
- def evaluate(self, task: Task, response: str) -> EvaluationResult:
32
- """Evaluate a task and response."""
hud/evaluators/inspect.py DELETED
@@ -1,24 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any
4
-
5
- from hud.evaluators.base import EvaluationResult
6
-
7
-
8
- def inspect_evaluate(
9
- response: Any,
10
- answer: Any,
11
- ) -> EvaluationResult:
12
- """Evaluate using Inspect-ai's evaluation models.
13
-
14
- Args:
15
- response: The response to evaluate
16
- answer: The reference answer to compare against
17
- model_name: The Inspect model to use
18
- prompt: Optional custom prompt for evaluation
19
- metrics: Optional list of metrics to evaluate against
20
-
21
- Returns:
22
- EvaluationResult with the evaluation results
23
- """
24
- return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
hud/evaluators/judge.py DELETED
@@ -1,189 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import base64
5
- from typing import Any, Protocol, TypedDict
6
-
7
- from hud.evaluators.base import EvaluationResult
8
- from hud.server import make_request
9
- from hud.settings import settings
10
-
11
-
12
- class LLM(Protocol):
13
- """Protocol for LLM interfaces that can be used for evaluation."""
14
-
15
- async def ainvoke(self, prompt: str, /) -> str: ...
16
-
17
-
18
- class Criterion(TypedDict, total=False):
19
- """Criterion for judge-based evaluation."""
20
-
21
- description: str
22
- weight: float
23
-
24
-
25
- async def _call_eval_endpoint(
26
- response: Any, answer: Any, criteria: list[Any], mode: str
27
- ) -> dict[str, Any]:
28
- """Call the run_eval endpoint to evaluate the response."""
29
- try:
30
- result = await make_request(
31
- method="POST",
32
- url=f"{settings.base_url}/evaluations/run_eval",
33
- json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
34
- api_key=settings.api_key,
35
- )
36
- return result
37
- except Exception as e:
38
- # Fallback to local evaluation if remote call fails
39
- return {
40
- "score": -1.0,
41
- "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
42
- "criteria_scores": {},
43
- }
44
-
45
-
46
- def _process_input(data: Any) -> Any:
47
- """Process input data, detecting and handling base64 images."""
48
- if isinstance(data, bytes):
49
- # Convert bytes to base64 string
50
- return base64.b64encode(data).decode("utf-8")
51
-
52
- if isinstance(data, str) and _is_base64_image(data):
53
- # It's already a base64 string, just return it
54
- return data
55
-
56
- if isinstance(data, list) and all(isinstance(item, str) for item in data):
57
- # Process list of strings
58
- return data
59
-
60
- # For other types, convert to string
61
- return str(data) if not isinstance(data, str | dict) else data
62
-
63
-
64
- def _is_base64_image(data: Any) -> bool:
65
- """Check if a string is a base64 encoded image."""
66
- if not isinstance(data, str):
67
- return False
68
-
69
- # Check for common image data URI pattern
70
- if data.startswith(("data:image/", "data:application/octet-stream")):
71
- return True
72
-
73
- # Check if it's a base64 encoded string with image header
74
- try:
75
- # First, validate it's base64 decodable
76
- padding_needed = len(data) % 4
77
- if padding_needed:
78
- data += "=" * (4 - padding_needed)
79
-
80
- # Try to decode the first few bytes to check for image signatures
81
- sample = base64.b64decode(data[:30])
82
-
83
- # Check for common image format signatures
84
- return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
85
- except Exception:
86
- return False
87
-
88
-
89
- def judge(
90
- response: Any,
91
- answer: Any,
92
- llm: LLM | None = None,
93
- criteria: list[str] | list[dict] | None = None,
94
- ) -> EvaluationResult:
95
- """Judge a response against an answer using an LLM.
96
-
97
- Args:
98
- response: The response to evaluate
99
- answer: The reference answer to compare against
100
- llm: Optional langchain LLM to use for evaluation
101
- criteria: Evaluation criteria as strings or dictionaries
102
-
103
- Returns:
104
- EvaluationResult with evaluation results
105
- """
106
- # Process inputs
107
- processed_response = _process_input(response)
108
- processed_answer = _process_input(answer)
109
-
110
- # If LLM is provided, use it for evaluation
111
- if llm:
112
- return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
113
-
114
- # Otherwise, use the remote evaluation service
115
- mode = "LLM"
116
- if isinstance(answer, bytes) or _is_base64_image(answer):
117
- mode = "VLM"
118
-
119
- # Call the eval endpoint synchronously
120
- result = asyncio.run(
121
- _call_eval_endpoint(
122
- response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
123
- )
124
- )
125
-
126
- return EvaluationResult(
127
- score=result.get("score", -1.0),
128
- reason=result.get("reason", "Response evaluated"),
129
- mode=mode,
130
- criteria_scores=result.get("criteria_scores", {}),
131
- )
132
-
133
-
134
- def _evaluate_with_llm(
135
- response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
136
- ) -> EvaluationResult:
137
- """Evaluate a response against an answer using a provided LLM."""
138
- criteria_text = ""
139
- if criteria:
140
- criteria_text = "Use the following criteria:\n"
141
- for c in criteria:
142
- if isinstance(c, dict) and "description" in c:
143
- criteria_text += f"- {c['description']}\n"
144
- elif isinstance(c, str):
145
- criteria_text += f"- {c}\n"
146
-
147
- prompt = f"""Evaluate the quality of a response given a reference answer.
148
-
149
- REFERENCE ANSWER:
150
- {answer}
151
-
152
- RESPONSE TO EVALUATE:
153
- {response}
154
-
155
- {criteria_text}
156
- Rate the response on a scale from 0.0 to 1.0, where 1.0 is perfect.
157
- Provide a brief explanation for your rating.
158
- Format your answer as a JSON object with 'score' (float) and 'reason' (string) fields.
159
- """
160
-
161
- try:
162
- # Run the evaluation asynchronously
163
- result_text = asyncio.run(llm.ainvoke(prompt))
164
-
165
- # Attempt to parse JSON response
166
- import json
167
- import re
168
-
169
- # Try to extract JSON if wrapped in other text
170
- json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
171
- if json_match:
172
- json_str = json_match.group(0)
173
- result = json.loads(json_str)
174
-
175
- return EvaluationResult(
176
- score=float(result.get("score", 0.5)),
177
- reason=result.get("reason", "Evaluated with custom LLM"),
178
- mode="custom_llm",
179
- )
180
-
181
- # If can't parse as JSON, use default values
182
- return EvaluationResult(
183
- score=0.5,
184
- reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
185
- mode="custom_llm",
186
- )
187
-
188
- except Exception as e:
189
- return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
hud/evaluators/match.py DELETED
@@ -1,156 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import re
4
- from difflib import SequenceMatcher
5
- from typing import TYPE_CHECKING, Protocol
6
-
7
- from textdistance import levenshtein
8
-
9
- from hud.evaluators.base import EvaluationResult
10
-
11
- if TYPE_CHECKING:
12
- from collections.abc import Sequence
13
-
14
-
15
- class _Stringable(Protocol):
16
- def __str__(self) -> str: ...
17
-
18
-
19
- def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
20
- """Check if the answer is present within the response.
21
-
22
- Args:
23
- response: The response to evaluate
24
- answer: The expected answer
25
-
26
- Returns:
27
- EvaluationResult with score=1.0 if match, 0.0 otherwise
28
- """
29
- passed = str(answer).lower().strip() in str(response).lower().strip()
30
- return EvaluationResult(
31
- score=1.0 if passed else 0.0,
32
- reason="Exact match" if passed else "No exact match found",
33
- mode="single",
34
- )
35
-
36
-
37
- def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
38
- """Count how many expected answers are in the response.
39
-
40
- Args:
41
- response: The response to evaluate
42
- answers: List of expected answers
43
-
44
- Returns:
45
- EvaluationResult with score=proportion of matches (0.0-1.0)
46
- """
47
- response_str = str(response).lower()
48
- matches = 0
49
-
50
- for answer in answers:
51
- if str(answer).lower() in response_str:
52
- matches += 1
53
-
54
- score = matches / len(answers) if answers else 0.0
55
-
56
- if matches == len(answers):
57
- reason = f"All {matches} expected items found"
58
- else:
59
- reason = f"Only {matches} of {len(answers)} expected items found"
60
-
61
- return EvaluationResult(score=score, reason=reason, mode="all")
62
-
63
-
64
- def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
65
- """Calculate similarity using Levenshtein distance.
66
-
67
- Args:
68
- response: The response to evaluate
69
- answer: The expected answer
70
-
71
- Returns:
72
- EvaluationResult with score=similarity (0.0-1.0)
73
- """
74
- s1 = str(response).lower()
75
- s2 = str(answer).lower()
76
-
77
- if s1 == s2:
78
- score = 1.0
79
- elif len(s1) == 0 or len(s2) == 0:
80
- score = 0.0
81
- else:
82
- # Use Levenshtein distance
83
- distance = levenshtein.distance(s1, s2)
84
- max_len = max(len(s1), len(s2))
85
- score = 1.0 - (distance / max_len)
86
-
87
- return EvaluationResult(
88
- score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
89
- )
90
-
91
-
92
- def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
93
- """Check if response matches regex pattern.
94
-
95
- Args:
96
- response: The response to evaluate
97
- pattern: Regular expression pattern to match
98
-
99
- Returns:
100
- EvaluationResult with score=1.0 if match, 0.0 otherwise
101
- """
102
- try:
103
- regex = re.compile(pattern, re.DOTALL)
104
- passed = bool(regex.search(str(response)))
105
- return EvaluationResult(
106
- score=1.0 if passed else 0.0,
107
- reason="Regex pattern matched" if passed else "Regex pattern did not match",
108
- mode="regex",
109
- )
110
- except re.error:
111
- return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
112
-
113
-
114
- def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
115
- """Compare difference between response and answer.
116
-
117
- Args:
118
- response: The response to evaluate
119
- answer: The expected answer
120
-
121
- Returns:
122
- EvaluationResult with score=similarity (0.0-1.0)
123
- """
124
- if isinstance(response, int | float) and isinstance(answer, int | float):
125
- score = _match_numeric_diff(response, answer)
126
- reason = f"Numeric difference: {abs(response - answer)}"
127
- else:
128
- score = _match_string_diff(response, answer)
129
- reason = f"String difference with {score:.1%} similarity"
130
-
131
- return EvaluationResult(score=score, reason=reason, mode="diff")
132
-
133
-
134
- def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
135
- """Compare difference between response and answer strings."""
136
- matcher = SequenceMatcher(None, str(response), str(answer))
137
- return matcher.ratio()
138
-
139
-
140
- def _match_numeric_diff(response: float, answer: float) -> float:
141
- """Calculate normalized difference between numeric values.
142
-
143
- Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
144
- """
145
- if response == answer:
146
- return 1.0
147
-
148
- # Simple absolute difference normalized to a 0-1 scale
149
- diff = abs(response - answer)
150
- max_val = max(abs(response), abs(answer))
151
-
152
- if max_val == 0:
153
- return 1.0 # Both are zero
154
-
155
- # Normalize and invert so 1.0 means identical
156
- return max(0.0, 1.0 - min(1.0, diff / max_val))