hud-python 0.2.2 → 0.2.4 (py3-none-any.whl)
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +4 -3
- hud/adapters/claude/adapter.py +5 -14
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -3
- hud/adapters/operator/adapter.py +16 -23
- hud/agent/__init__.py +8 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +32 -26
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +39 -32
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +9 -7
- hud/job.py +179 -109
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +9 -19
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +12 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +36 -15
- hud/utils/config.py +45 -30
- hud/utils/progress.py +34 -21
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
- hud_python-0.2.4.dist-info/RECORD +62 -0
- hud_python-0.2.2.dist-info/RECORD +0 -46
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
hud/env/remote_client.py
CHANGED
@@ -5,23 +5,25 @@ from base64 import b64decode
 from typing import TYPE_CHECKING, Any

 from hud.env.client import Client
+from hud.exceptions import HudResponseError
 from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
 from hud.utils import ExecuteResult

 if TYPE_CHECKING:
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig

 logger = logging.getLogger("hud.env.remote_env_client")

+
 class RemoteClient(Client):
     """
     Remote environment client implementation.
-
+
     Uses the HUD API to manage a remote environment.
     """
-
+
     @classmethod
     async def create(
         cls,
@@ -33,21 +35,23 @@ class RemoteClient(Client):
     ) -> tuple[RemoteClient, dict[str, Any]]:
         """
         Creates a remote environment client from a dockerfile or gym_id.
-
+
         Args:
             dockerfile: The dockerfile content to build the environment
             gym_id: The gym_id of the environment to create
             metadata: Metadata to associate with the environment
-
+
         Returns:
-
+            A tuple containing the remote environment client and the build metadata
+
+        Raises:
+            HudResponseError: If the environment creation is successful but the response is invalid.
         """

         # Validate arguments
         if metadata is None:
             metadata = {}

-
         request_data = {
             # still named run_id for backwards compatibility
             "run_id": job_id,
@@ -63,33 +67,38 @@ class RemoteClient(Client):
             json=request_data,
             api_key=settings.api_key,
         )
-
+
         # Get the environment ID from the response
         env_id = response.get("id")
         if not env_id:
-            raise
-
+            raise HudResponseError(
+                message="Failed to create remote environment: No ID returned in API response. "
+                "Please contact support if this issue persists.",
+                response_json=response,
+            )
+
         # Create the controller instance
         controller = cls(env_id)
-
+
         build_data = response.get("metadata", {})
-
+
         if response.get("readme"):
-            logger.info(
-
-
+            logger.info(
+                "[HUD] %s gym created, see how to use it at %s", gym_id, response.get("readme")
+            )
+
         return controller, build_data

     def __init__(self, env_id: str) -> None:
         """
         Initialize the RemoteClient.
-
+
         Args:
             env_id: ID of the remote environment to control
         """
         super().__init__()
         self._env_id = env_id
-
+
     @property
     def env_id(self) -> str:
         """The ID of the remote environment."""
@@ -98,7 +107,7 @@ class RemoteClient(Client):
     async def get_status(self) -> EnvironmentStatus:
         """
         Get the current status of the remote environment.
-
+
         Returns:
             EnvironmentStatus: The current status of the environment
         """
@@ -111,7 +120,7 @@ class RemoteClient(Client):
             logger.debug("Environment status response: %s", response)

             status = response.get("state", "").lower()
-
+
             if status == "running":
                 return EnvironmentStatus.RUNNING
             elif status == "initializing" or status == "pending":
@@ -122,12 +131,12 @@ class RemoteClient(Client):
                 # Any other status is considered an error
                 logger.warning("Abnormal environment status response: %s", response)
                 return EnvironmentStatus.ERROR
-
+
         except Exception:
             # If we can't connect to the API or there's any other error
             logger.info("(potentially transient) Error getting environment status")
             return EnvironmentStatus.ERROR
-
+
     async def execute(
         self,
         command: list[str],
@@ -138,11 +147,11 @@ class RemoteClient(Client):
         """
         Execute a command in the environment.
         No-op in some environments (like browser use).
-
+
         Args:
             command: Command to execute
             workdir: Working directory for the command (ignored for remote environments)
-
+
         Returns:
             ExecuteResult: Result of the command execution
         """
@@ -150,21 +159,20 @@ class RemoteClient(Client):
             method="POST",
             url=f"{settings.base_url}/v2/environments/{self.env_id}/execute",
             json={
-
-
-
+                "command": command,
+                "workdir": workdir,
+                "timeout": timeout,
             },
             api_key=settings.api_key,
         )
-
+
         return ExecuteResult(
             stdout=b64decode(data["stdout"]),
             stderr=b64decode(data["stderr"]),
-            exit_code=data["exit_code"]
+            exit_code=data["exit_code"],
         )

-
-    async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
+    async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
         """
         Invoke a function in the environment.
         """
@@ -174,9 +182,8 @@ class RemoteClient(Client):
             json=config.model_dump(),
             api_key=settings.api_key,
         )
-
-        return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])

+        return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])

     async def close(self) -> None:
         """
hud/env/remote_docker_client.py
CHANGED
@@ -5,6 +5,7 @@ from base64 import b64decode, b64encode
 from typing import Any

 from hud.env.docker_client import DockerClient
+from hud.exceptions import HudResponseError
 from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
@@ -39,7 +40,10 @@ class RemoteDockerClient(DockerClient):
             metadata: Metadata to associate with the environment

         Returns:
-
+            A tuple containing the remote environment client and the build metadata
+
+        Raises:
+            HudResponseError: If the environment creation fails.
         """

         # Validate arguments
@@ -48,7 +52,7 @@ class RemoteDockerClient(DockerClient):

         logger.info("Creating remote environment")

-        true_gym_id = await get_gym_id("
+        true_gym_id = await get_gym_id("docker")

         # augment metadata with dockerfile
         if "environment_config" not in metadata:
@@ -73,7 +77,13 @@ class RemoteDockerClient(DockerClient):
         # Get the environment ID from the response
         env_id = response.get("id")
         if not env_id:
-            raise
+            raise HudResponseError(
+                message=(
+                    "Failed to create remote environment: No ID returned in API response. "
+                    "Please contact support if this issue persists."
+                ),
+                response_json=response,
+            )

         # Create the controller instance
         controller = cls(env_id)
hud/evaluators/__init__.py
CHANGED
hud/evaluators/base.py
CHANGED
@@ -11,21 +11,22 @@ if TYPE_CHECKING:

 class EvaluationResult(BaseModel):
     """Result of an evaluation.
-
+
     Attributes:
         score: Float score between 0 and 1
         reason: Explanation of the evaluation
         mode: Mode used for matching, if applicable
     """
-
+
     score: float
     reason: str
     mode: str | None = None
     criteria_scores: dict[str, float] | None = Field(default_factory=dict)

+
 class Evaluator(ABC):
     """Abstract base class for evaluators."""
-
+
     @abstractmethod
     def evaluate(self, task: Task, response: str) -> EvaluationResult:
        """Evaluate a task and response."""
hud/evaluators/inspect.py
CHANGED
@@ -10,20 +10,15 @@ def inspect_evaluate(
     answer: Any,
 ) -> EvaluationResult:
     """Evaluate using Inspect-ai's evaluation models.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         model_name: The Inspect model to use
         prompt: Optional custom prompt for evaluation
         metrics: Optional list of metrics to evaluate against
-
+
     Returns:
         EvaluationResult with the evaluation results
     """
-    return EvaluationResult(
-        score=0.0,
-        reason="Inspect evaluation not implemented",
-        mode="inspect"
-    )
-
+    return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
hud/evaluators/judge.py
CHANGED
@@ -11,33 +11,26 @@ from hud.settings import settings

 class LLM(Protocol):
     """Protocol for LLM interfaces that can be used for evaluation."""
-
+
+    async def ainvoke(self, prompt: str, /) -> str: ...


 class Criterion(TypedDict, total=False):
     """Criterion for judge-based evaluation."""
-
+
     description: str
     weight: float


 async def _call_eval_endpoint(
-    response: Any,
-    answer: Any,
-    criteria: list[Any],
-    mode: str
+    response: Any, answer: Any, criteria: list[Any], mode: str
 ) -> dict[str, Any]:
     """Call the run_eval endpoint to evaluate the response."""
     try:
         result = await make_request(
             method="POST",
             url=f"{settings.base_url}/evaluations/run_eval",
-            json={
-                "response": response,
-                "answer": answer,
-                "criteria": criteria,
-                "mode": mode
-            },
+            json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
             api_key=settings.api_key,
         )
         return result
@@ -46,31 +39,24 @@ async def _call_eval_endpoint(
         return {
             "score": -1.0,
             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-            "criteria_scores": {}
+            "criteria_scores": {},
         }


-def _determine_mode(answer: Any) -> str:
-    """Determine the evaluation mode based on answer type."""
-    if isinstance(answer, bytes) or _is_base64_image(answer):
-        return "VLM"
-    return "LLM"
-
-
 def _process_input(data: Any) -> Any:
     """Process input data, detecting and handling base64 images."""
     if isinstance(data, bytes):
         # Convert bytes to base64 string
         return base64.b64encode(data).decode("utf-8")
-
+
     if isinstance(data, str) and _is_base64_image(data):
         # It's already a base64 string, just return it
         return data
-
+
     if isinstance(data, list) and all(isinstance(item, str) for item in data):
         # Process list of strings
         return data
-
+
     # For other types, convert to string
     return str(data) if not isinstance(data, str | dict) else data

@@ -79,11 +65,11 @@ def _is_base64_image(data: Any) -> bool:
     """Check if a string is a base64 encoded image."""
     if not isinstance(data, str):
         return False
-
+
     # Check for common image data URI pattern
     if data.startswith(("data:image/", "data:application/octet-stream")):
         return True
-
+
     # Check if it's a base64 encoded string with image header
     try:
         # First, validate it's base64 decodable
@@ -95,9 +81,7 @@ def _is_base64_image(data: Any) -> bool:
         sample = base64.b64decode(data[:30])

         # Check for common image format signatures
-        return (
-            sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-        )
+        return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
     except Exception:
         return False

@@ -109,50 +93,46 @@ def judge(
     criteria: list[str] | list[dict] | None = None,
 ) -> EvaluationResult:
     """Judge a response against an answer using an LLM.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         llm: Optional langchain LLM to use for evaluation
         criteria: Evaluation criteria as strings or dictionaries
-
+
     Returns:
         EvaluationResult with evaluation results
     """
     # Process inputs
     processed_response = _process_input(response)
     processed_answer = _process_input(answer)
-
+
     # If LLM is provided, use it for evaluation
     if llm:
         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
+
     # Otherwise, use the remote evaluation service
     mode = "LLM"
     if isinstance(answer, bytes) or _is_base64_image(answer):
         mode = "VLM"
-
+
     # Call the eval endpoint synchronously
-    result = asyncio.run(
-
-
-
-
-
-
+    result = asyncio.run(
+        _call_eval_endpoint(
+            response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
+        )
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Response evaluated"),
         mode=mode,
-        criteria_scores=result.get("criteria_scores", {})
+        criteria_scores=result.get("criteria_scores", {}),
     )


 def _evaluate_with_llm(
-    response: Any,
-    answer: Any,
-    llm: LLM,
-    criteria: list[str] | list[dict] | None = None
+    response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
 ) -> EvaluationResult:
     """Evaluate a response against an answer using a provided LLM."""
     criteria_text = ""
@@ -163,7 +143,7 @@ def _evaluate_with_llm(
             criteria_text += f"- {c['description']}\n"
         elif isinstance(c, str):
             criteria_text += f"- {c}\n"
-
+
     prompt = f"""Evaluate the quality of a response given a reference answer.

 REFERENCE ANSWER:
@@ -181,33 +161,29 @@ Format your answer as a JSON object with 'score' (float) and 'reason' (string) f
     try:
         # Run the evaluation asynchronously
         result_text = asyncio.run(llm.ainvoke(prompt))
-
+
         # Attempt to parse JSON response
         import json
         import re
-
+
         # Try to extract JSON if wrapped in other text
         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
         if json_match:
             json_str = json_match.group(0)
             result = json.loads(json_str)
-
+
             return EvaluationResult(
                 score=float(result.get("score", 0.5)),
                 reason=result.get("reason", "Evaluated with custom LLM"),
-                mode="custom_llm"
+                mode="custom_llm",
             )
-
+
         # If can't parse as JSON, use default values
         return EvaluationResult(
             score=0.5,
             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-            mode="custom_llm"
+            mode="custom_llm",
         )
-
+
     except Exception as e:
-        return EvaluationResult(
-            score=0.0,
-            reason=f"LLM evaluation error: {e!s}",
-            mode="custom_llm"
-        )
+        return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")