tsugite-cli 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsugite/__init__.py +6 -0
- tsugite/agent_composition.py +163 -0
- tsugite/agent_inheritance.py +479 -0
- tsugite/agent_preparation.py +236 -0
- tsugite/agent_runner/__init__.py +45 -0
- tsugite/agent_runner/helpers.py +106 -0
- tsugite/agent_runner/history_integration.py +248 -0
- tsugite/agent_runner/metrics.py +100 -0
- tsugite/agent_runner/runner.py +1879 -0
- tsugite/agent_runner/validation.py +70 -0
- tsugite/agent_utils.py +167 -0
- tsugite/attachments/__init__.py +65 -0
- tsugite/attachments/auto_context.py +199 -0
- tsugite/attachments/base.py +34 -0
- tsugite/attachments/file.py +51 -0
- tsugite/attachments/inline.py +31 -0
- tsugite/attachments/storage.py +178 -0
- tsugite/attachments/url.py +59 -0
- tsugite/attachments/youtube.py +101 -0
- tsugite/benchmark/__init__.py +62 -0
- tsugite/benchmark/config.py +183 -0
- tsugite/benchmark/core.py +292 -0
- tsugite/benchmark/discovery.py +377 -0
- tsugite/benchmark/evaluators.py +671 -0
- tsugite/benchmark/execution.py +657 -0
- tsugite/benchmark/metrics.py +204 -0
- tsugite/benchmark/reports.py +420 -0
- tsugite/benchmark/utils.py +288 -0
- tsugite/builtin_agents/chat-assistant.md +53 -0
- tsugite/builtin_agents/default.md +140 -0
- tsugite/builtin_agents.py +5 -0
- tsugite/cache.py +195 -0
- tsugite/cli/__init__.py +1042 -0
- tsugite/cli/agents.py +148 -0
- tsugite/cli/attachments.py +193 -0
- tsugite/cli/benchmark.py +663 -0
- tsugite/cli/cache.py +113 -0
- tsugite/cli/config.py +272 -0
- tsugite/cli/helpers.py +534 -0
- tsugite/cli/history.py +193 -0
- tsugite/cli/init.py +387 -0
- tsugite/cli/mcp.py +193 -0
- tsugite/cli/tools.py +419 -0
- tsugite/config.py +204 -0
- tsugite/console.py +48 -0
- tsugite/constants.py +21 -0
- tsugite/core/__init__.py +19 -0
- tsugite/core/agent.py +774 -0
- tsugite/core/executor.py +300 -0
- tsugite/core/memory.py +67 -0
- tsugite/core/tools.py +271 -0
- tsugite/docker_cli.py +270 -0
- tsugite/events/__init__.py +55 -0
- tsugite/events/base.py +46 -0
- tsugite/events/bus.py +62 -0
- tsugite/events/events.py +224 -0
- tsugite/exceptions.py +40 -0
- tsugite/history/__init__.py +29 -0
- tsugite/history/index.py +210 -0
- tsugite/history/models.py +106 -0
- tsugite/history/storage.py +157 -0
- tsugite/mcp_client.py +219 -0
- tsugite/mcp_config.py +174 -0
- tsugite/md_agents.py +751 -0
- tsugite/models.py +257 -0
- tsugite/renderer.py +151 -0
- tsugite/shell_tool_config.py +265 -0
- tsugite/templates/assistant.md +14 -0
- tsugite/tools/__init__.py +265 -0
- tsugite/tools/agents.py +312 -0
- tsugite/tools/edit_strategies.py +393 -0
- tsugite/tools/fs.py +329 -0
- tsugite/tools/http.py +239 -0
- tsugite/tools/interactive.py +430 -0
- tsugite/tools/shell.py +129 -0
- tsugite/tools/shell_tools.py +214 -0
- tsugite/tools/tasks.py +339 -0
- tsugite/tsugite.py +7 -0
- tsugite/ui/__init__.py +46 -0
- tsugite/ui/base.py +638 -0
- tsugite/ui/chat.py +265 -0
- tsugite/ui/chat.tcss +92 -0
- tsugite/ui/chat_history.py +286 -0
- tsugite/ui/helpers.py +102 -0
- tsugite/ui/jsonl.py +125 -0
- tsugite/ui/live_template.py +529 -0
- tsugite/ui/plain.py +419 -0
- tsugite/ui/textual_chat.py +642 -0
- tsugite/ui/textual_handler.py +225 -0
- tsugite/ui/widgets/__init__.py +6 -0
- tsugite/ui/widgets/base_scroll_log.py +27 -0
- tsugite/ui/widgets/message_list.py +121 -0
- tsugite/ui/widgets/thought_log.py +80 -0
- tsugite/ui_context.py +90 -0
- tsugite/utils.py +367 -0
- tsugite/xdg.py +104 -0
- tsugite_cli-0.3.3.dist-info/METADATA +325 -0
- tsugite_cli-0.3.3.dist-info/RECORD +101 -0
- tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
- tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
- tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
"""Evaluators for different aspects of benchmark test results."""
|
|
2
|
+
|
|
3
|
+
import difflib
|
|
4
|
+
import json
|
|
5
|
+
import re
|
|
6
|
+
from abc import ABC, abstractmethod
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
from .config import (
|
|
10
|
+
MODEL_COSTS,
|
|
11
|
+
SIMILARITY_THRESHOLDS,
|
|
12
|
+
get_cost_tier,
|
|
13
|
+
)
|
|
14
|
+
from .utils import json_similarity, normalize_code
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BaseEvaluator(ABC):
    """Abstract contract shared by the evaluators in this module.

    Subclasses must provide :meth:`evaluate`; the base class itself cannot be
    instantiated because that method is abstract.
    """

    @abstractmethod
    def evaluate(self, **kwargs) -> Dict[str, Any]:
        """Evaluate the given inputs and return a score and metrics.

        Args:
            **kwargs: Evaluator-specific inputs; each subclass declares its own.

        Returns:
            Dictionary of results. The base implementation has no body and
            yields ``None`` if invoked directly.
        """
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class CorrectnessEvaluator(BaseEvaluator):
|
|
27
|
+
"""Evaluates correctness of outputs against expected results."""
|
|
28
|
+
|
|
29
|
+
def evaluate(self, output: str, expected: str, output_type: str = "string") -> Dict[str, Any]:
    """Compare ``output`` with ``expected`` using the strategy for ``output_type``.

    Args:
        output: Text produced by the run under test.
        expected: Reference text to compare against.
        output_type: "string", "json", "code" or "number"; any other value
            is handled by the string strategy.

    Returns:
        Dictionary seeded with "passed", "score", "similarity",
        "exact_match" and "error", overlaid with whatever the chosen
        strategy returns. If the strategy raises, the seeded defaults are
        kept and "error" carries the exception text.
    """
    verdict: Dict[str, Any] = {
        "passed": False,
        "score": 0.0,
        "similarity": 0.0,
        "exact_match": False,
        "error": None,
    }

    # (output_type, strategy method name); the string strategy is the fallback.
    strategies = (
        ("string", "_evaluate_string"),
        ("json", "_evaluate_json"),
        ("code", "_evaluate_code"),
        ("number", "_evaluate_number"),
    )

    try:
        chosen = "_evaluate_string"
        for kind, method_name in strategies:
            if output_type == kind:
                chosen = method_name
                break
        verdict.update(getattr(self, chosen)(output, expected))
    except Exception as exc:
        # Any strategy failure is reported through the result, never raised.
        verdict["error"] = str(exc)

    return verdict
|
|
55
|
+
|
|
56
|
+
def _evaluate_string(self, output: str, expected: str) -> Dict[str, Any]:
|
|
57
|
+
"""Evaluate string output."""
|
|
58
|
+
output_clean = output.strip()
|
|
59
|
+
expected_clean = expected.strip()
|
|
60
|
+
|
|
61
|
+
exact_match = output_clean == expected_clean
|
|
62
|
+
similarity = difflib.SequenceMatcher(None, output_clean.lower(), expected_clean.lower()).ratio()
|
|
63
|
+
|
|
64
|
+
# Consider it passed if exact match or very high similarity
|
|
65
|
+
passed = exact_match or similarity >= SIMILARITY_THRESHOLDS.string_high_similarity
|
|
66
|
+
|
|
67
|
+
return {
|
|
68
|
+
"passed": passed,
|
|
69
|
+
"score": 1.0 if exact_match else similarity,
|
|
70
|
+
"similarity": similarity,
|
|
71
|
+
"exact_match": exact_match,
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
def _evaluate_json(self, output: str, expected: str) -> Dict[str, Any]:
|
|
75
|
+
"""Evaluate JSON output."""
|
|
76
|
+
try:
|
|
77
|
+
output_json = json.loads(output.strip())
|
|
78
|
+
expected_json = json.loads(expected.strip())
|
|
79
|
+
|
|
80
|
+
exact_match = output_json == expected_json
|
|
81
|
+
score = 1.0 if exact_match else json_similarity(output_json, expected_json)
|
|
82
|
+
|
|
83
|
+
return {
|
|
84
|
+
"passed": exact_match or score >= SIMILARITY_THRESHOLDS.json_similarity,
|
|
85
|
+
"score": score,
|
|
86
|
+
"similarity": score,
|
|
87
|
+
"exact_match": exact_match,
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
except json.JSONDecodeError as e:
|
|
91
|
+
return {
|
|
92
|
+
"passed": False,
|
|
93
|
+
"score": 0.0,
|
|
94
|
+
"similarity": 0.0,
|
|
95
|
+
"exact_match": False,
|
|
96
|
+
"error": f"JSON decode error: {e}",
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
def _evaluate_code(self, output: str, expected: str) -> Dict[str, Any]:
    """Score two code snippets after passing both through ``normalize_code``.

    Args:
        output: Produced code.
        expected: Reference code.

    Returns:
        Dictionary with "passed", "score", "similarity" and "exact_match".
        Non-identical snippets pass when their ``difflib`` ratio reaches
        ``SIMILARITY_THRESHOLDS.code_similarity``.
    """
    # normalize_code is defined in .utils; presumably it strips comments and
    # normalises whitespace (per the original inline note) - confirm there.
    got = normalize_code(output)
    want = normalize_code(expected)

    identical = got == want
    ratio = difflib.SequenceMatcher(None, got, want).ratio()

    verdict: Dict[str, Any] = {}
    verdict["passed"] = identical or ratio >= SIMILARITY_THRESHOLDS.code_similarity
    verdict["score"] = 1.0 if identical else ratio
    verdict["similarity"] = ratio
    verdict["exact_match"] = identical
    return verdict
|
|
114
|
+
|
|
115
|
+
def _evaluate_number(self, output: str, expected: str) -> Dict[str, Any]:
|
|
116
|
+
"""Evaluate numeric output."""
|
|
117
|
+
try:
|
|
118
|
+
# Extract numbers from strings
|
|
119
|
+
output_nums = re.findall(r"-?\d+\.?\d*", output.strip())
|
|
120
|
+
expected_nums = re.findall(r"-?\d+\.?\d*", expected.strip())
|
|
121
|
+
|
|
122
|
+
if not output_nums or not expected_nums:
|
|
123
|
+
return {
|
|
124
|
+
"passed": False,
|
|
125
|
+
"score": 0.0,
|
|
126
|
+
"similarity": 0.0,
|
|
127
|
+
"exact_match": False,
|
|
128
|
+
"error": "No numbers found",
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
output_val = float(output_nums[0])
|
|
132
|
+
expected_val = float(expected_nums[0])
|
|
133
|
+
|
|
134
|
+
exact_match = abs(output_val - expected_val) < 1e-10
|
|
135
|
+
relative_error = abs(output_val - expected_val) / max(abs(expected_val), 1e-10)
|
|
136
|
+
score = max(0.0, 1.0 - relative_error)
|
|
137
|
+
|
|
138
|
+
return {
|
|
139
|
+
"passed": exact_match or relative_error < 0.01,
|
|
140
|
+
"score": score,
|
|
141
|
+
"similarity": score,
|
|
142
|
+
"exact_match": exact_match,
|
|
143
|
+
"relative_error": relative_error,
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
except (ValueError, IndexError) as e:
|
|
147
|
+
return {
|
|
148
|
+
"passed": False,
|
|
149
|
+
"score": 0.0,
|
|
150
|
+
"similarity": 0.0,
|
|
151
|
+
"exact_match": False,
|
|
152
|
+
"error": f"Number evaluation error: {e}",
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class PerformanceEvaluator(BaseEvaluator):
|
|
157
|
+
"""Evaluates performance metrics like speed and efficiency."""
|
|
158
|
+
|
|
159
|
+
def evaluate(self, duration: float, timeout: float, baseline_duration: Optional[float] = None) -> Dict[str, Any]:
    """Evaluate performance based on duration.

    Args:
        duration: Measured run time.
        timeout: Time budget for the run. A value of 0 or less means there
            is no budget to normalise against.
        baseline_duration: Optional reference duration; ignored when falsy.

    Returns:
        Dictionary with "duration", "timeout", "timed_out", "speed_score"
        and "efficiency_tier". When ``baseline_duration`` is given it also
        holds "improvement_over_baseline" and "relative_speed".
    """
    has_timeout = timeout > 0
    # Fraction of the budget consumed; 0 when there is no budget.
    time_ratio = duration / timeout if has_timeout else 0

    if has_timeout:
        speed_score = max(0.0, 1.0 - min(time_ratio, 1.0))
        # Small bonus for runs that use only a fraction of the budget.
        if time_ratio <= 0.1:
            speed_score = min(1.0, speed_score + 0.05)
        elif time_ratio <= 0.3:
            speed_score = min(1.0, speed_score + 0.02)
    else:
        speed_score = 1.0 if duration < 1.0 else max(0.0, 1.0 / duration)

    if time_ratio <= 0.1:
        tier = "Excellent"
    elif time_ratio <= 0.3:
        tier = "Good"
    elif time_ratio <= 0.6:
        tier = "Fair"
    elif time_ratio <= 1.0:
        tier = "Poor"
    else:
        tier = "Timeout"

    result = {
        "duration": duration,
        "timeout": timeout,
        # A run can only time out against a positive budget. Previously any
        # run with timeout <= 0 was reported as timed out.
        "timed_out": has_timeout and duration >= timeout,
        "speed_score": speed_score,
        "efficiency_tier": tier,
    }

    if baseline_duration:
        result["improvement_over_baseline"] = max(0.0, (baseline_duration - duration) / baseline_duration)
        result["relative_speed"] = baseline_duration / duration if duration > 0 else float("inf")

    return result
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class QualityEvaluator(BaseEvaluator):
|
|
210
|
+
"""Evaluates quality of outputs using criteria-based assessment."""
|
|
211
|
+
|
|
212
|
+
async def evaluate(self, output: str, criteria: Dict[str, Any]) -> Dict[str, Any]:
    """Score ``output`` as the weighted mean of its per-criterion scores.

    Args:
        output: Text to assess.
        criteria: Mapping of criterion name to its configuration dict. Each
            configuration may carry a "weight" (default 1.0).

    Returns:
        Dictionary with "score", "criteria_scores" (per-criterion values),
        "overall_quality" (a tier label) and "feedback" (always an empty
        list here). The score is 0.0 when the total weight is not positive.
    """
    per_criterion: Dict[str, float] = {}
    weighted_sum = 0.0
    weight_sum = 0.0

    for name, settings in criteria.items():
        weight = settings.get("weight", 1.0)
        value = await self._evaluate_criterion(output, name, settings)

        per_criterion[name] = value
        weighted_sum += value * weight
        weight_sum += weight

    overall = weighted_sum / weight_sum if weight_sum > 0 else 0.0

    # Tiers are checked from best to worst; anything under 0.4 is "Very Poor".
    label = "Very Poor"
    for floor, tier in ((0.9, "Excellent"), (0.75, "Good"), (0.6, "Fair"), (0.4, "Poor")):
        if overall >= floor:
            label = tier
            break

    return {
        "score": overall,
        "criteria_scores": per_criterion,
        "overall_quality": label,
        "feedback": [],
    }
|
|
255
|
+
|
|
256
|
+
async def _evaluate_criterion(self, output: str, criterion: str, config: Dict[str, Any]) -> float:
|
|
257
|
+
"""Evaluate a specific quality criterion."""
|
|
258
|
+
criterion_type = config.get("type", "keyword")
|
|
259
|
+
|
|
260
|
+
if criterion_type == "keyword":
|
|
261
|
+
return self._evaluate_keyword_presence(output, config)
|
|
262
|
+
elif criterion_type == "length":
|
|
263
|
+
return self._evaluate_length(output, config)
|
|
264
|
+
elif criterion_type == "format":
|
|
265
|
+
return self._evaluate_format(output, config)
|
|
266
|
+
elif criterion_type == "sentiment":
|
|
267
|
+
return self._evaluate_sentiment(output, config)
|
|
268
|
+
else:
|
|
269
|
+
# Default to simple keyword check
|
|
270
|
+
return self._evaluate_keyword_presence(output, config)
|
|
271
|
+
|
|
272
|
+
def _evaluate_keyword_presence(self, output: str, config: Dict[str, Any]) -> float:
|
|
273
|
+
"""Evaluate based on keyword presence."""
|
|
274
|
+
required_keywords = config.get("keywords", [])
|
|
275
|
+
if not required_keywords:
|
|
276
|
+
return 1.0
|
|
277
|
+
|
|
278
|
+
output_lower = output.lower()
|
|
279
|
+
found_keywords = sum(1 for keyword in required_keywords if keyword.lower() in output_lower)
|
|
280
|
+
|
|
281
|
+
return found_keywords / len(required_keywords)
|
|
282
|
+
|
|
283
|
+
def _evaluate_length(self, output: str, config: Dict[str, Any]) -> float:
|
|
284
|
+
"""Evaluate based on output length."""
|
|
285
|
+
min_length = config.get("min_length", 0)
|
|
286
|
+
max_length = config.get("max_length", float("inf"))
|
|
287
|
+
optimal_length = config.get("optimal_length")
|
|
288
|
+
|
|
289
|
+
length = len(output.strip())
|
|
290
|
+
|
|
291
|
+
if length < min_length:
|
|
292
|
+
return 0.0
|
|
293
|
+
elif length > max_length:
|
|
294
|
+
return max(0.0, 1.0 - (length - max_length) / max_length)
|
|
295
|
+
elif optimal_length:
|
|
296
|
+
# Score based on distance from optimal
|
|
297
|
+
distance = abs(length - optimal_length)
|
|
298
|
+
return max(0.0, 1.0 - distance / optimal_length)
|
|
299
|
+
else:
|
|
300
|
+
return 1.0
|
|
301
|
+
|
|
302
|
+
def _evaluate_format(self, output: str, config: Dict[str, Any]) -> float:
|
|
303
|
+
"""Evaluate based on format requirements."""
|
|
304
|
+
format_type = config.get("format", "text")
|
|
305
|
+
|
|
306
|
+
if format_type == "json":
|
|
307
|
+
try:
|
|
308
|
+
json.loads(output.strip())
|
|
309
|
+
return 1.0
|
|
310
|
+
except json.JSONDecodeError:
|
|
311
|
+
return 0.0
|
|
312
|
+
|
|
313
|
+
elif format_type == "code":
|
|
314
|
+
# Simple check for code-like structure
|
|
315
|
+
has_keywords = any(
|
|
316
|
+
keyword in output for keyword in ["def ", "class ", "import ", "function", "var ", "let "]
|
|
317
|
+
)
|
|
318
|
+
has_structure = any(char in output for char in ["{", "}", "(", ")", "[", "]"])
|
|
319
|
+
return 1.0 if has_keywords or has_structure else 0.5
|
|
320
|
+
|
|
321
|
+
elif format_type == "markdown":
|
|
322
|
+
# Check for markdown elements
|
|
323
|
+
has_headers = bool(re.search(r"^#{1,6}\s", output, re.MULTILINE))
|
|
324
|
+
has_formatting = bool(re.search(r"\*\*.*\*\*|\*.*\*|`.*`", output))
|
|
325
|
+
return 1.0 if has_headers or has_formatting else 0.5
|
|
326
|
+
|
|
327
|
+
else:
|
|
328
|
+
return 1.0 # Default to passing for unknown formats
|
|
329
|
+
|
|
330
|
+
def _evaluate_sentiment(self, output: str, config: Dict[str, Any]) -> float:
|
|
331
|
+
"""Evaluate sentiment (simplified implementation)."""
|
|
332
|
+
expected_sentiment = config.get("sentiment", "neutral")
|
|
333
|
+
|
|
334
|
+
# Simple keyword-based sentiment analysis
|
|
335
|
+
positive_words = [
|
|
336
|
+
"good",
|
|
337
|
+
"great",
|
|
338
|
+
"excellent",
|
|
339
|
+
"wonderful",
|
|
340
|
+
"amazing",
|
|
341
|
+
"positive",
|
|
342
|
+
"success",
|
|
343
|
+
]
|
|
344
|
+
negative_words = [
|
|
345
|
+
"bad",
|
|
346
|
+
"terrible",
|
|
347
|
+
"awful",
|
|
348
|
+
"horrible",
|
|
349
|
+
"negative",
|
|
350
|
+
"failure",
|
|
351
|
+
"error",
|
|
352
|
+
]
|
|
353
|
+
|
|
354
|
+
output_lower = output.lower()
|
|
355
|
+
positive_count = sum(1 for word in positive_words if word in output_lower)
|
|
356
|
+
negative_count = sum(1 for word in negative_words if word in output_lower)
|
|
357
|
+
|
|
358
|
+
if expected_sentiment == "positive":
|
|
359
|
+
return 1.0 if positive_count > negative_count else 0.5
|
|
360
|
+
elif expected_sentiment == "negative":
|
|
361
|
+
return 1.0 if negative_count > positive_count else 0.5
|
|
362
|
+
else: # neutral
|
|
363
|
+
return 1.0 if abs(positive_count - negative_count) <= 1 else 0.7
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
class CostEvaluator(BaseEvaluator):
|
|
367
|
+
"""Evaluates cost metrics and efficiency."""
|
|
368
|
+
|
|
369
|
+
def evaluate(self, token_usage: Dict[str, int], model: str, duration: float) -> Dict[str, Any]:
    """Estimate the cost and throughput of a run from its token usage.

    Args:
        token_usage: Token counts; reads "input", "output" and "total".
            "total" falls back to input + output when absent.
        model: Model identifier passed to ``MODEL_COSTS.get_cost_for_model``.
        duration: Run time used as the throughput divisor.

    Returns:
        Dictionary with "estimated_cost", "cost_per_token",
        "tokens_per_second" (0 when ``duration`` is not positive) and
        "cost_efficiency_tier" (from ``get_cost_tier``).
    """
    rate = MODEL_COSTS.get_cost_for_model(model)

    prompt_tokens = token_usage.get("input", 0)
    completion_tokens = token_usage.get("output", 0)
    token_count = token_usage.get("total", prompt_tokens + completion_tokens)

    cost = token_count * rate

    if duration > 0:
        throughput = token_count / duration
    else:
        throughput = 0

    return {
        "estimated_cost": cost,
        "cost_per_token": rate,
        "tokens_per_second": throughput,
        "cost_efficiency_tier": get_cost_tier(cost),
    }
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
class LLMEvaluator(BaseEvaluator):
|
|
388
|
+
"""Evaluates outputs using another LLM as a judge."""
|
|
389
|
+
|
|
390
|
+
def __init__(self, evaluator_model: str = "openai:gpt-4o-mini"):
|
|
391
|
+
"""Initialize the LLM evaluator.
|
|
392
|
+
|
|
393
|
+
Args:
|
|
394
|
+
evaluator_model: Model to use for evaluation (format: provider:model_name)
|
|
395
|
+
"""
|
|
396
|
+
self.evaluator_model = evaluator_model
|
|
397
|
+
|
|
398
|
+
@staticmethod
|
|
399
|
+
def _normalize_score(score: Any) -> float:
|
|
400
|
+
"""Normalize a score to 0-1 range.
|
|
401
|
+
|
|
402
|
+
Args:
|
|
403
|
+
score: Score value (can be string, int, or float)
|
|
404
|
+
|
|
405
|
+
Returns:
|
|
406
|
+
Normalized score between 0.0 and 1.0
|
|
407
|
+
"""
|
|
408
|
+
# Convert to numeric if it's a string
|
|
409
|
+
if isinstance(score, str):
|
|
410
|
+
try:
|
|
411
|
+
score = float(score)
|
|
412
|
+
except ValueError:
|
|
413
|
+
return 0.5 # Default to middle score for non-numeric
|
|
414
|
+
|
|
415
|
+
# Normalize 0-10 range to 0-1
|
|
416
|
+
if isinstance(score, (int, float)) and score > 1:
|
|
417
|
+
score = score / 10.0
|
|
418
|
+
|
|
419
|
+
return max(0.0, min(1.0, float(score)))
|
|
420
|
+
|
|
421
|
+
async def evaluate(
    self,
    output: str,
    task_description: str,
    evaluation_criteria: str,
    expected_format: str = None,
    rubric: Dict[str, Any] = None,
) -> Dict[str, Any]:
    """Ask the judge model to grade ``output`` and return its parsed verdict.

    The judge is run as a temporary agent: its definition is written to a
    ``.md`` temp file, executed through ``run_agent``, and the file is
    removed afterwards.

    Args:
        output: The agent's output to evaluate.
        task_description: Description of what the agent was asked to do.
        evaluation_criteria: Criteria for evaluation (e.g. "accuracy, clarity").
        expected_format: Expected format of the output (optional).
        rubric: Detailed scoring rubric (optional).

    Returns:
        On success: "llm_score", "llm_feedback", "llm_reasoning",
        "criteria_breakdown", "overall_assessment", "evaluator_model" and
        "raw_evaluation". On any failure: the same keys with neutral values
        (no "raw_evaluation"), "overall_assessment" set to "Error" and an
        "error" message.
    """
    try:
        import os
        import tempfile
        from pathlib import Path

        judge_prompt = self._create_evaluation_prompt(
            output, task_description, evaluation_criteria, expected_format, rubric
        )
        agent_definition = self._create_evaluator_agent(self.evaluator_model)

        # delete=False so the file survives the with-block and can be handed
        # to run_agent by path; it is removed in the finally clause below.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as handle:
            handle.write(agent_definition)
            agent_file = handle.name

        try:
            # Imported here to avoid circular imports.
            from ..agent_runner import run_agent

            raw_verdict = run_agent(
                agent_path=Path(agent_file),
                prompt=judge_prompt,
                context={},
                model_override=None,
                debug=False,
            )
            verdict = self._parse_evaluation_result(raw_verdict)

            return {
                "llm_score": verdict.get("score", 0.0),
                "llm_feedback": verdict.get("feedback", ""),
                "llm_reasoning": verdict.get("reasoning", ""),
                "criteria_breakdown": verdict.get("criteria_breakdown", {}),
                "overall_assessment": verdict.get("assessment", ""),
                "evaluator_model": self.evaluator_model,
                "raw_evaluation": raw_verdict,
            }

        finally:
            # Best-effort cleanup: a failed delete must not mask the result.
            try:
                os.unlink(agent_file)
            except Exception:
                pass

    except Exception as exc:
        message = str(exc)
        return {
            "llm_score": 0.0,
            "llm_feedback": f"Evaluation failed: {message}",
            "llm_reasoning": "",
            "criteria_breakdown": {},
            "overall_assessment": "Error",
            "evaluator_model": self.evaluator_model,
            "error": message,
        }
|
|
503
|
+
|
|
504
|
+
def _create_evaluation_prompt(
|
|
505
|
+
self,
|
|
506
|
+
output: str,
|
|
507
|
+
task_description: str,
|
|
508
|
+
evaluation_criteria: str,
|
|
509
|
+
expected_format: str = None,
|
|
510
|
+
rubric: Dict[str, Any] = None,
|
|
511
|
+
) -> str:
|
|
512
|
+
"""Create the evaluation prompt for the LLM judge."""
|
|
513
|
+
|
|
514
|
+
prompt = f"""You are an expert evaluator tasked with assessing an AI agent's performance.
|
|
515
|
+
|
|
516
|
+
## Task Description
|
|
517
|
+
The agent was asked to: {task_description}
|
|
518
|
+
|
|
519
|
+
## Evaluation Criteria
|
|
520
|
+
Evaluate the output based on: {evaluation_criteria}
|
|
521
|
+
|
|
522
|
+
## Agent's Output
|
|
523
|
+
{output}
|
|
524
|
+
|
|
525
|
+
## Instructions
|
|
526
|
+
1. Carefully analyze the agent's output against the task requirements
|
|
527
|
+
2. Rate each criterion on a scale of 0-10 (0 = completely fails, 10 = exceeds expectations)
|
|
528
|
+
3. Provide constructive feedback explaining your scoring
|
|
529
|
+
4. Give an overall assessment and final score
|
|
530
|
+
|
|
531
|
+
"""
|
|
532
|
+
|
|
533
|
+
if expected_format:
|
|
534
|
+
prompt += f"\n## Expected Format\nThe output should follow this format: {expected_format}\n"
|
|
535
|
+
|
|
536
|
+
if rubric:
|
|
537
|
+
prompt += "\n## Detailed Rubric\n"
|
|
538
|
+
for criterion, details in rubric.items():
|
|
539
|
+
prompt += f"**{criterion}**: {details}\n"
|
|
540
|
+
|
|
541
|
+
prompt += """
|
|
542
|
+
## Required Response Format
|
|
543
|
+
Please respond with a JSON object containing:
|
|
544
|
+
{
|
|
545
|
+
"score": <overall_score_0_to_10>,
|
|
546
|
+
"feedback": "<detailed_feedback>",
|
|
547
|
+
"reasoning": "<explanation_of_scoring>",
|
|
548
|
+
"criteria_breakdown": {
|
|
549
|
+
"<criterion1>": <score_0_to_10>,
|
|
550
|
+
"<criterion2>": <score_0_to_10>
|
|
551
|
+
},
|
|
552
|
+
"assessment": "<overall_quality_assessment>"
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
Provide thorough, constructive feedback that would help improve the agent's performance."""
|
|
556
|
+
|
|
557
|
+
return prompt
|
|
558
|
+
|
|
559
|
+
def _create_evaluator_agent(self, model: str) -> str:
|
|
560
|
+
"""Create a temporary evaluator agent."""
|
|
561
|
+
return f"""---
|
|
562
|
+
name: llm_evaluator
|
|
563
|
+
model: {model}
|
|
564
|
+
max_turns: 3
|
|
565
|
+
tools: []
|
|
566
|
+
---
|
|
567
|
+
|
|
568
|
+
# LLM Evaluator Agent
|
|
569
|
+
|
|
570
|
+
You are an expert AI evaluator with deep knowledge of AI systems, natural language processing, and task completion assessment.
|
|
571
|
+
|
|
572
|
+
Your role is to provide fair, objective, and constructive evaluation of AI agent outputs.
|
|
573
|
+
|
|
574
|
+
## Evaluation Principles
|
|
575
|
+
- **Accuracy**: Does the output correctly address the task?
|
|
576
|
+
- **Completeness**: Are all requirements fulfilled?
|
|
577
|
+
- **Clarity**: Is the output clear and well-structured?
|
|
578
|
+
- **Relevance**: Does the output stay on topic and address the request?
|
|
579
|
+
- **Quality**: Is the output of high quality with attention to detail?
|
|
580
|
+
|
|
581
|
+
## Task
|
|
582
|
+
{{{{ user_prompt }}}}
|
|
583
|
+
|
|
584
|
+
## Instructions
|
|
585
|
+
Analyze the provided output carefully and return a properly formatted JSON response with scores and detailed feedback.
|
|
586
|
+
"""
|
|
587
|
+
|
|
588
|
+
def _parse_evaluation_result(self, evaluation_result: str) -> Dict[str, Any]:
|
|
589
|
+
"""Parse the LLM evaluation result."""
|
|
590
|
+
try:
|
|
591
|
+
# Try to extract JSON from the result
|
|
592
|
+
import re
|
|
593
|
+
|
|
594
|
+
# Look for JSON block in the response
|
|
595
|
+
json_match = re.search(r"```json\s*(\{.*?\})\s*```", evaluation_result, re.DOTALL)
|
|
596
|
+
if json_match:
|
|
597
|
+
json_str = json_match.group(1)
|
|
598
|
+
else:
|
|
599
|
+
# Look for JSON object directly with proper handling of nested braces
|
|
600
|
+
json_match = re.search(r"\{(?:[^{}]|{[^}]*})*\}", evaluation_result, re.DOTALL)
|
|
601
|
+
if json_match and '"score"' in json_match.group(0):
|
|
602
|
+
json_str = json_match.group(0)
|
|
603
|
+
else:
|
|
604
|
+
# Try to find any JSON-like structure
|
|
605
|
+
json_str = evaluation_result.strip()
|
|
606
|
+
|
|
607
|
+
# Parse the JSON
|
|
608
|
+
result = json.loads(json_str)
|
|
609
|
+
|
|
610
|
+
# Normalize score to 0-1 range if it's 0-10
|
|
611
|
+
result["score"] = self._normalize_score(result.get("score", 0))
|
|
612
|
+
|
|
613
|
+
# Normalize criteria breakdown scores
|
|
614
|
+
if "criteria_breakdown" in result:
|
|
615
|
+
normalized_breakdown = {}
|
|
616
|
+
for criterion, score in result["criteria_breakdown"].items():
|
|
617
|
+
normalized_breakdown[criterion] = self._normalize_score(score)
|
|
618
|
+
result["criteria_breakdown"] = normalized_breakdown
|
|
619
|
+
|
|
620
|
+
return result
|
|
621
|
+
|
|
622
|
+
except (json.JSONDecodeError, AttributeError):
|
|
623
|
+
# Fallback parsing if JSON parsing fails
|
|
624
|
+
return self._fallback_parse(evaluation_result)
|
|
625
|
+
|
|
626
|
+
def _fallback_parse(self, evaluation_result: str) -> Dict[str, Any]:
|
|
627
|
+
"""Fallback parsing when JSON extraction fails."""
|
|
628
|
+
import re
|
|
629
|
+
|
|
630
|
+
# Try to extract score using multiple patterns (check in order of specificity)
|
|
631
|
+
score = 0.5 # Default middle score
|
|
632
|
+
|
|
633
|
+
# Pattern 1: Percentage (75%, 85%) - check first since it's most specific
|
|
634
|
+
if re.search(r"(\d+(?:\.\d+)?)\s*%", evaluation_result):
|
|
635
|
+
pct_match = re.search(r"(\d+(?:\.\d+)?)\s*%", evaluation_result)
|
|
636
|
+
score = float(pct_match.group(1)) / 100.0
|
|
637
|
+
|
|
638
|
+
# Pattern 2: "score/rate X out of Y"
|
|
639
|
+
elif re.search(r"(?:score|rating|rate).*?(\d+(?:\.\d+)?)\s*out\s*of\s*(\d+)", evaluation_result, re.IGNORECASE):
|
|
640
|
+
score_match = re.search(
|
|
641
|
+
r"(?:score|rating|rate).*?(\d+(?:\.\d+)?)\s*out\s*of\s*(\d+)", evaluation_result, re.IGNORECASE
|
|
642
|
+
)
|
|
643
|
+
extracted_score = float(score_match.group(1))
|
|
644
|
+
max_score = float(score_match.group(2))
|
|
645
|
+
score = extracted_score / max_score
|
|
646
|
+
|
|
647
|
+
# Pattern 3: "score/rate X" without "out of"
|
|
648
|
+
elif re.search(r"(?:score|rating|rate).*?(\d+(?:\.\d+)?)", evaluation_result, re.IGNORECASE):
|
|
649
|
+
score_match = re.search(r"(?:score|rating|rate).*?(\d+(?:\.\d+)?)", evaluation_result, re.IGNORECASE)
|
|
650
|
+
extracted_score = float(score_match.group(1))
|
|
651
|
+
if extracted_score > 1:
|
|
652
|
+
score = extracted_score / 10.0
|
|
653
|
+
else:
|
|
654
|
+
score = extracted_score
|
|
655
|
+
|
|
656
|
+
# Pattern 4: Just a number followed by descriptive text
|
|
657
|
+
elif re.search(r"\b(\d+(?:\.\d+)?)\b", evaluation_result):
|
|
658
|
+
num_match = re.search(r"\b(\d+(?:\.\d+)?)\b", evaluation_result)
|
|
659
|
+
extracted_score = float(num_match.group(1))
|
|
660
|
+
if extracted_score > 1:
|
|
661
|
+
score = extracted_score / 10.0
|
|
662
|
+
else:
|
|
663
|
+
score = extracted_score
|
|
664
|
+
|
|
665
|
+
return {
|
|
666
|
+
"score": max(0.0, min(1.0, score)),
|
|
667
|
+
"feedback": evaluation_result[:500] + "..." if len(evaluation_result) > 500 else evaluation_result,
|
|
668
|
+
"reasoning": "Fallback parsing - JSON extraction failed",
|
|
669
|
+
"criteria_breakdown": {},
|
|
670
|
+
"assessment": "Evaluation completed with fallback parsing",
|
|
671
|
+
}
|