tsugite-cli 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. tsugite/__init__.py +6 -0
  2. tsugite/agent_composition.py +163 -0
  3. tsugite/agent_inheritance.py +479 -0
  4. tsugite/agent_preparation.py +236 -0
  5. tsugite/agent_runner/__init__.py +45 -0
  6. tsugite/agent_runner/helpers.py +106 -0
  7. tsugite/agent_runner/history_integration.py +248 -0
  8. tsugite/agent_runner/metrics.py +100 -0
  9. tsugite/agent_runner/runner.py +1879 -0
  10. tsugite/agent_runner/validation.py +70 -0
  11. tsugite/agent_utils.py +167 -0
  12. tsugite/attachments/__init__.py +65 -0
  13. tsugite/attachments/auto_context.py +199 -0
  14. tsugite/attachments/base.py +34 -0
  15. tsugite/attachments/file.py +51 -0
  16. tsugite/attachments/inline.py +31 -0
  17. tsugite/attachments/storage.py +178 -0
  18. tsugite/attachments/url.py +59 -0
  19. tsugite/attachments/youtube.py +101 -0
  20. tsugite/benchmark/__init__.py +62 -0
  21. tsugite/benchmark/config.py +183 -0
  22. tsugite/benchmark/core.py +292 -0
  23. tsugite/benchmark/discovery.py +377 -0
  24. tsugite/benchmark/evaluators.py +671 -0
  25. tsugite/benchmark/execution.py +657 -0
  26. tsugite/benchmark/metrics.py +204 -0
  27. tsugite/benchmark/reports.py +420 -0
  28. tsugite/benchmark/utils.py +288 -0
  29. tsugite/builtin_agents/chat-assistant.md +53 -0
  30. tsugite/builtin_agents/default.md +140 -0
  31. tsugite/builtin_agents.py +5 -0
  32. tsugite/cache.py +195 -0
  33. tsugite/cli/__init__.py +1042 -0
  34. tsugite/cli/agents.py +148 -0
  35. tsugite/cli/attachments.py +193 -0
  36. tsugite/cli/benchmark.py +663 -0
  37. tsugite/cli/cache.py +113 -0
  38. tsugite/cli/config.py +272 -0
  39. tsugite/cli/helpers.py +534 -0
  40. tsugite/cli/history.py +193 -0
  41. tsugite/cli/init.py +387 -0
  42. tsugite/cli/mcp.py +193 -0
  43. tsugite/cli/tools.py +419 -0
  44. tsugite/config.py +204 -0
  45. tsugite/console.py +48 -0
  46. tsugite/constants.py +21 -0
  47. tsugite/core/__init__.py +19 -0
  48. tsugite/core/agent.py +774 -0
  49. tsugite/core/executor.py +300 -0
  50. tsugite/core/memory.py +67 -0
  51. tsugite/core/tools.py +271 -0
  52. tsugite/docker_cli.py +270 -0
  53. tsugite/events/__init__.py +55 -0
  54. tsugite/events/base.py +46 -0
  55. tsugite/events/bus.py +62 -0
  56. tsugite/events/events.py +224 -0
  57. tsugite/exceptions.py +40 -0
  58. tsugite/history/__init__.py +29 -0
  59. tsugite/history/index.py +210 -0
  60. tsugite/history/models.py +106 -0
  61. tsugite/history/storage.py +157 -0
  62. tsugite/mcp_client.py +219 -0
  63. tsugite/mcp_config.py +174 -0
  64. tsugite/md_agents.py +751 -0
  65. tsugite/models.py +257 -0
  66. tsugite/renderer.py +151 -0
  67. tsugite/shell_tool_config.py +265 -0
  68. tsugite/templates/assistant.md +14 -0
  69. tsugite/tools/__init__.py +265 -0
  70. tsugite/tools/agents.py +312 -0
  71. tsugite/tools/edit_strategies.py +393 -0
  72. tsugite/tools/fs.py +329 -0
  73. tsugite/tools/http.py +239 -0
  74. tsugite/tools/interactive.py +430 -0
  75. tsugite/tools/shell.py +129 -0
  76. tsugite/tools/shell_tools.py +214 -0
  77. tsugite/tools/tasks.py +339 -0
  78. tsugite/tsugite.py +7 -0
  79. tsugite/ui/__init__.py +46 -0
  80. tsugite/ui/base.py +638 -0
  81. tsugite/ui/chat.py +265 -0
  82. tsugite/ui/chat.tcss +92 -0
  83. tsugite/ui/chat_history.py +286 -0
  84. tsugite/ui/helpers.py +102 -0
  85. tsugite/ui/jsonl.py +125 -0
  86. tsugite/ui/live_template.py +529 -0
  87. tsugite/ui/plain.py +419 -0
  88. tsugite/ui/textual_chat.py +642 -0
  89. tsugite/ui/textual_handler.py +225 -0
  90. tsugite/ui/widgets/__init__.py +6 -0
  91. tsugite/ui/widgets/base_scroll_log.py +27 -0
  92. tsugite/ui/widgets/message_list.py +121 -0
  93. tsugite/ui/widgets/thought_log.py +80 -0
  94. tsugite/ui_context.py +90 -0
  95. tsugite/utils.py +367 -0
  96. tsugite/xdg.py +104 -0
  97. tsugite_cli-0.3.3.dist-info/METADATA +325 -0
  98. tsugite_cli-0.3.3.dist-info/RECORD +101 -0
  99. tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
  100. tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
  101. tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
@@ -0,0 +1,671 @@
1
+ """Evaluators for different aspects of benchmark test results."""
2
+
3
+ import difflib
4
+ import json
5
+ import re
6
+ from abc import ABC, abstractmethod
7
+ from typing import Any, Dict, Optional
8
+
9
+ from .config import (
10
+ MODEL_COSTS,
11
+ SIMILARITY_THRESHOLDS,
12
+ get_cost_tier,
13
+ )
14
+ from .utils import json_similarity, normalize_code
15
+
16
+
17
class BaseEvaluator(ABC):
    """Abstract parent that the evaluator classes derive from."""

    @abstractmethod
    def evaluate(self, **kwargs) -> Dict[str, Any]:
        """Assess the supplied inputs and report the outcome as a dictionary.

        Concrete subclasses must override this method; the base
        implementation does nothing.
        """
        ...
24
+
25
+
26
class CorrectnessEvaluator(BaseEvaluator):
    """Evaluates correctness of outputs against expected results.

    The comparison strategy is selected by ``output_type``: ``"string"``,
    ``"json"``, ``"code"`` or ``"number"``. Any other value is compared as a
    string.
    """

    # Signed integers and decimals, leading-dot decimals (".5") and an optional
    # exponent ("1e-3"). Only non-capturing groups are used so that findall()
    # returns the full matched text.
    _NUMBER_PATTERN = re.compile(r"-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")

    def evaluate(self, output: str, expected: str, output_type: str = "string") -> Dict[str, Any]:
        """Evaluate correctness based on output type.

        Args:
            output: The produced output.
            expected: The reference value to compare against.
            output_type: Comparison strategy; unknown values fall back to
                string comparison.

        Returns:
            A dict with ``passed``, ``score``, ``similarity``, ``exact_match``
            and ``error``. Type-specific evaluators may add keys (the numeric
            one adds ``relative_error``). If the selected evaluator raises,
            the defaults are returned with ``error`` set to the exception text.
        """
        result: Dict[str, Any] = {
            "passed": False,
            "score": 0.0,
            "similarity": 0.0,
            "exact_match": False,
            "error": None,
        }

        handlers = {
            "string": self._evaluate_string,
            "json": self._evaluate_json,
            "code": self._evaluate_code,
            "number": self._evaluate_number,
        }
        handler = handlers.get(output_type, self._evaluate_string)

        try:
            result.update(handler(output, expected))
        except Exception as e:
            # Boundary: an evaluator failure is reported in the result, not raised.
            result["error"] = str(e)

        return result

    @staticmethod
    def _failed(error: str) -> Dict[str, Any]:
        """Build the zero-score result shared by all failure paths."""
        return {
            "passed": False,
            "score": 0.0,
            "similarity": 0.0,
            "exact_match": False,
            "error": error,
        }

    def _evaluate_string(self, output: str, expected: str) -> Dict[str, Any]:
        """Evaluate string output.

        Exact match is tested on the stripped strings; similarity is a
        case-insensitive ``difflib.SequenceMatcher`` ratio.
        """
        output_clean = output.strip()
        expected_clean = expected.strip()

        exact_match = output_clean == expected_clean
        similarity = difflib.SequenceMatcher(None, output_clean.lower(), expected_clean.lower()).ratio()

        # Consider it passed if exact match or very high similarity
        passed = exact_match or similarity >= SIMILARITY_THRESHOLDS.string_high_similarity

        return {
            "passed": passed,
            "score": 1.0 if exact_match else similarity,
            "similarity": similarity,
            "exact_match": exact_match,
        }

    def _evaluate_json(self, output: str, expected: str) -> Dict[str, Any]:
        """Evaluate JSON output.

        Both inputs are parsed with ``json.loads``; a parse failure on either
        side yields a failed result carrying the decode error.
        """
        try:
            output_json = json.loads(output.strip())
            expected_json = json.loads(expected.strip())
        except json.JSONDecodeError as e:
            return self._failed(f"JSON decode error: {e}")

        exact_match = output_json == expected_json
        score = 1.0 if exact_match else json_similarity(output_json, expected_json)

        return {
            "passed": exact_match or score >= SIMILARITY_THRESHOLDS.json_similarity,
            "score": score,
            "similarity": score,
            "exact_match": exact_match,
        }

    def _evaluate_code(self, output: str, expected: str) -> Dict[str, Any]:
        """Evaluate code output (simplified).

        Both sides are passed through ``normalize_code`` and then compared
        textually.
        """
        output_normalized = normalize_code(output)
        expected_normalized = normalize_code(expected)

        exact_match = output_normalized == expected_normalized
        similarity = difflib.SequenceMatcher(None, output_normalized, expected_normalized).ratio()

        return {
            "passed": exact_match or similarity >= SIMILARITY_THRESHOLDS.code_similarity,
            "score": 1.0 if exact_match else similarity,
            "similarity": similarity,
            "exact_match": exact_match,
        }

    def _evaluate_number(self, output: str, expected: str) -> Dict[str, Any]:
        """Evaluate numeric output.

        The first number found in each string is compared. The result passes
        when the values are equal within 1e-10 or the relative error is below
        1%.
        """
        try:
            # Extract numbers from strings
            output_nums = self._NUMBER_PATTERN.findall(output.strip())
            expected_nums = self._NUMBER_PATTERN.findall(expected.strip())

            if not output_nums or not expected_nums:
                return self._failed("No numbers found")

            output_val = float(output_nums[0])
            expected_val = float(expected_nums[0])

            difference = abs(output_val - expected_val)
            exact_match = difference < 1e-10
            # The 1e-10 floor avoids dividing by zero when the expected value is 0.
            relative_error = difference / max(abs(expected_val), 1e-10)
            score = max(0.0, 1.0 - relative_error)

            return {
                "passed": exact_match or relative_error < 0.01,
                "score": score,
                "similarity": score,
                "exact_match": exact_match,
                "relative_error": relative_error,
            }

        except (ValueError, IndexError) as e:
            return self._failed(f"Number evaluation error: {e}")
154
+
155
+
156
class PerformanceEvaluator(BaseEvaluator):
    """Evaluates performance metrics like speed and efficiency."""

    # (inclusive upper bound on duration/timeout, tier label), checked in order.
    _TIER_BOUNDS = (
        (0.1, "Excellent"),
        (0.3, "Good"),
        (0.6, "Fair"),
        (1.0, "Poor"),
    )

    def evaluate(self, duration: float, timeout: float, baseline_duration: Optional[float] = None) -> Dict[str, Any]:
        """Evaluate performance based on duration.

        Args:
            duration: Measured run time.
            timeout: Time limit in the same unit. A value <= 0 is treated as
                "no limit": the run is never reported as timed out and the
                duration/timeout ratio is taken as 0.
            baseline_duration: Optional reference duration; when truthy, the
                comparison keys described below are added.

        Returns:
            A dict with ``duration``, ``timeout``, ``timed_out``,
            ``speed_score`` (0.0-1.0) and ``efficiency_tier``. With a baseline
            it also contains ``improvement_over_baseline`` and
            ``relative_speed``.
        """
        has_timeout = timeout > 0
        time_ratio = duration / timeout if has_timeout else 0

        if has_timeout:
            # Score based on how much of the timeout was used
            speed_score = max(0.0, 1.0 - min(time_ratio, 1.0))

            # Reward significantly faster executions with a small bonus
            if time_ratio <= 0.1:
                speed_score = min(1.0, speed_score + 0.05)
            elif time_ratio <= 0.3:
                speed_score = min(1.0, speed_score + 0.02)
        else:
            speed_score = 1.0 if duration < 1.0 else max(0.0, 1.0 / duration)

        # Efficiency tiers (based on performance thresholds); anything above
        # the last bound is a timeout.
        tier = next((label for bound, label in self._TIER_BOUNDS if time_ratio <= bound), "Timeout")

        result: Dict[str, Any] = {
            "duration": duration,
            "timeout": timeout,
            # Without a positive limit there is nothing to exceed.
            "timed_out": has_timeout and duration >= timeout,
            "speed_score": speed_score,
            "efficiency_tier": tier,
        }

        # Compare to baseline if provided (a baseline of 0 is skipped, which
        # also avoids dividing by zero).
        if baseline_duration:
            improvement = max(0.0, (baseline_duration - duration) / baseline_duration)
            result["improvement_over_baseline"] = improvement
            result["relative_speed"] = baseline_duration / duration if duration > 0 else float("inf")

        return result
207
+
208
+
209
class QualityEvaluator(BaseEvaluator):
    """Evaluates quality of outputs using criteria-based assessment."""

    # (inclusive lower bound on the score, quality label), checked in order.
    _QUALITY_BOUNDS = (
        (0.9, "Excellent"),
        (0.75, "Good"),
        (0.6, "Fair"),
        (0.4, "Poor"),
    )

    _POSITIVE_WORDS = (
        "good",
        "great",
        "excellent",
        "wonderful",
        "amazing",
        "positive",
        "success",
    )
    _NEGATIVE_WORDS = (
        "bad",
        "terrible",
        "awful",
        "horrible",
        "negative",
        "failure",
        "error",
    )

    async def evaluate(self, output: str, criteria: Dict[str, Any]) -> Dict[str, Any]:
        """Evaluate output quality based on criteria.

        Args:
            output: Text to assess.
            criteria: Mapping of criterion name to its config dict. Each
                config may carry ``weight`` (default 1.0) and ``type``
                (default ``"keyword"``) plus type-specific settings.

        Returns:
            A dict with ``score`` (weighted mean of the criterion scores, 0.0
            when the total weight is not positive), ``criteria_scores``,
            ``overall_quality`` and ``feedback`` (an empty list).
        """
        result: Dict[str, Any] = {
            "score": 0.0,
            "criteria_scores": {},
            "overall_quality": "unknown",
            "feedback": [],
        }

        total_score = 0.0
        total_weight = 0.0

        # Evaluate each criterion
        for criterion, config in criteria.items():
            weight = config.get("weight", 1.0)
            criterion_score = await self._evaluate_criterion(output, criterion, config)

            result["criteria_scores"][criterion] = criterion_score
            total_score += criterion_score * weight
            total_weight += weight

        # Calculate overall score
        result["score"] = total_score / total_weight if total_weight > 0 else 0.0

        # Determine quality tier
        score = result["score"]
        result["overall_quality"] = next(
            (label for bound, label in self._QUALITY_BOUNDS if score >= bound),
            "Very Poor",
        )

        return result

    async def _evaluate_criterion(self, output: str, criterion: str, config: Dict[str, Any]) -> float:
        """Evaluate a specific quality criterion.

        Dispatches on ``config["type"]``; unknown types use the keyword check.
        """
        handlers = {
            "keyword": self._evaluate_keyword_presence,
            "length": self._evaluate_length,
            "format": self._evaluate_format,
            "sentiment": self._evaluate_sentiment,
        }
        criterion_type = config.get("type", "keyword")
        # Default to simple keyword check
        handler = handlers.get(criterion_type, self._evaluate_keyword_presence)
        return handler(output, config)

    def _evaluate_keyword_presence(self, output: str, config: Dict[str, Any]) -> float:
        """Return the fraction of ``config["keywords"]`` found in the output.

        Matching is a case-insensitive substring test. With no keywords
        configured the score is 1.0.
        """
        required_keywords = config.get("keywords", [])
        if not required_keywords:
            return 1.0

        output_lower = output.lower()
        found_keywords = sum(1 for keyword in required_keywords if keyword.lower() in output_lower)

        return found_keywords / len(required_keywords)

    def _evaluate_length(self, output: str, config: Dict[str, Any]) -> float:
        """Score the stripped output length against configured bounds.

        Reads ``min_length`` (default 0), ``max_length`` (default infinity)
        and ``optimal_length`` (optional) from ``config``. Below the minimum
        the score is 0.0; above the maximum it decays linearly with the
        overshoot; otherwise it decays linearly with the distance from the
        optimal length, or is 1.0 when no positive optimal length is set.
        """
        min_length = config.get("min_length", 0)
        max_length = config.get("max_length", float("inf"))
        optimal_length = config.get("optimal_length")

        length = len(output.strip())

        if length < min_length:
            return 0.0

        if length > max_length:
            # A non-positive maximum cannot be used as a divisor, and any
            # content exceeds it, so the score is the lower bound.
            if max_length <= 0:
                return 0.0
            return max(0.0, 1.0 - (length - max_length) / max_length)

        if optimal_length and optimal_length > 0:
            # Score based on distance from optimal
            distance = abs(length - optimal_length)
            return max(0.0, 1.0 - distance / optimal_length)

        return 1.0

    def _evaluate_format(self, output: str, config: Dict[str, Any]) -> float:
        """Evaluate based on format requirements.

        ``config["format"]`` selects the check: ``"json"`` (1.0 if the output
        parses, else 0.0), ``"code"`` or ``"markdown"`` (1.0 if typical
        markers are present, else 0.5). Any other format scores 1.0.
        """
        format_type = config.get("format", "text")

        if format_type == "json":
            try:
                json.loads(output.strip())
            except json.JSONDecodeError:
                return 0.0
            return 1.0

        if format_type == "code":
            # Simple check for code-like structure
            has_keywords = any(
                keyword in output for keyword in ["def ", "class ", "import ", "function", "var ", "let "]
            )
            has_structure = any(char in output for char in ["{", "}", "(", ")", "[", "]"])
            return 1.0 if has_keywords or has_structure else 0.5

        if format_type == "markdown":
            # Check for markdown elements
            has_headers = bool(re.search(r"^#{1,6}\s", output, re.MULTILINE))
            has_formatting = bool(re.search(r"\*\*.*\*\*|\*.*\*|`.*`", output))
            return 1.0 if has_headers or has_formatting else 0.5

        return 1.0  # Default to passing for unknown formats

    def _evaluate_sentiment(self, output: str, config: Dict[str, Any]) -> float:
        """Evaluate sentiment (simplified implementation).

        Counts how many entries of two fixed word lists occur as substrings
        of the lower-cased output and compares the counts against
        ``config["sentiment"]`` (``"positive"``, ``"negative"``, otherwise
        neutral).
        """
        expected_sentiment = config.get("sentiment", "neutral")

        output_lower = output.lower()
        positive_count = sum(1 for word in self._POSITIVE_WORDS if word in output_lower)
        negative_count = sum(1 for word in self._NEGATIVE_WORDS if word in output_lower)

        if expected_sentiment == "positive":
            return 1.0 if positive_count > negative_count else 0.5
        if expected_sentiment == "negative":
            return 1.0 if negative_count > positive_count else 0.5
        # neutral
        return 1.0 if abs(positive_count - negative_count) <= 1 else 0.7
364
+
365
+
366
class CostEvaluator(BaseEvaluator):
    """Derives cost figures and throughput from token usage."""

    def evaluate(self, token_usage: Dict[str, int], model: str, duration: float) -> Dict[str, Any]:
        """Compute cost-related metrics for one run.

        Args:
            token_usage: Token counts; the keys ``input``, ``output`` and
                ``total`` are read. A missing ``total`` is taken as
                input + output.
            model: Model identifier handed to ``MODEL_COSTS.get_cost_for_model``.
            duration: Run time used as the divisor for throughput.

        Returns:
            A dict with ``estimated_cost``, ``cost_per_token``,
            ``tokens_per_second`` (0 when ``duration`` is not positive) and
            ``cost_efficiency_tier``.
        """
        per_token = MODEL_COSTS.get_cost_for_model(model)

        prompt_tokens = token_usage.get("input", 0)
        completion_tokens = token_usage.get("output", 0)
        token_count = token_usage.get("total", prompt_tokens + completion_tokens)

        cost = token_count * per_token

        if duration > 0:
            throughput = token_count / duration
        else:
            throughput = 0

        metrics: Dict[str, Any] = {}
        metrics["estimated_cost"] = cost
        metrics["cost_per_token"] = per_token
        metrics["tokens_per_second"] = throughput
        metrics["cost_efficiency_tier"] = get_cost_tier(cost)
        return metrics
385
+
386
+
387
class LLMEvaluator(BaseEvaluator):
    """Evaluates outputs using another LLM as a judge."""

    def __init__(self, evaluator_model: str = "openai:gpt-4o-mini"):
        """Initialize the LLM evaluator.

        Args:
            evaluator_model: Model to use for evaluation (format: provider:model_name)
        """
        self.evaluator_model = evaluator_model

    @staticmethod
    def _normalize_score(score: Any) -> float:
        """Normalize a score to 0-1 range.

        Args:
            score: Score value (can be string, int, or float)

        Returns:
            Normalized score between 0.0 and 1.0. Values above 1 are read as
            being on a 0-10 scale. Anything that cannot be converted to a
            finite number (``None``, free text, NaN, infinity) yields the
            neutral 0.5.
        """
        import math

        try:
            value = float(score)
        except (TypeError, ValueError, OverflowError):
            return 0.5  # Default to middle score for non-numeric

        if not math.isfinite(value):
            return 0.5

        # Normalize 0-10 range to 0-1
        if value > 1:
            value /= 10.0

        return max(0.0, min(1.0, value))

    async def evaluate(
        self,
        output: str,
        task_description: str,
        evaluation_criteria: str,
        expected_format: Optional[str] = None,
        rubric: Optional[Dict[str, Any]] = None,
    ) -> Dict[str, Any]:
        """Evaluate output using an LLM judge.

        A temporary agent definition is written to disk, run through
        ``run_agent`` with the evaluation prompt, and removed afterwards.

        Args:
            output: The agent's output to evaluate
            task_description: Description of what the agent was asked to do
            evaluation_criteria: Criteria for evaluation (e.g., "accuracy, clarity, completeness")
            expected_format: Expected format of the output (optional)
            rubric: Detailed scoring rubric (optional)

        Returns:
            Dictionary with evaluation results. Failures are not raised:
            they are reported with ``llm_score`` 0.0 and an ``error`` key.
        """
        import os
        import tempfile
        from pathlib import Path

        temp_agent_path = None
        try:
            # Create evaluation prompt
            evaluation_prompt = self._create_evaluation_prompt(
                output, task_description, evaluation_criteria, expected_format, rubric
            )

            # Create a temporary evaluator agent
            evaluator_agent_content = self._create_evaluator_agent(self.evaluator_model)

            # Write to temporary file. The path is recorded before writing so
            # the file is still removed if the write itself fails.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as f:
                temp_agent_path = f.name
                f.write(evaluator_agent_content)

            # Import here to avoid circular imports
            from ..agent_runner import run_agent

            # Run the evaluator agent
            evaluation_result = run_agent(
                agent_path=Path(temp_agent_path),
                prompt=evaluation_prompt,
                context={},
                model_override=None,
                debug=False,
            )

            # Parse the evaluation result
            parsed_result = self._parse_evaluation_result(evaluation_result)

            return {
                "llm_score": parsed_result.get("score", 0.0),
                "llm_feedback": parsed_result.get("feedback", ""),
                "llm_reasoning": parsed_result.get("reasoning", ""),
                "criteria_breakdown": parsed_result.get("criteria_breakdown", {}),
                "overall_assessment": parsed_result.get("assessment", ""),
                "evaluator_model": self.evaluator_model,
                "raw_evaluation": evaluation_result,
            }

        except Exception as e:
            # Boundary: any failure is reported in the result instead of raised.
            return {
                "llm_score": 0.0,
                "llm_feedback": f"Evaluation failed: {str(e)}",
                "llm_reasoning": "",
                "criteria_breakdown": {},
                "overall_assessment": "Error",
                "evaluator_model": self.evaluator_model,
                "error": str(e),
            }

        finally:
            # Clean up temporary file (best effort)
            if temp_agent_path is not None:
                try:
                    os.unlink(temp_agent_path)
                except OSError:
                    pass

    def _create_evaluation_prompt(
        self,
        output: str,
        task_description: str,
        evaluation_criteria: str,
        expected_format: Optional[str] = None,
        rubric: Optional[Dict[str, Any]] = None,
    ) -> str:
        """Create the evaluation prompt for the LLM judge.

        The prompt contains the task, the criteria, the output under review,
        scoring instructions, the optional format and rubric sections, and
        the JSON response format the judge is asked to follow.
        """
        # NOTE(review): the indentation inside the JSON sample below was
        # reconstructed — confirm against the upstream source.
        prompt = f"""You are an expert evaluator tasked with assessing an AI agent's performance.

## Task Description
The agent was asked to: {task_description}

## Evaluation Criteria
Evaluate the output based on: {evaluation_criteria}

## Agent's Output
{output}

## Instructions
1. Carefully analyze the agent's output against the task requirements
2. Rate each criterion on a scale of 0-10 (0 = completely fails, 10 = exceeds expectations)
3. Provide constructive feedback explaining your scoring
4. Give an overall assessment and final score

"""

        if expected_format:
            prompt += f"\n## Expected Format\nThe output should follow this format: {expected_format}\n"

        if rubric:
            prompt += "\n## Detailed Rubric\n"
            for criterion, details in rubric.items():
                prompt += f"**{criterion}**: {details}\n"

        prompt += """
## Required Response Format
Please respond with a JSON object containing:
{
    "score": <overall_score_0_to_10>,
    "feedback": "<detailed_feedback>",
    "reasoning": "<explanation_of_scoring>",
    "criteria_breakdown": {
        "<criterion1>": <score_0_to_10>,
        "<criterion2>": <score_0_to_10>
    },
    "assessment": "<overall_quality_assessment>"
}

Provide thorough, constructive feedback that would help improve the agent's performance."""

        return prompt

    def _create_evaluator_agent(self, model: str) -> str:
        """Create a temporary evaluator agent.

        Returns the text of an agent definition with a front-matter header
        naming ``model``. The quadruple braces render as a literal
        ``{{ user_prompt }}`` placeholder in the returned text.
        """
        return f"""---
name: llm_evaluator
model: {model}
max_turns: 3
tools: []
---

# LLM Evaluator Agent

You are an expert AI evaluator with deep knowledge of AI systems, natural language processing, and task completion assessment.

Your role is to provide fair, objective, and constructive evaluation of AI agent outputs.

## Evaluation Principles
- **Accuracy**: Does the output correctly address the task?
- **Completeness**: Are all requirements fulfilled?
- **Clarity**: Is the output clear and well-structured?
- **Relevance**: Does the output stay on topic and address the request?
- **Quality**: Is the output of high quality with attention to detail?

## Task
{{{{ user_prompt }}}}

## Instructions
Analyze the provided output carefully and return a properly formatted JSON response with scores and detailed feedback.
"""

    def _parse_evaluation_result(self, evaluation_result: str) -> Dict[str, Any]:
        """Parse the LLM evaluation result.

        Looks for a JSON object in a fenced ``json`` block, then inline in the
        text, then treats the whole response as JSON. Scores are normalized
        to 0-1. If no JSON object can be decoded, the regex-based fallback
        parser is used.
        """
        try:
            # Look for JSON block in the response
            json_match = re.search(r"```json\s*(\{.*?\})\s*```", evaluation_result, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Look for JSON object directly (handles one level of nested braces)
                json_match = re.search(r"\{(?:[^{}]|{[^}]*})*\}", evaluation_result, re.DOTALL)
                if json_match and '"score"' in json_match.group(0):
                    json_str = json_match.group(0)
                else:
                    # Try to find any JSON-like structure
                    json_str = evaluation_result.strip()

            # Parse the JSON
            result = json.loads(json_str)
        except (json.JSONDecodeError, AttributeError):
            # Fallback parsing if JSON parsing fails
            return self._fallback_parse(evaluation_result)

        if not isinstance(result, dict):
            # Valid JSON but not an object (e.g. a bare number or list).
            return self._fallback_parse(evaluation_result)

        # Normalize score to 0-1 range if it's 0-10
        result["score"] = self._normalize_score(result.get("score", 0))

        # Normalize criteria breakdown scores. A malformed breakdown is
        # replaced by an empty one so the decoded overall score is kept.
        if "criteria_breakdown" in result:
            breakdown = result["criteria_breakdown"]
            if isinstance(breakdown, dict):
                result["criteria_breakdown"] = {
                    criterion: self._normalize_score(score) for criterion, score in breakdown.items()
                }
            else:
                result["criteria_breakdown"] = {}

        return result

    @staticmethod
    def _extract_fallback_score(text: str) -> float:
        """Pull a score out of free text, trying patterns in order of specificity.

        Returns the unclamped score, or 0.5 when nothing usable is found.
        """
        # Pattern 1: Percentage (75%, 85%) - check first since it's most specific
        pct_match = re.search(r"(\d+(?:\.\d+)?)\s*%", text)
        if pct_match:
            return float(pct_match.group(1)) / 100.0

        # Pattern 2: "score/rate X out of Y"
        out_of_match = re.search(
            r"(?:score|rating|rate).*?(\d+(?:\.\d+)?)\s*out\s*of\s*(\d+)", text, re.IGNORECASE
        )
        if out_of_match:
            max_score = float(out_of_match.group(2))
            if max_score <= 0:
                return 0.5  # "out of 0" carries no usable scale
            return float(out_of_match.group(1)) / max_score

        # Pattern 3: "score/rate X" without "out of"
        # Pattern 4: Just a number followed by descriptive text
        num_match = re.search(r"(?:score|rating|rate).*?(\d+(?:\.\d+)?)", text, re.IGNORECASE) or re.search(
            r"\b(\d+(?:\.\d+)?)\b", text
        )
        if num_match:
            extracted_score = float(num_match.group(1))
            # Values above 1 are read as being on a 0-10 scale.
            return extracted_score / 10.0 if extracted_score > 1 else extracted_score

        return 0.5  # Default middle score

    def _fallback_parse(self, evaluation_result: str) -> Dict[str, Any]:
        """Fallback parsing when JSON extraction fails.

        Returns a result dict whose ``score`` is extracted heuristically and
        clamped to 0-1, and whose ``feedback`` is the response text cut to
        500 characters.
        """
        score = self._extract_fallback_score(evaluation_result)

        return {
            "score": max(0.0, min(1.0, score)),
            "feedback": evaluation_result[:500] + "..." if len(evaluation_result) > 500 else evaluation_result,
            "reasoning": "Fallback parsing - JSON extraction failed",
            "criteria_breakdown": {},
            "assessment": "Evaluation completed with fallback parsing",
        }