tsugite-cli 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. tsugite/__init__.py +6 -0
  2. tsugite/agent_composition.py +163 -0
  3. tsugite/agent_inheritance.py +479 -0
  4. tsugite/agent_preparation.py +236 -0
  5. tsugite/agent_runner/__init__.py +45 -0
  6. tsugite/agent_runner/helpers.py +106 -0
  7. tsugite/agent_runner/history_integration.py +248 -0
  8. tsugite/agent_runner/metrics.py +100 -0
  9. tsugite/agent_runner/runner.py +1879 -0
  10. tsugite/agent_runner/validation.py +70 -0
  11. tsugite/agent_utils.py +167 -0
  12. tsugite/attachments/__init__.py +65 -0
  13. tsugite/attachments/auto_context.py +199 -0
  14. tsugite/attachments/base.py +34 -0
  15. tsugite/attachments/file.py +51 -0
  16. tsugite/attachments/inline.py +31 -0
  17. tsugite/attachments/storage.py +178 -0
  18. tsugite/attachments/url.py +59 -0
  19. tsugite/attachments/youtube.py +101 -0
  20. tsugite/benchmark/__init__.py +62 -0
  21. tsugite/benchmark/config.py +183 -0
  22. tsugite/benchmark/core.py +292 -0
  23. tsugite/benchmark/discovery.py +377 -0
  24. tsugite/benchmark/evaluators.py +671 -0
  25. tsugite/benchmark/execution.py +657 -0
  26. tsugite/benchmark/metrics.py +204 -0
  27. tsugite/benchmark/reports.py +420 -0
  28. tsugite/benchmark/utils.py +288 -0
  29. tsugite/builtin_agents/chat-assistant.md +53 -0
  30. tsugite/builtin_agents/default.md +140 -0
  31. tsugite/builtin_agents.py +5 -0
  32. tsugite/cache.py +195 -0
  33. tsugite/cli/__init__.py +1042 -0
  34. tsugite/cli/agents.py +148 -0
  35. tsugite/cli/attachments.py +193 -0
  36. tsugite/cli/benchmark.py +663 -0
  37. tsugite/cli/cache.py +113 -0
  38. tsugite/cli/config.py +272 -0
  39. tsugite/cli/helpers.py +534 -0
  40. tsugite/cli/history.py +193 -0
  41. tsugite/cli/init.py +387 -0
  42. tsugite/cli/mcp.py +193 -0
  43. tsugite/cli/tools.py +419 -0
  44. tsugite/config.py +204 -0
  45. tsugite/console.py +48 -0
  46. tsugite/constants.py +21 -0
  47. tsugite/core/__init__.py +19 -0
  48. tsugite/core/agent.py +774 -0
  49. tsugite/core/executor.py +300 -0
  50. tsugite/core/memory.py +67 -0
  51. tsugite/core/tools.py +271 -0
  52. tsugite/docker_cli.py +270 -0
  53. tsugite/events/__init__.py +55 -0
  54. tsugite/events/base.py +46 -0
  55. tsugite/events/bus.py +62 -0
  56. tsugite/events/events.py +224 -0
  57. tsugite/exceptions.py +40 -0
  58. tsugite/history/__init__.py +29 -0
  59. tsugite/history/index.py +210 -0
  60. tsugite/history/models.py +106 -0
  61. tsugite/history/storage.py +157 -0
  62. tsugite/mcp_client.py +219 -0
  63. tsugite/mcp_config.py +174 -0
  64. tsugite/md_agents.py +751 -0
  65. tsugite/models.py +257 -0
  66. tsugite/renderer.py +151 -0
  67. tsugite/shell_tool_config.py +265 -0
  68. tsugite/templates/assistant.md +14 -0
  69. tsugite/tools/__init__.py +265 -0
  70. tsugite/tools/agents.py +312 -0
  71. tsugite/tools/edit_strategies.py +393 -0
  72. tsugite/tools/fs.py +329 -0
  73. tsugite/tools/http.py +239 -0
  74. tsugite/tools/interactive.py +430 -0
  75. tsugite/tools/shell.py +129 -0
  76. tsugite/tools/shell_tools.py +214 -0
  77. tsugite/tools/tasks.py +339 -0
  78. tsugite/tsugite.py +7 -0
  79. tsugite/ui/__init__.py +46 -0
  80. tsugite/ui/base.py +638 -0
  81. tsugite/ui/chat.py +265 -0
  82. tsugite/ui/chat.tcss +92 -0
  83. tsugite/ui/chat_history.py +286 -0
  84. tsugite/ui/helpers.py +102 -0
  85. tsugite/ui/jsonl.py +125 -0
  86. tsugite/ui/live_template.py +529 -0
  87. tsugite/ui/plain.py +419 -0
  88. tsugite/ui/textual_chat.py +642 -0
  89. tsugite/ui/textual_handler.py +225 -0
  90. tsugite/ui/widgets/__init__.py +6 -0
  91. tsugite/ui/widgets/base_scroll_log.py +27 -0
  92. tsugite/ui/widgets/message_list.py +121 -0
  93. tsugite/ui/widgets/thought_log.py +80 -0
  94. tsugite/ui_context.py +90 -0
  95. tsugite/utils.py +367 -0
  96. tsugite/xdg.py +104 -0
  97. tsugite_cli-0.3.3.dist-info/METADATA +325 -0
  98. tsugite_cli-0.3.3.dist-info/RECORD +101 -0
  99. tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
  100. tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
  101. tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
@@ -0,0 +1,657 @@
1
+ """Test execution logic for benchmark framework."""
2
+
3
+ import json
4
+ import re
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Any, Dict
8
+
9
+ import yaml
10
+
11
+ from ..agent_runner import run_agent
12
+ from ..exceptions import AgentExecutionError
13
+ from .config import (
14
+ EVALUATION_WEIGHTS,
15
+ SIMILARITY_THRESHOLDS,
16
+ )
17
+ from .discovery import BenchmarkTest, TestCase
18
+ from .evaluators import CorrectnessEvaluator, LLMEvaluator
19
+ from .metrics import BenchmarkTestResult
20
+
21
+
22
+ class TestExecutor:
23
+ """Executes benchmark tests against models."""
24
+
25
def __init__(self, output_dir: Path, llm_evaluator_model: str = "openai:gpt-4o-mini"):
    """Set up the scratch directory and the evaluators used by the executor.

    Args:
        output_dir: Directory for temporary files; created together with any
            missing parents when it does not exist yet.
        llm_evaluator_model: Model identifier passed to the LLM evaluator.
    """
    scratch_dir = Path(output_dir)
    self.output_dir = scratch_dir
    scratch_dir.mkdir(exist_ok=True, parents=True)

    # One evaluator for expected-output comparison, one LLM-backed judge.
    self.correctness_evaluator = CorrectnessEvaluator()
    self.llm_evaluator = LLMEvaluator(evaluator_model=llm_evaluator_model)
36
+
37
async def run_test(self, model_name: str, test: BenchmarkTest) -> BenchmarkTestResult:
    """Run all test cases for a single agent against a model.

    A temporary copy of the agent pinned to ``model_name`` is written, every
    test case is executed and evaluated, and the per-case evaluations are
    folded into a single weighted result. Failures during execution are
    reported as a failed result (score 0.0) instead of being raised.

    Args:
        model_name: Name of model to test
        test: Benchmark test to run

    Returns:
        Aggregated test result

    Raises:
        ValueError: If the test defines no test cases.
    """
    start_time = time.time()

    if not test.test_cases:
        raise ValueError(f"No test cases found for {test.test_id}")

    try:
        # Create temporary agent file with target model
        temp_agent_path = self._create_temp_agent(test, model_name)

        try:
            # Run all test cases
            case_results, aggregated_output, total_steps, total_tokens, total_cost = await self._run_all_test_cases(
                test, temp_agent_path, model_name
            )

            # Calculate aggregate results
            total_cases = len(test.test_cases)
            total_passed = sum(1 for r in case_results if r["passed"])
            overall_passed = total_passed == total_cases

            # Weighted score: each case contributes in proportion to its weight
            total_score = sum(r["score"] * tc.weight for r, tc in zip(case_results, test.test_cases))
            total_weight = sum(tc.weight for tc in test.test_cases)
            overall_score = total_score / total_weight if total_weight > 0 else 0.0

            aggregate_metrics = {
                "case_results": case_results,
                "cases_passed": total_passed,
                "total_cases": total_cases,
                "pass_rate": total_passed / total_cases,
            }

            duration = time.time() - start_time

            # Use first case's expected output for legacy compatibility
            expected_output = test.test_cases[0].expected_output or ""

            return BenchmarkTestResult(
                test_id=test.test_id,
                model=model_name,
                passed=overall_passed,
                score=overall_score,
                duration=duration,
                output=aggregated_output,
                expected_output=expected_output,
                category=test.category,
                error=None,
                token_usage={"total": total_tokens},
                cost=total_cost,
                steps_taken=total_steps,
                metrics=aggregate_metrics,
            )

        finally:
            # Best-effort cleanup. An OSError raised here would otherwise fall
            # into the generic handler below and replace an already computed
            # result with a score-0 failure, so it is deliberately ignored.
            try:
                temp_agent_path.unlink(missing_ok=True)
            except OSError:
                pass

    except AgentExecutionError as e:
        # Agent execution failed with execution details
        duration = time.time() - start_time

        metrics = {}
        if e.execution_steps:
            # Build execution trace from steps carried by the exception
            metrics["execution_trace"] = [
                {
                    "step": step.step_number,
                    "thought": step.thought,
                    "code": step.code,
                    "output": step.output,
                    "tools_called": step.tools_called,
                    "error": step.error,
                }
                for step in e.execution_steps
            ]

        return BenchmarkTestResult(
            test_id=test.test_id,
            model=model_name,
            passed=False,
            score=0.0,
            duration=duration,
            output="",
            expected_output="",
            category=test.category,
            error=str(e),
            token_usage={"total": e.token_usage} if e.token_usage else {},
            cost=e.cost or 0.0,
            steps_taken=e.step_count,
            metrics=metrics,
        )

    except Exception as e:
        # Other unexpected errors without execution details
        duration = time.time() - start_time

        return BenchmarkTestResult(
            test_id=test.test_id,
            model=model_name,
            passed=False,
            score=0.0,
            duration=duration,
            output="",
            expected_output="",
            category=test.category,
            error=str(e),
            token_usage={},
            cost=0.0,
            steps_taken=0,
            metrics={},
        )
163
+
164
def _create_temp_agent(self, test: BenchmarkTest, model_name: str) -> Path:
    """Create temporary agent file with specified model.

    The agent file's YAML frontmatter is reduced to the agent fields listed
    below, with ``model`` overridden by ``model_name``; the markdown body is
    copied unchanged.

    Args:
        test: Benchmark test
        model_name: Model name to inject

    Returns:
        Path to temporary agent file

    Raises:
        ValueError: If no closing ``---`` line is found or the frontmatter
            is not a YAML mapping.
    """
    # Explicit encoding so the result does not depend on the platform locale
    original_content = test.agent_path.read_text(encoding="utf-8")

    # Find YAML frontmatter boundaries (line 0 is taken to be the opening marker)
    lines = original_content.split("\n")
    yaml_end = -1
    for i, line in enumerate(lines[1:], 1):
        if line.strip() == "---":
            yaml_end = i
            break

    if yaml_end == -1:
        raise ValueError("Invalid YAML frontmatter format")

    # Parse original YAML
    original_yaml = "\n".join(lines[1:yaml_end])
    yaml_data = yaml.safe_load(original_yaml) or {}
    if not isinstance(yaml_data, dict):
        # A scalar or list here would otherwise fail later with AttributeError
        raise ValueError("Invalid YAML frontmatter format")
    markdown_content = "\n".join(lines[yaml_end + 1 :])

    # Create clean YAML with only agent fields
    clean_yaml_data = {
        "name": yaml_data.get("name", test.test_id),
        "description": yaml_data.get("description", ""),
        "model": model_name,  # Override model
        "max_turns": yaml_data.get("max_turns", 5),
        "tools": yaml_data.get("tools", []),
        "text_mode": yaml_data.get("text_mode"),
        "prefetch": yaml_data.get("prefetch", []),
        "permissions_profile": yaml_data.get("permissions_profile", "default"),
        "context_budget": yaml_data.get("context_budget"),
    }

    # Remove None values and empty lists
    clean_yaml_data = {k: v for k, v in clean_yaml_data.items() if v is not None and v != []}

    # Create clean agent content
    clean_yaml = yaml.dump(clean_yaml_data, default_flow_style=False)
    clean_agent_content = f"---\n{clean_yaml}---\n\n{markdown_content}"

    # Model names and test ids may contain ':' or path separators (for example
    # "provider:org/model"); map every character that is unsafe in a file name
    # to "_" so the file always lands directly inside output_dir.
    safe_stem = re.sub(r"[^\w.\-]", "_", f"{test.test_id}_{model_name}")
    temp_agent_path = self.output_dir / f"temp_{safe_stem}.md"
    temp_agent_path.write_text(clean_agent_content, encoding="utf-8")

    return temp_agent_path
217
+
218
async def _run_all_test_cases(
    self, test: BenchmarkTest, temp_agent_path: Path, model_name: str
) -> tuple[list[Dict[str, Any]], str, int, int, float]:
    """Execute and evaluate every test case of a benchmark test, in order.

    Args:
        test: Benchmark test whose cases are run
        temp_agent_path: Path to the temporary agent file to execute
        model_name: Model name passed to the agent runner as override

    Returns:
        Tuple of (case_results, aggregated_output, total_steps, total_tokens, total_cost)
    """
    evaluations = []
    plain_outputs = []
    transcript = []
    steps_sum, tokens_sum, cost_sum = 0, 0, 0.0

    for case in test.test_cases:
        started_at = time.time()

        # The prompt may carry an extra planning instruction
        prompt_text = self._prepare_prompt(case)

        # return_token_usage=True yields (output, token_count, cost, step_count, execution_steps)
        output, tokens, cost, steps, execution_steps = run_agent(
            agent_path=temp_agent_path,
            prompt=prompt_text,
            model_override=model_name,
            debug=False,
            return_token_usage=True,
        )
        steps_sum += steps
        tokens_sum += tokens or 0
        cost_sum += cost or 0.0

        elapsed = time.time() - started_at

        evaluations.append(await self._evaluate_test_case(case, output, elapsed, execution_steps, test))
        plain_outputs.append(str(output))
        transcript.append(f"Test Case: {case.name}\nPrompt: {case.prompt}\nOutput: {output}\n---\n")

    # A single case reports its raw output; otherwise a labelled transcript is used
    aggregated_output = plain_outputs[0] if len(plain_outputs) == 1 else "\n".join(transcript)

    return evaluations, aggregated_output, steps_sum, tokens_sum, cost_sum
277
+
278
def _prepare_prompt(self, test_case: TestCase) -> str:
    """Return the prompt to send, adding a planning instruction when required.

    Args:
        test_case: Test case providing ``prompt`` and ``requires_plan``

    Returns:
        The unchanged prompt, or the prompt followed by the planning
        instruction when the test case requires a plan
    """
    if test_case.requires_plan:
        # Two leading empty entries reproduce the blank lines that separate
        # the instruction from the original prompt.
        instruction_lines = (
            "",
            "",
            "PLANNING REQUIREMENT: Before executing this task, you must first create a detailed plan. "
            "Your response should be structured as follows:",
            "",
            "1. **PLAN SECTION**: Start with a clear, step-by-step plan that outlines how you will approach this task",
            "2. **EXECUTION SECTION**: Then proceed with executing the plan",
            "",
            "Make sure your plan includes the key steps and reasoning before you start executing.",
        )
        return test_case.prompt + "\n".join(instruction_lines)

    return test_case.prompt
300
+
301
async def _evaluate_test_case(
    self,
    test_case: TestCase,
    result: str,
    duration: float,
    execution_steps: list = None,
    test: BenchmarkTest = None,
) -> Dict[str, Any]:
    """Evaluate the output of one test case.

    The evaluators configured on the test case run in a fixed order:
    correctness, custom criteria, LLM judge, planning. An exception raised
    by any of them is recorded under ``"error"`` instead of propagating.

    Args:
        test_case: Test case definition
        result: Agent output
        duration: Execution duration (not used by the evaluation itself)
        execution_steps: List of StepResult objects from execution
        test: BenchmarkTest object with agent configuration

    Returns:
        Evaluation dictionary with passed, score, and metrics; failed cases
        with recorded steps also carry an ``execution_trace``
    """
    outcome = {"test_case": test_case.name, "passed": False, "score": 0.0, "metrics": {}}

    try:
        # 1. Basic correctness evaluation
        if test_case.expected_output:
            outcome = self._evaluate_correctness(test_case, result, outcome)

        # 2. Custom evaluation criteria (includes tool checking)
        if test_case.evaluation:
            outcome = self._evaluate_custom_criteria(test_case, result, outcome, execution_steps, test)

        # 3. LLM evaluation (if enabled)
        if test_case.use_llm_evaluation and test_case.llm_evaluation_criteria:
            outcome = await self._evaluate_with_llm(test_case, result, outcome)

        # 4. Plan evaluation (if required)
        if test_case.requires_plan:
            outcome = self._evaluate_planning(test_case, result, outcome)

    except Exception as exc:
        outcome["error"] = str(exc)

    # The trace is only kept for failures, to limit the size of the saved JSON
    if not execution_steps or outcome["passed"]:
        return outcome

    outcome["execution_trace"] = [
        dict(
            step=recorded.step_number,
            thought=recorded.thought,
            code=recorded.code,
            output=recorded.output,
            tools_called=recorded.tools_called,
            error=recorded.error,
        )
        for recorded in execution_steps
    ]
    return outcome
364
+
365
def _evaluate_correctness(self, test_case: TestCase, result: str, evaluation: Dict[str, Any]) -> Dict[str, Any]:
    """Compare the output with the expected output and record the verdict.

    Args:
        test_case: Test case providing ``expected_output``
        result: Agent output
        evaluation: Evaluation dictionary, updated in place

    Returns:
        The same evaluation dictionary with ``passed``, ``score`` and
        ``metrics["correctness"]`` filled from the correctness evaluator
    """
    verdict = self.correctness_evaluator.evaluate(
        output=result,
        expected=test_case.expected_output,
        output_type="string",
    )
    # Copy the headline fields first, then keep the full verdict as a metric
    for field in ("passed", "score"):
        evaluation[field] = verdict[field]
    evaluation["metrics"]["correctness"] = verdict
    return evaluation
376
+
377
def _evaluate_custom_criteria(
    self,
    test_case: TestCase,
    result: str,
    evaluation: Dict[str, Any],
    execution_steps: list = None,
    test: BenchmarkTest = None,
) -> Dict[str, Any]:
    """Run every custom criterion of the test case and record the outcomes.

    Args:
        test_case: Test case providing the ``evaluation`` criteria mapping
        result: Agent output
        evaluation: Evaluation dictionary, updated in place
        execution_steps: List of StepResult objects from execution
        test: BenchmarkTest object with agent configuration

    Returns:
        The same evaluation dictionary with ``metrics["custom"]`` set; score
        and passed are only overwritten when the test case has no expected
        output
    """
    outcomes = {
        name: self._check_criterion(name, wanted, result, execution_steps, test)
        for name, wanted in test_case.evaluation.items()
    }
    evaluation["metrics"]["custom"] = outcomes

    # With an expected output, the existing score and verdict are left as they are
    if test_case.expected_output:
        return evaluation

    # All-or-nothing: every criterion must hold (vacuously true when there are none)
    everything_ok = all(outcomes.values())
    evaluation["score"] = 1.0 if everything_ok else 0.0
    evaluation["passed"] = everything_ok
    return evaluation
401
+
402
def _check_criterion(
    self,
    criterion: str,
    expected_value: Any,
    result: str,
    execution_steps: list = None,
    test: BenchmarkTest = None,
) -> bool:
    """Check a single custom criterion.

    Supported criteria: ``tool_called``, ``exact_match``, ``contains``,
    ``valid_json``, ``contains_keys``, ``content_pattern``, ``matches``,
    ``file_check`` and ``min_length``. Any other name falls back to a
    case-insensitive substring check of ``expected_value`` in ``result``.

    Args:
        criterion: Criterion name
        expected_value: Expected value (shape depends on the criterion)
        result: Agent output
        execution_steps: List of StepResult objects from execution
        test: BenchmarkTest object with agent configuration (unused here)

    Returns:
        True if criterion passes
    """
    if criterion == "tool_called":
        # Without recorded steps there is no evidence of any tool call
        if not execution_steps:
            return False

        # Collect all tools called across all steps
        tools_called = set()
        for step in execution_steps:
            tools_called.update(getattr(step, "tools_called", None) or ())

        # A single tool name is treated as a one-element list; ALL must be present
        required = expected_value if isinstance(expected_value, list) else [expected_value]
        return all(tool in tools_called for tool in required)

    if criterion == "exact_match":
        return result.strip() == expected_value

    if criterion == "contains":
        result_lower = result.lower()
        needles = expected_value if isinstance(expected_value, list) else [expected_value]
        return all(str(item).lower() in result_lower for item in needles)

    if criterion == "valid_json":
        try:
            json.loads(result.strip())
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return not expected_value  # passes only when invalid JSON was expected
        return bool(expected_value)  # true means should be valid

    if criterion == "contains_keys":
        try:
            parsed = json.loads(result.strip())
        except (ValueError, TypeError):
            return False
        if not isinstance(parsed, dict):
            return False
        if isinstance(expected_value, list):
            return all(key in parsed for key in expected_value)
        try:
            return expected_value in parsed
        except TypeError:
            # Unhashable expected key can never be present
            return False

    if criterion == "content_pattern":
        return bool(re.search(expected_value, result))

    if criterion == "matches":
        # Regex matching on the agent output; ALL patterns must match
        patterns = expected_value if isinstance(expected_value, list) else [expected_value]
        return all(re.search(pattern, result) is not None for pattern in patterns)

    if criterion == "file_check":
        # expected_value is a dict with: path, exists, contains, matches, exact
        if not isinstance(expected_value, dict):
            return False

        file_path = expected_value.get("path")
        if not file_path:
            return False

        path = Path(file_path)

        # An explicit `exists: false` asserts that the file is absent. It must
        # be decided here: the content checks below require the file to exist
        # and would report the opposite verdict.
        if expected_value.get("exists") is False:
            return not path.exists()

        # Every remaining check needs the file to be present
        if not path.exists():
            return False

        try:
            file_content = path.read_text(encoding="utf-8")
        except (OSError, ValueError):
            # Unreadable or undecodable file counts as a failed check
            return False

        # Check exact match (surrounding whitespace ignored on both sides)
        if "exact" in expected_value:
            if file_content.strip() != str(expected_value["exact"]).strip():
                return False

        # Check contains (all listed strings must appear, case-sensitive)
        if "contains" in expected_value:
            wanted = expected_value["contains"]
            wanted = wanted if isinstance(wanted, list) else [wanted]
            if not all(str(item) in file_content for item in wanted):
                return False

        # Check regex matches (all patterns must match)
        if "matches" in expected_value:
            patterns = expected_value["matches"]
            patterns = patterns if isinstance(patterns, list) else [patterns]
            if not all(re.search(p, file_content) is not None for p in patterns):
                return False

        # All checks passed
        return True

    if criterion == "min_length":
        return len(result) >= expected_value

    # Default to simple substring check
    return str(expected_value).lower() in result.lower()
549
+
550
async def _evaluate_with_llm(self, test_case: TestCase, result: str, evaluation: Dict[str, Any]) -> Dict[str, Any]:
    """Ask the LLM judge to grade the output and merge its score.

    Args:
        test_case: Test case providing the prompt, criteria and rubric
        result: Agent output
        evaluation: Evaluation dictionary, updated in place

    Returns:
        The same evaluation dictionary with ``metrics["llm_evaluation"]`` set
        and the score replaced or blended
    """
    verdict = await self.llm_evaluator.evaluate(
        output=result,
        task_description=test_case.prompt,
        evaluation_criteria=test_case.llm_evaluation_criteria,
        rubric=test_case.llm_evaluation_rubric or {},
    )
    evaluation["metrics"]["llm_evaluation"] = verdict

    if evaluation["score"] != 0.0:
        # A score already exists: blend it with the LLM score using the
        # configured weights; the pass flag is left untouched.
        evaluation["score"] = (
            evaluation["score"] * EVALUATION_WEIGHTS.base_weight
            + verdict["llm_score"] * EVALUATION_WEIGHTS.llm_weight
        )
        return evaluation

    # No score so far: the LLM score stands alone and decides pass/fail
    evaluation["score"] = verdict["llm_score"]
    evaluation["passed"] = verdict["llm_score"] >= SIMILARITY_THRESHOLDS.llm_evaluation_threshold
    return evaluation
573
+
574
def _evaluate_planning(self, test_case: TestCase, result: str, evaluation: Dict[str, Any]) -> Dict[str, Any]:
    """Score how well the output shows a plan and merge that into the evaluation.

    The plan score is built from three parts: 0.3 for any planning keyword,
    up to 0.4 for the share of expected plan elements found, and up to 0.3
    for the custom plan criteria.

    Args:
        test_case: Test case providing ``expected_plan_elements`` and ``plan_evaluation``
        result: Agent output
        evaluation: Evaluation dictionary, updated in place

    Returns:
        The same evaluation dictionary with ``metrics["planning"]`` set and
        score/passed updated
    """
    lowered = result.lower()
    checks = {}
    score = 0.0

    # Part 1: any planning keyword anywhere in the output
    keywords = ("plan", "step", "approach", "strategy", "outline", "first", "then", "next", "finally")
    mentions_planning = any(word in lowered for word in keywords)
    checks["has_planning_structure"] = mentions_planning
    if mentions_planning:
        score += 0.3

    # Part 2: share of expected plan elements present (case-insensitive)
    elements = test_case.expected_plan_elements
    if elements:
        hits = [(element, element.lower() in lowered) for element in elements]
        found = sum(1 for _, present in hits if present)
        score += (found / len(elements)) * 0.4
        for element, present in hits:
            checks[f"element_{element}"] = present

    # Part 3: custom plan evaluation criteria
    if test_case.plan_evaluation:
        score += self._evaluate_plan_criteria(test_case, result, checks) * 0.3

    evaluation["metrics"]["planning"] = {
        "plan_score": score,
        "plan_checks": checks,
        "required": True,
    }

    if evaluation["score"] != 0.0:
        # Blend with the existing score; the plan must also reach the
        # configured minimum for an earlier pass to stand.
        evaluation["score"] = (
            evaluation["score"] * (1 - EVALUATION_WEIGHTS.planning_weight)
            + score * EVALUATION_WEIGHTS.planning_weight
        )
        evaluation["passed"] = evaluation["passed"] and score >= EVALUATION_WEIGHTS.planning_minimum
    else:
        # No score so far: the plan score stands alone
        evaluation["score"] = score
        evaluation["passed"] = score >= SIMILARITY_THRESHOLDS.behavior_pass_threshold

    return evaluation
620
+
621
def _evaluate_plan_criteria(self, test_case: TestCase, result: str, plan_checks: Dict[str, Any]) -> float:
    """Score the custom planning criteria of a test case.

    Each criterion in ``test_case.plan_evaluation`` contributes an equal
    share of the score when it is met.

    Args:
        test_case: Test case providing the ``plan_evaluation`` mapping
        result: Agent output
        plan_checks: Dictionary that receives one boolean per criterion

    Returns:
        Custom plan score (0.0 to 1.0)
    """
    score = 0.0
    total = len(test_case.plan_evaluation)

    for name, wanted in test_case.plan_evaluation.items():
        if name == "min_plan_steps":
            # Count numbered or bulleted step markers followed by whitespace
            markers = re.findall(r"(\d+\.|•|-|\*)\s", result)
            satisfied = len(markers) >= wanted
        elif name == "structured_sections":
            # Presence of any section keyword, compared against the expectation
            shouted = result.upper()
            has_header = any(header in shouted for header in ["PLAN", "EXECUTION", "APPROACH", "STRATEGY"])
            satisfied = has_header == wanted
        else:
            # Any other criterion: case-insensitive substring check
            satisfied = str(wanted).lower() in result.lower()

        plan_checks[name] = satisfied
        if satisfied:
            score += 1.0 / total

    return score