isage_benchmark_agent-0.1.0.1-cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/benchmark/__init__.py +0 -0
  9. sage/benchmark/benchmark_agent/__init__.py +108 -0
  10. sage/benchmark/benchmark_agent/__main__.py +177 -0
  11. sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  12. sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  13. sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  14. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  15. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  16. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  17. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  18. sage/benchmark/benchmark_agent/data_paths.py +332 -0
  19. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  20. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  21. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  22. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  23. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  24. sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  25. sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  26. sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  27. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  28. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  29. sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  30. sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  31. sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  32. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  33. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  34. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  35. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  36. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  37. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  38. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  39. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  40. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  41. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  51. sage/benchmark/benchmark_agent/tools_loader.py +212 -0
@@ -0,0 +1,400 @@
+ #!/usr/bin/env python3
+ """
+ Section 5.3.1: Error Analysis
+ 
+ In-depth analysis of each method's failure modes, to identify directions for improvement.
+ 
+ Analyses:
+ 1. Error Type Breakdown - error types broken down by challenge
+ 2. Failure Cascading Analysis - cascading failures triggered by early errors
+ 
+ Outputs:
+ - figures/fig4_analysis_error_breakdown.pdf
+ - tables/table_error_breakdown.tex
+ 
+ Usage:
+     python exp_analysis_error.py
+     python exp_analysis_error.py --challenge timing
+     python exp_analysis_error.py --challenge all
+ """
+ 
+ from __future__ import annotations
+ 
+ import argparse
+ from collections import Counter, defaultdict
+ from typing import Any
+ 
+ from .exp_utils import (
+     get_figures_dir,
+     print_section_header,
+     print_subsection_header,
+     save_results,
+     setup_experiment_env,
+ )
+ 
+ # =============================================================================
+ # Error Analysis Functions
+ # =============================================================================
+ 
+ 
+ def analyze_timing_errors(results: list[dict[str, Any]]) -> dict[str, Any]:
+     """
+     Analyze timing error types.
+ 
+     Error types:
+     - false_positive: called when it should not have been (call rate too high)
+     - false_negative: not called when it should have been (missed the key moment)
+     - confidence_miscalibration: high confidence but wrong
+     """
+     error_counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
+     confidence_errors: dict[str, list[float]] = defaultdict(list)
+ 
+     for r in results:
+         strategy = r.get("strategy", "unknown")
+         predictions = r.get("predictions", [])
+         references = r.get("references", [])
+         confidences = r.get("confidences", [])
+ 
+         for i, (pred, ref) in enumerate(zip(predictions, references)):
+             if pred != ref:
+                 if pred and not ref:
+                     error_counts[strategy]["false_positive"] += 1
+                 elif not pred and ref:
+                     error_counts[strategy]["false_negative"] += 1
+ 
+                 # Confidence calibration analysis
+                 if confidences and i < len(confidences):
+                     conf = confidences[i]
+                     if conf > 0.8:  # high confidence, yet wrong
+                         error_counts[strategy]["high_conf_error"] += 1
+                         confidence_errors[strategy].append(conf)
+ 
+     return {
+         "error_counts": {k: dict(v) for k, v in error_counts.items()},
+         "confidence_errors": {
+             k: {"count": len(v), "avg_conf": sum(v) / len(v) if v else 0}
+             for k, v in confidence_errors.items()
+         },
+     }
+ 
+ 
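+ # A worked example of the counters above, on hypothetical data: one strategy
+ # that makes a single false positive with high confidence.
+ #
+ #   analyze_timing_errors([{
+ #       "strategy": "always_call",
+ #       "predictions": [True, False],
+ #       "references": [False, False],
+ #       "confidences": [0.9, 0.3],
+ #   }])
+ #   # -> {"error_counts": {"always_call": {"false_positive": 1, "high_conf_error": 1}},
+ #   #     "confidence_errors": {"always_call": {"count": 1, "avg_conf": 0.9}}}
+ 
+ 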
+ def analyze_planning_errors(results: list[dict[str, Any]]) -> dict[str, Any]:
+     """
+     Analyze planning error types.
+ 
+     Error types:
+     - step_missing: a key step is missing
+     - wrong_order: steps are in the wrong order
+     - invalid_step: a step is implausible / hallucinated
+     - extra_steps: superfluous steps
+     """
+     error_counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
+     first_error_indices: dict[str, list[int]] = defaultdict(list)
+ 
+     for result in results:
+         strategy = result.get("strategy", "unknown")
+         predictions = result.get("predictions", [])
+         references = result.get("references", [])
+ 
+         for pred_plan, ref_plan in zip(predictions, references):
+             pred_steps = pred_plan if isinstance(pred_plan, list) else []
+             ref_steps = ref_plan if isinstance(ref_plan, list) else []
+ 
+             if not ref_steps:
+                 continue
+ 
+             # Classify the errors
+             pred_set = set(pred_steps)
+             ref_set = set(ref_steps)
+ 
+             # Missing steps
+             missing = ref_set - pred_set
+             if missing:
+                 error_counts[strategy]["step_missing"] += len(missing)
+ 
+             # Extra steps (possibly hallucinated)
+             extra = pred_set - ref_set
+             if extra:
+                 error_counts[strategy]["extra_steps"] += len(extra)
+ 
+             # Order error (same set of tools, different order)
+             if pred_set == ref_set and pred_steps != ref_steps:
+                 error_counts[strategy]["wrong_order"] += 1
+ 
+             # Position of the first error
+             for i, (p, r) in enumerate(zip(pred_steps, ref_steps)):
+                 if p != r:
+                     first_error_indices[strategy].append(i)
+                     break
+ 
+     # Compute the distribution of first-error positions
+     first_error_dist = {}
+     for strategy, indices in first_error_indices.items():
+         if indices:
+             dist = Counter(indices)
+             first_error_dist[strategy] = {
+                 "distribution": dict(dist),
+                 "mean_index": sum(indices) / len(indices),
+                 "total_errors": len(indices),
+             }
+ 
+     return {
+         "error_counts": {k: dict(v) for k, v in error_counts.items()},
+         "first_error_distribution": first_error_dist,
+     }
+ 
+ 
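+ # A worked example on hypothetical plans: the predicted plan drops the "rank"
+ # step, giving one step_missing and a first mismatch at index 1.
+ #
+ #   analyze_planning_errors([{
+ #       "strategy": "react",
+ #       "predictions": [["search", "summarize"]],
+ #       "references": [["search", "rank", "summarize"]],
+ #   }])
+ #   # -> error_counts: {"react": {"step_missing": 1}}
+ #   #    first_error_distribution: {"react": {"distribution": {1: 1},
+ #   #                                          "mean_index": 1.0, "total_errors": 1}}
+ 
+ 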
+ def analyze_selection_errors(results: list[dict[str, Any]], k: int = 5) -> dict[str, Any]:
+     """
+     Analyze selection error types.
+ 
+     Error types:
+     - top1_miss: the first choice is already wrong
+     - topk_miss: everything in the top-K is wrong
+     - rank_volatility: the correct answer's rank is unstable
+     - category_confusion: confusion across categories
+     """
+     error_counts: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
+     rank_positions: dict[str, list[int]] = defaultdict(list)
+ 
+     for r in results:
+         strategy = r.get("strategy", "unknown")
+         predictions = r.get("predictions", [])
+         references = r.get("references", [])
+ 
+         for pred_list, ref_list in zip(predictions, references):
+             ref_set = set(ref_list) if isinstance(ref_list, list) else {ref_list}
+             pred_top_k = pred_list[:k] if isinstance(pred_list, list) else [pred_list]
+ 
+             # Top-1 error
+             if pred_top_k and pred_top_k[0] not in ref_set:
+                 error_counts[strategy]["top1_miss"] += 1
+ 
+             # Everything in the top-K is wrong
+             if not (set(pred_top_k) & ref_set):
+                 error_counts[strategy]["topk_miss"] += 1
+ 
+             # Record the rank at which a correct answer first appears
+             for i, p in enumerate(pred_list if isinstance(pred_list, list) else [pred_list]):
+                 if p in ref_set:
+                     rank_positions[strategy].append(i + 1)
+                     break
+ 
+             # Category confusion analysis
+             if pred_top_k:
+                 pred_categories = {_extract_category(p) for p in pred_top_k}
+                 ref_categories = {_extract_category(ref) for ref in ref_set}
+                 if pred_categories and ref_categories and not (pred_categories & ref_categories):
+                     error_counts[strategy]["category_confusion"] += 1
+ 
+     # Rank stability statistics
+     rank_analysis = {}
+     for strategy, positions in rank_positions.items():
+         if positions:
+             mean_rank = sum(positions) / len(positions)
+             rank_analysis[strategy] = {
+                 "mean_rank": mean_rank,
+                 "std_rank": (sum((p - mean_rank) ** 2 for p in positions) / len(positions)) ** 0.5,
+                 "rank_1_count": sum(1 for p in positions if p == 1),
+                 "total": len(positions),
+             }
+ 
+     return {
+         "error_counts": {key: dict(v) for key, v in error_counts.items()},
+         "rank_analysis": rank_analysis,
+     }
+ 
+ 
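+ # A worked example on hypothetical tool rankings: the correct tool appears at
+ # rank 2, so this counts as a top1_miss but not a topk_miss.
+ #
+ #   analyze_selection_errors([{
+ #       "strategy": "dense_retrieval",
+ #       "predictions": [["calendar_create", "email_send"]],
+ #       "references": [["email_send"]],
+ #   }])
+ #   # -> error_counts: {"dense_retrieval": {"top1_miss": 1}}
+ #   #    rank_analysis: {"dense_retrieval": {"mean_rank": 2.0, "std_rank": 0.0,
+ #   #                                         "rank_1_count": 0, "total": 1}}
+ 
+ 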
+ def _extract_category(tool_id: str) -> str:
+     """Extract the category from a tool ID."""
+     parts = tool_id.split("_")
+     return parts[0] if parts else "unknown"
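+ 
+ 
+ # A quick illustration of the category heuristic above, on hypothetical tool IDs:
+ # _extract_category("email_send") -> "email", _extract_category("web_search") -> "web".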
+ 
+ 
+ # =============================================================================
+ # Cascading Failure Analysis
+ # =============================================================================
+ 
+ 
+ def analyze_cascading_failures(results: list[dict[str, Any]]) -> dict[str, Any]:
+     """
+     Analyze cascading failure patterns.
+ 
+     Checks whether an early error causes all subsequent steps to fail.
+     """
+     cascade_stats: dict[str, dict[str, int]] = defaultdict(
+         lambda: {"cascading": 0, "non_cascading": 0, "recovery": 0}
+     )
+ 
+     for result in results:
+         strategy = result.get("strategy", "unknown")
+         predictions = result.get("predictions", [])
+         references = result.get("references", [])
+ 
+         for pred_plan, ref_plan in zip(predictions, references):
+             if not isinstance(pred_plan, list) or not isinstance(ref_plan, list):
+                 continue
+ 
+             if len(pred_plan) < 2 or len(ref_plan) < 2:
+                 continue
+ 
+             # Find the position of the first error
+             first_error_idx = None
+             for i, (p, r) in enumerate(zip(pred_plan, ref_plan)):
+                 if p != r:
+                     first_error_idx = i
+                     break
+ 
+             if first_error_idx is None:
+                 continue  # everything correct
+ 
+             # Examine what happens after the first error
+             remaining_pred = pred_plan[first_error_idx + 1 :]
+             remaining_ref = ref_plan[first_error_idx + 1 :]
+ 
+             if not remaining_ref:
+                 continue
+ 
+             # Accuracy over the remaining steps
+             remaining_correct = sum(1 for p, r in zip(remaining_pred, remaining_ref) if p == r)
+             remaining_acc = remaining_correct / len(remaining_ref)
+ 
+             if remaining_acc < 0.2:  # cascading failure: nearly everything afterwards is wrong
+                 cascade_stats[strategy]["cascading"] += 1
+             elif remaining_acc > 0.5:  # recovery: most later steps are correct
+                 cascade_stats[strategy]["recovery"] += 1
+             else:  # partial impact
+                 cascade_stats[strategy]["non_cascading"] += 1
+ 
+     return {
+         "cascade_statistics": {k: dict(v) for k, v in cascade_stats.items()},
+         "insight": "High cascading rate indicates fragile design without rollback/recovery mechanism.",
+     }
+ 
+ 
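+ # A worked example on hypothetical plans: every step after the first mismatch
+ # is also wrong, so the episode is counted as cascading.
+ #
+ #   analyze_cascading_failures([{
+ #       "strategy": "react",
+ #       "predictions": [["login", "search", "pay", "confirm"]],
+ #       "references": [["login", "browse", "add_to_cart", "checkout"]],
+ #   }])
+ #   # -> {"cascade_statistics": {"react": {"cascading": 1, "non_cascading": 0,
+ #   #                                       "recovery": 0}}, ...}
+ 
+ 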
+ # =============================================================================
+ # Main Experiment
+ # =============================================================================
+ 
+ 
+ def run_error_analysis(challenge: str = "all", verbose: bool = True) -> dict[str, Any]:
+     """
+     Run the error analysis experiment.
+ 
+     Args:
+         challenge: Challenge to analyze ("timing", "planning", "selection", "all")
+         verbose: Whether to print detailed output
+ 
+     Returns:
+         Dictionary of error analysis results
+     """
+     setup_experiment_env(verbose=verbose)
+ 
+     print_section_header("Section 5.3.1: Error Analysis")
+ 
+     all_results = {}
+ 
+     # Load results from the earlier experiments.
+     # TODO: these should be loaded from the saved result files;
+     # a sample data structure is used here for now.
+ 
+     if challenge == "all":
+         challenges_to_analyze = ["timing", "planning", "selection"]
+     else:
+         challenges_to_analyze = [challenge]
+ 
+     for ch in challenges_to_analyze:
+         print_subsection_header(f"Analyzing: {ch.title()}")
+ 
+         # Simulated result loading (should read from files in the real implementation)
+         results = _load_experiment_results(ch)
+ 
+         if ch == "timing":
+             analysis = analyze_timing_errors(results)
+         elif ch == "planning":
+             analysis = analyze_planning_errors(results)
+             # Add the cascading analysis
+             analysis["cascading"] = analyze_cascading_failures(results)
+         elif ch == "selection":
+             analysis = analyze_selection_errors(results)
+         else:
+             continue  # unknown challenge name; skip rather than crash
+ 
+         all_results[ch] = analysis
+ 
+         # Print a summary
+         if "error_counts" in analysis:
+             print("  Error counts by strategy:")
+             for strategy, counts in analysis["error_counts"].items():
+                 print(f"    {strategy}: {dict(counts)}")
+ 
+     # Save results
+     output_file = save_results(all_results, "5_3_analysis", "error_analysis")
+     print(f"\n  Results saved to: {output_file}")
+ 
+     # Generate figures
+     _generate_error_figures(all_results)
+ 
+     return all_results
+ 
+ 
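+ # The analysis can also be driven programmatically rather than via the CLI
+ # (a hypothetical invocation; the keyword names mirror the flags in main()):
+ #
+ #   results = run_error_analysis(challenge="planning", verbose=False)
+ #   print(results["planning"]["error_counts"])
+ 
+ 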
+ def _load_experiment_results(challenge: str) -> list[dict]:
+     """
+     Load experiment results.
+ 
+     TODO: the real implementation should load from section_5_2_main/{challenge}_results.json
+     """
+     # Sample data structure
+     return [
+         {
+             "strategy": f"{challenge}_strategy_1",
+             "predictions": [],
+             "references": [],
+             "confidences": [],
+         }
+     ]
+ 
+ 
+ def _generate_error_figures(results: dict) -> None:
+     """Generate the error-analysis figures."""
+     try:
+         from .figure_generator import plot_error_breakdown
+ 
+         figures_dir = get_figures_dir()
+ 
+         for challenge, analysis in results.items():
+             if "error_counts" in analysis and analysis["error_counts"]:
+                 plot_error_breakdown(
+                     analysis["error_counts"],
+                     challenge=challenge,
+                     output_path=figures_dir / f"fig4_analysis_error_{challenge}.pdf",
+                 )
+                 print(f"  Figure saved: fig4_analysis_error_{challenge}.pdf")
+ 
+     except Exception as e:
+         print(f"  Warning: Could not generate figures: {e}")
+ 
+ 
+ def main():
+     parser = argparse.ArgumentParser(description="Section 5.3.1: Error Analysis")
+     parser.add_argument(
+         "--challenge",
+         type=str,
+         default="all",
+         choices=["timing", "planning", "selection", "all"],
+         help="Challenge to analyze",
+     )
+     # Exposes --verbose / --no-verbose (verbose by default)
+     parser.add_argument(
+         "--verbose",
+         action=argparse.BooleanOptionalAction,
+         default=True,
+         help="Verbose output",
+     )
+     args = parser.parse_args()
+ 
+     run_error_analysis(challenge=args.challenge, verbose=args.verbose)
+ 
+     print("\n" + "=" * 70)
+     print("📊 Error Analysis Complete")
+     print("=" * 70)
+ 
+ 
+ if __name__ == "__main__":
+     main()