paperfit-cli 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/.claude/commands/adjust-length.md +21 -0
  2. package/.claude/commands/check-visual.md +27 -0
  3. package/.claude/commands/fix-layout.md +31 -0
  4. package/.claude/commands/migrate-template.md +23 -0
  5. package/.claude/commands/repair-table.md +21 -0
  6. package/.claude/commands/show-status.md +32 -0
  7. package/.claude-plugin/README.md +77 -0
  8. package/.claude-plugin/marketplace.json +41 -0
  9. package/.claude-plugin/plugin.json +39 -0
  10. package/CLAUDE.md +266 -0
  11. package/CONTRIBUTING.md +131 -0
  12. package/LICENSE +21 -0
  13. package/README.md +164 -0
  14. package/agents/code-surgeon-agent.md +214 -0
  15. package/agents/layout-detective-agent.md +229 -0
  16. package/agents/orchestrator-agent.md +254 -0
  17. package/agents/quality-gatekeeper-agent.md +270 -0
  18. package/agents/rule-engine-agent.md +224 -0
  19. package/agents/semantic-polish-agent.md +250 -0
  20. package/bin/paperfit.js +176 -0
  21. package/config/agent_roles.yaml +56 -0
  22. package/config/layout_rules.yaml +54 -0
  23. package/config/templates.yaml +241 -0
  24. package/config/vto_taxonomy.yaml +489 -0
  25. package/config/writing_rules.yaml +64 -0
  26. package/install.sh +30 -0
  27. package/package.json +52 -0
  28. package/requirements.txt +5 -0
  29. package/scripts/benchmark_runner.py +629 -0
  30. package/scripts/compile.sh +244 -0
  31. package/scripts/config_validator.py +339 -0
  32. package/scripts/cv_detector.py +600 -0
  33. package/scripts/evidence_collector.py +167 -0
  34. package/scripts/float_fixers.py +861 -0
  35. package/scripts/inject_defects.py +549 -0
  36. package/scripts/install-claude-global.js +148 -0
  37. package/scripts/install.js +66 -0
  38. package/scripts/install.sh +106 -0
  39. package/scripts/overflow_fixers.py +656 -0
  40. package/scripts/package-for-opensource.sh +138 -0
  41. package/scripts/parse_log.py +260 -0
  42. package/scripts/postinstall.js +38 -0
  43. package/scripts/pre_tool_use.py +265 -0
  44. package/scripts/render_pages.py +244 -0
  45. package/scripts/session_logger.py +329 -0
  46. package/scripts/space_util_fixers.py +773 -0
  47. package/scripts/state_manager.py +352 -0
  48. package/scripts/test_commands.py +187 -0
  49. package/scripts/test_cv_detector.py +214 -0
  50. package/scripts/test_integration.py +290 -0
  51. package/skills/consistency-polisher/SKILL.md +337 -0
  52. package/skills/float-optimizer/SKILL.md +284 -0
  53. package/skills/latex_fixers/__init__.py +82 -0
  54. package/skills/latex_fixers/float_fixers.py +392 -0
  55. package/skills/latex_fixers/fullwidth_fixers.py +375 -0
  56. package/skills/latex_fixers/overflow_fixers.py +250 -0
  57. package/skills/latex_fixers/semantic_micro_tuning.py +362 -0
  58. package/skills/latex_fixers/space_util_fixers.py +389 -0
  59. package/skills/latex_fixers/utils.py +55 -0
  60. package/skills/overflow-repair/SKILL.md +304 -0
  61. package/skills/space-util-fixer/SKILL.md +307 -0
  62. package/skills/taxonomy-vto/SKILL.md +486 -0
  63. package/skills/template-migrator/SKILL.md +251 -0
  64. package/skills/visual-inspector/SKILL.md +217 -0
  65. package/skills/writing-polish/SKILL.md +289 -0
@@ -0,0 +1,629 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ VTO Benchmark 评测运行器
4
+
5
+ 批量评估 PaperFit 对不同缺陷类型的检测和修复能力,输出结构化评测报告。
6
+
7
+ 功能:
8
+ 1. 加载预定义的测试样本(包含已知缺陷)
9
+ 2. 运行 PaperFix 修复流程
10
+ 3. 收集修复前后的指标
11
+ 4. 生成评测报告和准确率统计
12
+
13
+ 用法:
14
+ python benchmark_runner.py [--samples-dir DIR] [--output-dir DIR] [--rounds N]
15
+ """
16
+
17
+ import argparse
18
+ import json
19
+ import subprocess
20
+ import sys
21
+ import time
22
+ from dataclasses import dataclass, field
23
+ from datetime import datetime
24
+ from pathlib import Path
25
+ from typing import Any, Dict, List, Optional, Tuple
26
+
27
+
28
+ # ============================================================
29
+ # 评测指标定义
30
+ # ============================================================
31
+
32
@dataclass
class DefectDetectionResult:
    """Outcome of detecting one known defect in a sample."""

    defect_id: str
    expected: bool  # ground truth: this defect should be flagged
    detected: bool  # observation: this defect was actually flagged
    confidence: Optional[float] = None  # detector confidence, when reported

    @property
    def is_true_positive(self) -> bool:
        """Expected and detected."""
        return self.expected and self.detected

    @property
    def is_false_positive(self) -> bool:
        """Detected although not expected."""
        return self.detected and not self.expected

    @property
    def is_false_negative(self) -> bool:
        """Expected but missed."""
        return self.expected and not self.detected

    @property
    def is_true_negative(self) -> bool:
        """Neither expected nor detected."""
        return not (self.expected or self.detected)
55
+
56
+
57
@dataclass
class DefectRepairResult:
    """Outcome of attempting to repair one defect."""

    defect_id: str
    attempted: bool  # a repair was tried for this defect
    successful: bool  # the repair actually worked
    method: str = ""  # name of the repair method used
    side_effects: List[str] = field(default_factory=list)  # observed side effects
65
+
66
+
67
@dataclass
class RoundMetrics:
    """Metrics gathered for one repair round on one sample."""

    round_id: int
    sample_name: str
    initial_defects: List[Dict]
    detected_defects: List[DefectDetectionResult]
    repair_results: List[DefectRepairResult]
    compile_success: bool
    compile_time_sec: float
    total_time_sec: float
    page_count_before: int
    page_count_after: int

    @property
    def detection_precision(self) -> float:
        """TP / (TP + FP); 0.0 when nothing was flagged."""
        true_pos = sum(1 for item in self.detected_defects if item.is_true_positive)
        false_pos = sum(1 for item in self.detected_defects if item.is_false_positive)
        flagged = true_pos + false_pos
        return true_pos / flagged if flagged else 0.0

    @property
    def detection_recall(self) -> float:
        """TP / (TP + FN); 0.0 when there was nothing to find."""
        true_pos = sum(1 for item in self.detected_defects if item.is_true_positive)
        false_neg = sum(1 for item in self.detected_defects if item.is_false_negative)
        relevant = true_pos + false_neg
        return true_pos / relevant if relevant else 0.0

    @property
    def detection_f1(self) -> float:
        """Harmonic mean of precision and recall; 0.0 when both are zero."""
        precision = self.detection_precision
        recall = self.detection_recall
        denom = precision + recall
        return 2 * precision * recall / denom if denom else 0.0

    @property
    def repair_success_rate(self) -> float:
        """Successful repairs over attempted ones; 0.0 when none attempted."""
        tried = sum(1 for item in self.repair_results if item.attempted)
        worked = sum(1 for item in self.repair_results if item.successful)
        return worked / tried if tried else 0.0
116
+
117
+
118
@dataclass
class BenchmarkSummary:
    """Aggregate metrics over every sample and round of a benchmark run."""

    benchmark_id: str
    timestamp: str
    total_samples: int
    total_rounds: int
    avg_detection_precision: float
    avg_detection_recall: float
    avg_detection_f1: float
    avg_repair_success_rate: float
    avg_compile_time_sec: float
    avg_total_time_sec: float
    # Per-category counts and repair rates, keyed by category id.
    category_breakdown: Dict[str, Dict[str, float]] = field(default_factory=dict)
    # One entry per sample: name, final F1, final repair rate, rounds used.
    per_sample_results: List[Dict] = field(default_factory=list)
133
+
134
+
135
+ # ============================================================
136
+ # Benchmark 运行器
137
+ # ============================================================
138
+
139
class BenchmarkRunner:
    """Benchmark evaluation runner.

    Discovers defect-injected LaTeX samples, runs the PaperFit repair flow
    on each for up to N rounds, and aggregates detection/repair metrics
    into a BenchmarkSummary plus a JSON report on disk.
    """

    def __init__(
        self,
        samples_dir: Path,
        output_dir: Path,
        paperfit_root: Optional[Path] = None,
    ):
        """
        Args:
            samples_dir: Directory holding ``<name>.tex`` samples and optional
                ``<name>_defects.json`` expected-defect manifests.
            output_dir: Directory for work files and result reports
                (created if missing).
            paperfit_root: PaperFit project root; defaults to the grandparent
                of this script file.
        """
        self.samples_dir = samples_dir
        self.output_dir = output_dir
        self.paperfit_root = paperfit_root or Path(__file__).parent.parent
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Metrics for every (sample, round) pair, in execution order.
        self.all_metrics: List[RoundMetrics] = []

    def run_benchmark(
        self,
        sample_names: Optional[List[str]] = None,
        max_rounds_per_sample: int = 3,
    ) -> BenchmarkSummary:
        """Run the full benchmark and return the aggregated summary.

        Args:
            sample_names: Names of samples to evaluate; None evaluates all
                discovered samples.
            max_rounds_per_sample: Upper bound on repair rounds per sample.
        """
        if sample_names is None:
            sample_names = self._discover_samples()

        print(f"\n开始评测 {len(sample_names)} 个样本")
        print("=" * 60)

        for sample_name in sample_names:
            self._run_sample(
                sample_name=sample_name,
                max_rounds=max_rounds_per_sample,
            )

        # Aggregate and persist the summary report.
        summary = self._generate_summary()
        self._save_summary(summary)

        return summary

    def _discover_samples(self) -> List[str]:
        """Return stems of all ``*.tex`` samples, excluding clean/base templates."""
        sample_files = list(self.samples_dir.glob("*.tex"))
        # Skip the pristine reference sample and base templates.
        exclude_prefixes = ("clean", "_base")
        return [
            f.stem for f in sample_files
            if not f.stem.startswith(exclude_prefixes)
        ]

    def _run_sample(
        self,
        sample_name: str,
        max_rounds: int,
    ) -> List[RoundMetrics]:
        """Evaluate one sample for up to *max_rounds* repair rounds.

        Returns the per-round metrics collected (also appended to
        ``self.all_metrics``); returns [] when the sample file is missing.
        """
        print(f"\n[评测] {sample_name}")
        print("-" * 40)

        sample_path = self.samples_dir / f"{sample_name}.tex"
        defects_path = self.samples_dir / f"{sample_name}_defects.json"

        if not sample_path.exists():
            print(f" [跳过] 样本不存在:{sample_path}")
            return []

        # Load the expected-defect manifest, if one was generated.
        expected_defects = []
        if defects_path.exists():
            with open(defects_path, "r", encoding="utf-8") as f:
                expected_defects = json.load(f)
            print(f" [加载] 预期缺陷数:{len(expected_defects)}")
        else:
            print(f" [警告] 未找到缺陷清单:{defects_path}")

        # Copy the sample into an isolated work directory.
        work_dir = self.output_dir / "work"
        work_dir.mkdir(parents=True, exist_ok=True)
        work_tex = work_dir / f"{sample_name}.tex"

        # Record the baseline page count before any repair.
        page_before = self._compile_and_count_pages(sample_path, work_tex)
        print(f" [编译] 初始页数:{page_before}")

        round_metrics_list: List[RoundMetrics] = []

        for round_id in range(1, max_rounds + 1):
            print(f"\n [轮次] {round_id}/{max_rounds}")

            start_time = time.time()

            # Run the PaperFit repair flow on the working copy.
            compile_success, compile_time = self._run_paperfit_fix(work_tex)

            # Page count after this round's repair attempt.
            page_after = self._count_pdf_pages(work_dir / f"{sample_name}.pdf")

            # Parse detection/repair outcomes from state files.
            detected_defects = self._collect_detection_results(work_dir)
            repair_results = self._collect_repair_results(work_dir, expected_defects)

            elapsed = time.time() - start_time

            metrics = RoundMetrics(
                round_id=round_id,
                sample_name=sample_name,
                initial_defects=expected_defects,
                detected_defects=detected_defects,
                repair_results=repair_results,
                compile_success=compile_success,
                compile_time_sec=compile_time,
                total_time_sec=elapsed,
                page_count_before=page_before,
                page_count_after=page_after,
            )

            self.all_metrics.append(metrics)
            round_metrics_list.append(metrics)

            # Report this round's headline numbers.
            print(f" 检测 F1: {metrics.detection_f1:.2%}")
            print(f" 修复成功率:{metrics.repair_success_rate:.2%}")
            print(f" 编译时间:{compile_time:.2f}s")

            # Stop early once detection is perfect or no repair succeeded
            # (further rounds cannot improve the outcome).
            if metrics.detection_f1 == 1.0 or metrics.repair_success_rate == 0.0:
                print(f" [完成] 样本 {sample_name} 已达到最优状态")
                break

        return round_metrics_list

    def _compile_and_count_pages(
        self,
        source_tex: Path,
        dest_tex: Path,
    ) -> int:
        """Copy the sample into the work directory and return its PDF page count.

        NOTE(review): despite the name, no LaTeX compilation happens here —
        the count comes from an already-existing PDF next to *dest_tex*
        (0 when absent). Confirm whether a compile step was intended.
        """
        import shutil
        shutil.copy(source_tex, dest_tex)
        # BUG FIX: shutil.copy() does not accept dirs_exist_ok (that keyword
        # belongs to shutil.copytree), so the original call always raised
        # TypeError. Copying a single file into an existing directory needs
        # no flag; we also skip the copy when the shared clean sample is
        # absent instead of crashing with FileNotFoundError.
        clean_sample = source_tex.parent / "clean_sample.tex"
        if clean_sample.exists():
            shutil.copy(clean_sample, dest_tex.parent)

        pdf_path = dest_tex.with_suffix(".pdf")
        return self._count_pdf_pages(pdf_path)

    def _count_pdf_pages(self, pdf_path: Path) -> int:
        """Return the page count of *pdf_path* via ``pdfinfo``, or 0 on any failure.

        Failure cases folded into 0: missing file, pdfinfo not installed,
        timeout, or unparseable output.
        """
        if not pdf_path.exists():
            return 0
        try:
            # pdfinfo (poppler-utils) prints a "Pages: N" line.
            result = subprocess.run(
                ["pdfinfo", str(pdf_path)],
                capture_output=True,
                text=True,
                timeout=10,
            )
            for line in result.stdout.split("\n"):
                if line.startswith("Pages:"):
                    return int(line.split(":")[1].strip())
        except (subprocess.TimeoutExpired, FileNotFoundError, ValueError):
            pass
        return 0

    def _run_paperfit_fix(self, tex_path: Path) -> Tuple[bool, float]:
        """Invoke the PaperFit repair flow for *tex_path*.

        Returns (success, elapsed_seconds). Currently a placeholder that
        calls ``state_manager.py --status``; adjust to the real fix-layout
        entry point for production use.
        """
        start_time = time.time()

        try:
            # Placeholder invocation — replace with the real fix-layout
            # command once the project wiring is in place.
            result = subprocess.run(
                [
                    sys.executable,
                    str(self.paperfit_root / "scripts" / "state_manager.py"),
                    "--status",
                ],
                capture_output=True,
                text=True,
                timeout=300,  # 5-minute cap per repair round
                cwd=str(tex_path.parent),
            )

            compile_success = result.returncode == 0
            elapsed = time.time() - start_time
            return compile_success, elapsed

        except subprocess.TimeoutExpired:
            print(" [错误] 修复超时(>5 分钟)")
            return False, time.time() - start_time
        except Exception as e:
            print(f" [错误] 执行失败:{e}")
            return False, time.time() - start_time

    def _collect_detection_results(
        self,
        work_dir: Path,
    ) -> List[DefectDetectionResult]:
        """Read detected defects from PaperFit's state.json, if present.

        Every entry found is recorded as expected+detected (a true positive);
        false positives/negatives are not distinguishable from this file alone.
        """
        results: List[DefectDetectionResult] = []

        state_path = self.paperfit_root / "data" / "state.json"
        if state_path.exists():
            with open(state_path, "r", encoding="utf-8") as f:
                state = json.load(f)
            visual_defects = state.get("visual_defects", [])
            for defect in visual_defects:
                results.append(
                    DefectDetectionResult(
                        defect_id=defect.get("category", "unknown"),
                        expected=True,
                        detected=True,
                    )
                )

        return results

    def _collect_repair_results(
        self,
        work_dir: Path,
        expected_defects: List[Dict],
    ) -> List[DefectRepairResult]:
        """Build repair results for each expected defect.

        Placeholder: marks every expected defect as attempted and successful.
        Replace with real parsing of state/log files when available.
        """
        results: List[DefectRepairResult] = []

        for defect in expected_defects:
            results.append(
                DefectRepairResult(
                    defect_id=defect.get("defect_id", "unknown"),
                    attempted=True,
                    successful=True,  # placeholder until log parsing exists
                    method="auto",
                )
            )

        return results

    def _generate_summary(self) -> BenchmarkSummary:
        """Aggregate all collected round metrics into a BenchmarkSummary."""
        if not self.all_metrics:
            # No rounds ran: return an all-zero summary.
            return BenchmarkSummary(
                benchmark_id=datetime.now().strftime("%Y%m%d_%H%M%S"),
                timestamp=datetime.now().isoformat(),
                total_samples=0,
                total_rounds=0,
                avg_detection_precision=0.0,
                avg_detection_recall=0.0,
                avg_detection_f1=0.0,
                avg_repair_success_rate=0.0,
                avg_compile_time_sec=0.0,
                avg_total_time_sec=0.0,
            )

        # Simple means over every (sample, round) metric record.
        n = len(self.all_metrics)
        avg_precision = sum(m.detection_precision for m in self.all_metrics) / n
        avg_recall = sum(m.detection_recall for m in self.all_metrics) / n
        avg_f1 = sum(m.detection_f1 for m in self.all_metrics) / n
        avg_repair = sum(m.repair_success_rate for m in self.all_metrics) / n
        avg_compile = sum(m.compile_time_sec for m in self.all_metrics) / n
        avg_total = sum(m.total_time_sec for m in self.all_metrics) / n

        # Per-category breakdown, keyed by the first character of defect_id.
        # NOTE(review): a repair matching several same-category defects is
        # counted once per defect, so "repaired" can exceed distinct repairs.
        category_stats: Dict[str, Dict[str, float]] = {}
        for metrics in self.all_metrics:
            for defect in metrics.initial_defects:
                cat = defect.get("defect_id", "unknown")[0]
                if cat not in category_stats:
                    category_stats[cat] = {"count": 0, "repaired": 0}
                category_stats[cat]["count"] += 1
                for repair in metrics.repair_results:
                    if repair.defect_id.startswith(cat) and repair.successful:
                        category_stats[cat]["repaired"] += 1

        for cat, stats in category_stats.items():
            stats["success_rate"] = (
                stats["repaired"] / stats["count"] if stats["count"] > 0 else 0.0
            )

        # Per-sample rollup using the last (most recent) round of each sample.
        sample_names = set(m.sample_name for m in self.all_metrics)
        per_sample = []
        for name in sample_names:
            sample_metrics = [m for m in self.all_metrics if m.sample_name == name]
            if sample_metrics:
                last = sample_metrics[-1]
                per_sample.append({
                    "sample_name": name,
                    "final_f1": last.detection_f1,
                    "final_repair_rate": last.repair_success_rate,
                    "total_rounds": len(sample_metrics),
                })

        return BenchmarkSummary(
            benchmark_id=datetime.now().strftime("%Y%m%d_%H%M%S"),
            timestamp=datetime.now().isoformat(),
            total_samples=len(sample_names),
            total_rounds=len(self.all_metrics),
            avg_detection_precision=avg_precision,
            avg_detection_recall=avg_recall,
            avg_detection_f1=avg_f1,
            avg_repair_success_rate=avg_repair,
            avg_compile_time_sec=avg_compile,
            avg_total_time_sec=avg_total,
            category_breakdown=category_stats,
            per_sample_results=per_sample,
        )

    def _save_summary(self, summary: BenchmarkSummary) -> Path:
        """Serialize *summary* to ``benchmark_<id>.json`` and return the path."""
        output_path = self.output_dir / f"benchmark_{summary.benchmark_id}.json"

        summary_dict = {
            "benchmark_id": summary.benchmark_id,
            "timestamp": summary.timestamp,
            "total_samples": summary.total_samples,
            "total_rounds": summary.total_rounds,
            "metrics": {
                "avg_detection_precision": summary.avg_detection_precision,
                "avg_detection_recall": summary.avg_detection_recall,
                "avg_detection_f1": summary.avg_detection_f1,
                "avg_repair_success_rate": summary.avg_repair_success_rate,
                "avg_compile_time_sec": summary.avg_compile_time_sec,
                "avg_total_time_sec": summary.avg_total_time_sec,
            },
            "category_breakdown": summary.category_breakdown,
            "per_sample_results": summary.per_sample_results,
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(summary_dict, f, indent=2, ensure_ascii=False)

        print(f"\n[保存] 评测报告:{output_path}")
        return output_path
485
+
486
+
487
+ # ============================================================
488
+ # 报告生成器
489
+ # ============================================================
490
+
491
def generate_markdown_report(summary: BenchmarkSummary, output_path: Path) -> None:
    """Render *summary* as a Markdown report and write it to *output_path*."""
    lines: List[str] = [
        "# VTO Benchmark 评测报告",
        "",
        f"**评测 ID**: {summary.benchmark_id}",
        f"**生成时间**: {summary.timestamp}",
        "",
        "## 总体指标",
        "",
        "| 指标 | 值 |",
        "|------|-----|",
        f"| 评测样本数 | {summary.total_samples} |",
        f"| 总轮数 | {summary.total_rounds} |",
        f"| 平均检测查准率 | {summary.avg_detection_precision:.2%} |",
        f"| 平均检测查全率 | {summary.avg_detection_recall:.2%} |",
        f"| 平均检测 F1 | {summary.avg_detection_f1:.2%} |",
        f"| 平均修复成功率 | {summary.avg_repair_success_rate:.2%} |",
        f"| 平均编译时间 | {summary.avg_compile_time_sec:.2f}s |",
        f"| 平均总耗时 | {summary.avg_total_time_sec:.2f}s |",
        "",
        "## 按类别分解",
        "",
        "| 类别 | 缺陷数 | 修复成功 | 成功率 |",
        "|------|--------|----------|--------|",
    ]

    # One table row per defect category, sorted by category id.
    for category, stats in sorted(summary.category_breakdown.items()):
        rate = stats.get("success_rate", 0)
        lines.append(
            f"| Category {category} | {stats['count']} | {stats['repaired']} | {rate:.2%} |"
        )

    lines += [
        "",
        "## 各样本结果",
        "",
        "| 样本名称 | 最终 F1 | 修复成功率 | 轮数 |",
        "|----------|---------|------------|------|",
    ]

    # One table row per sample, sorted by sample name.
    for row in sorted(summary.per_sample_results, key=lambda r: r["sample_name"]):
        lines.append(
            f"| {row['sample_name']} | {row['final_f1']:.2%} | "
            f"{row['final_repair_rate']:.2%} | {row['total_rounds']} |"
        )

    lines += [
        "",
        "---",
        "*报告由 benchmark_runner.py 自动生成*",
    ]

    output_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"[保存] Markdown 报告:{output_path}")
546
+
547
+
548
+ # ============================================================
549
+ # 主函数
550
+ # ============================================================
551
+
552
def main():
    """CLI entry point: parse arguments, run the benchmark, emit reports."""
    parser = argparse.ArgumentParser(description="VTO Benchmark 评测运行器")
    parser.add_argument(
        "--samples-dir",
        type=str,
        default="data/benchmarks/samples",
        help="测试样本目录",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data/benchmarks/results",
        help="输出结果目录",
    )
    parser.add_argument(
        "--samples",
        nargs="+",
        default=None,
        help="指定要评测的样本名称,默认评测所有",
    )
    parser.add_argument(
        "--rounds",
        type=int,
        default=3,
        help="每个样本的最大迭代轮数",
    )
    parser.add_argument(
        "--paperfit-root",
        type=str,
        default=None,
        help="PaperFit 项目根目录",
    )
    args = parser.parse_args()

    samples_dir = Path(args.samples_dir)
    output_dir = Path(args.output_dir)

    # Bail out early when there is nothing to evaluate.
    if not samples_dir.exists():
        print(f"[错误] 样本目录不存在:{samples_dir}")
        print("请先运行 inject_defects.py 生成测试样本")
        sys.exit(1)

    root = Path(args.paperfit_root) if args.paperfit_root else None
    runner = BenchmarkRunner(
        samples_dir=samples_dir,
        output_dir=output_dir,
        paperfit_root=root,
    )
    summary = runner.run_benchmark(
        sample_names=args.samples,
        max_rounds_per_sample=args.rounds,
    )

    # Companion Markdown report alongside the JSON summary.
    generate_markdown_report(summary, output_dir / f"benchmark_{summary.benchmark_id}.md")

    # Console recap of the headline numbers.
    banner = "=" * 60
    print("\n" + banner)
    print("评测摘要")
    print(banner)
    print(f"样本数:{summary.total_samples}")
    print(f"总轮数:{summary.total_rounds}")
    print(f"平均检测 F1: {summary.avg_detection_f1:.2%}")
    print(f"平均修复成功率:{summary.avg_repair_success_rate:.2%}")
    print(f"平均耗时:{summary.avg_total_time_sec:.2f}s")
    print(banner)


if __name__ == "__main__":
    main()